diff --git a/_C.abi3.so b/vllm/_C.abi3.so similarity index 100% rename from _C.abi3.so rename to vllm/_C.abi3.so diff --git a/__init__.py b/vllm/__init__.py similarity index 100% rename from __init__.py rename to vllm/__init__.py diff --git a/_custom_ops.py b/vllm/_custom_ops.py similarity index 100% rename from _custom_ops.py rename to vllm/_custom_ops.py diff --git a/_ipex_ops.py b/vllm/_ipex_ops.py similarity index 100% rename from _ipex_ops.py rename to vllm/_ipex_ops.py diff --git a/_moe_C.abi3.so b/vllm/_moe_C.abi3.so similarity index 100% rename from _moe_C.abi3.so rename to vllm/_moe_C.abi3.so diff --git a/_release_info.txt b/vllm/_release_info.txt similarity index 100% rename from _release_info.txt rename to vllm/_release_info.txt diff --git a/adapter_commons/__init__.py b/vllm/adapter_commons/__init__.py similarity index 100% rename from adapter_commons/__init__.py rename to vllm/adapter_commons/__init__.py diff --git a/adapter_commons/layers.py b/vllm/adapter_commons/layers.py similarity index 100% rename from adapter_commons/layers.py rename to vllm/adapter_commons/layers.py diff --git a/adapter_commons/models.py b/vllm/adapter_commons/models.py similarity index 100% rename from adapter_commons/models.py rename to vllm/adapter_commons/models.py diff --git a/adapter_commons/request.py b/vllm/adapter_commons/request.py similarity index 100% rename from adapter_commons/request.py rename to vllm/adapter_commons/request.py diff --git a/adapter_commons/utils.py b/vllm/adapter_commons/utils.py similarity index 100% rename from adapter_commons/utils.py rename to vllm/adapter_commons/utils.py diff --git a/adapter_commons/worker_manager.py b/vllm/adapter_commons/worker_manager.py similarity index 100% rename from adapter_commons/worker_manager.py rename to vllm/adapter_commons/worker_manager.py diff --git a/assets/__init__.py b/vllm/assets/__init__.py similarity index 100% rename from assets/__init__.py rename to vllm/assets/__init__.py diff --git a/assets/audio.py b/vllm/assets/audio.py similarity index 100% rename from assets/audio.py rename to vllm/assets/audio.py diff --git a/assets/base.py b/vllm/assets/base.py similarity index 100% rename from assets/base.py rename to vllm/assets/base.py diff --git a/assets/image.py b/vllm/assets/image.py similarity index 100% rename from assets/image.py rename to vllm/assets/image.py diff --git a/assets/video.py b/vllm/assets/video.py similarity index 100% rename from assets/video.py rename to vllm/assets/video.py diff --git a/attention/__init__.py b/vllm/attention/__init__.py similarity index 100% rename from attention/__init__.py rename to vllm/attention/__init__.py diff --git a/attention/backends/__init__.py b/vllm/attention/backends/__init__.py similarity index 100% rename from attention/backends/__init__.py rename to vllm/attention/backends/__init__.py diff --git a/attention/backends/abstract.py b/vllm/attention/backends/abstract.py similarity index 100% rename from attention/backends/abstract.py rename to vllm/attention/backends/abstract.py diff --git a/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py similarity index 100% rename from attention/backends/blocksparse_attn.py rename to vllm/attention/backends/blocksparse_attn.py diff --git a/attention/backends/configs/tp8_merge.json b/vllm/attention/backends/configs/tp8_merge.json similarity index 100% rename from attention/backends/configs/tp8_merge.json rename to vllm/attention/backends/configs/tp8_merge.json diff --git a/attention/backends/cpu_mla.py b/vllm/attention/backends/cpu_mla.py similarity index 100% rename from attention/backends/cpu_mla.py rename to vllm/attention/backends/cpu_mla.py diff --git a/attention/backends/dual_chunk_flash_attn.py b/vllm/attention/backends/dual_chunk_flash_attn.py similarity index 100% rename from attention/backends/dual_chunk_flash_attn.py rename to vllm/attention/backends/dual_chunk_flash_attn.py diff --git a/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py similarity index 100% rename from attention/backends/flash_attn.py rename to vllm/attention/backends/flash_attn.py diff --git a/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py similarity index 100% rename from attention/backends/flashinfer.py rename to vllm/attention/backends/flashinfer.py diff --git a/attention/backends/flashmla.py b/vllm/attention/backends/flashmla.py similarity index 100% rename from attention/backends/flashmla.py rename to vllm/attention/backends/flashmla.py diff --git a/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py similarity index 100% rename from attention/backends/hpu_attn.py rename to vllm/attention/backends/hpu_attn.py diff --git a/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py similarity index 100% rename from attention/backends/ipex_attn.py rename to vllm/attention/backends/ipex_attn.py diff --git a/attention/backends/mla/__init__.py b/vllm/attention/backends/mla/__init__.py similarity index 100% rename from attention/backends/mla/__init__.py rename to vllm/attention/backends/mla/__init__.py diff --git a/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py similarity index 100% rename from attention/backends/mla/common.py rename to vllm/attention/backends/mla/common.py diff --git a/attention/backends/pallas.py b/vllm/attention/backends/pallas.py similarity index 100% rename from attention/backends/pallas.py rename to vllm/attention/backends/pallas.py diff --git a/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py similarity index 100% rename from attention/backends/placeholder_attn.py rename to vllm/attention/backends/placeholder_attn.py diff --git a/attention/backends/rocm_aiter_mla.py b/vllm/attention/backends/rocm_aiter_mla.py similarity index 100% rename from attention/backends/rocm_aiter_mla.py rename to vllm/attention/backends/rocm_aiter_mla.py diff --git a/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py similarity index 100% rename from attention/backends/rocm_flash_attn.py rename to vllm/attention/backends/rocm_flash_attn.py diff --git a/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py similarity index 100% rename from attention/backends/torch_sdpa.py rename to vllm/attention/backends/torch_sdpa.py diff --git a/attention/backends/triton_mla.py b/vllm/attention/backends/triton_mla.py similarity index 100% rename from attention/backends/triton_mla.py rename to vllm/attention/backends/triton_mla.py diff --git a/attention/backends/utils.py b/vllm/attention/backends/utils.py similarity index 100% rename from attention/backends/utils.py rename to vllm/attention/backends/utils.py diff --git a/attention/backends/xformers.py b/vllm/attention/backends/xformers.py similarity index 100% rename from attention/backends/xformers.py rename to vllm/attention/backends/xformers.py diff --git a/attention/layer.py b/vllm/attention/layer.py similarity index 100% rename from attention/layer.py rename to vllm/attention/layer.py diff --git a/attention/ops/__init__.py b/vllm/attention/ops/__init__.py similarity index 100% rename from attention/ops/__init__.py rename to vllm/attention/ops/__init__.py diff --git a/attention/ops/blocksparse_attention/__init__.py b/vllm/attention/ops/blocksparse_attention/__init__.py similarity index 100% rename from attention/ops/blocksparse_attention/__init__.py rename to vllm/attention/ops/blocksparse_attention/__init__.py diff --git a/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py b/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py similarity index 100% rename from attention/ops/blocksparse_attention/blocksparse_attention_kernel.py rename to vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py diff --git a/attention/ops/blocksparse_attention/interface.py b/vllm/attention/ops/blocksparse_attention/interface.py similarity index 100% rename from attention/ops/blocksparse_attention/interface.py rename to vllm/attention/ops/blocksparse_attention/interface.py diff --git a/attention/ops/blocksparse_attention/utils.py b/vllm/attention/ops/blocksparse_attention/utils.py similarity index 100% rename from attention/ops/blocksparse_attention/utils.py rename to vllm/attention/ops/blocksparse_attention/utils.py diff --git a/attention/ops/chunked_prefill_paged_decode.py b/vllm/attention/ops/chunked_prefill_paged_decode.py similarity index 95% rename from attention/ops/chunked_prefill_paged_decode.py rename to vllm/attention/ops/chunked_prefill_paged_decode.py index 4f83934..b079ab1 100644 --- a/attention/ops/chunked_prefill_paged_decode.py +++ b/vllm/attention/ops/chunked_prefill_paged_decode.py @@ -28,6 +28,7 @@ def kernel_paged_attention_2d( query_ptr, # [num_tokens, num_query_heads, head_size] key_cache_ptr, # [num_blks, num_kv_heads, head_size // x, blk_size, x] value_cache_ptr, # [num_blks, num_kv_heads, head_size, blk_size] + sink_ptr, # [num_query_heads] block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] seq_lens_ptr, # [num_seqs] alibi_slopes_ptr, # [num_query_heads] @@ -59,6 +60,7 @@ def kernel_paged_attention_2d( stride_v_cache_3: tl.int64, # int filter_by_query_len: tl.constexpr, # bool query_start_len_ptr, # [num_seqs+1] + USE_SINKS: tl.constexpr, # bool ): seq_idx = tl.program_id(0) kv_head_idx = tl.program_id(1) @@ -95,7 +97,18 @@ def kernel_paged_attention_2d( block_table_offset = seq_idx * block_table_stride - M = tl.full([num_queries_per_kv_padded], float("-inf"), dtype=tl.float32) + if not USE_SINKS: + M = tl.full([num_queries_per_kv_padded], + float("-inf"), + dtype=tl.float32) + else: + M = tl.load( + sink_ptr + query_head_idx, + mask=head_mask, + other=float("-inf"), + ).to(dtype=tl.float32) + # M = tl.full([num_queries_per_kv_padded], float("-inf"), dtype=tl.float32) + L = tl.full([num_queries_per_kv_padded], 1.0, dtype=tl.float32) acc = tl.zeros([num_queries_per_kv_padded, HEAD_SIZE_PADDED], dtype=tl.float32) @@ -223,6 +236,8 @@ def chunked_prefill_paged_decode( alibi_slopes=None, sliding_window=None, sm_scale=None, + # Optional tensor for sinks + sinks=None, ): if sm_scale is None: @@ -253,6 +268,7 @@ def chunked_prefill_paged_decode( sliding_window=sliding_window, sm_scale=sm_scale, skip_decode=True, + sinks=sinks, ) block_size = value_cache.shape[3] @@ -285,7 +301,7 @@ def chunked_prefill_paged_decode( block_size, num_queries_per_kv, max_seq_len, sliding_window, - kv_cache_dtype, alibi_slopes) + kv_cache_dtype, alibi_slopes, sinks,) if use_custom: _PARTITION_SIZE_ROCM = 256 max_num_partitions = ((max_seq_len + _PARTITION_SIZE_ROCM - 1) // @@ -334,6 +350,7 @@ def chunked_prefill_paged_decode( query_ptr=query, key_cache_ptr=key_cache, value_cache_ptr=value_cache, + sink_ptr=sinks, block_tables_ptr=block_table, seq_lens_ptr=seq_lens, alibi_slopes_ptr=alibi_slopes, @@ -365,4 +382,5 @@ def chunked_prefill_paged_decode( stride_v_cache_3=value_cache.stride(3), filter_by_query_len=True, query_start_len_ptr=query_start_loc, + USE_SINKS=sinks is not None, ) diff --git a/attention/ops/flashmla.py b/vllm/attention/ops/flashmla.py similarity index 100% rename from attention/ops/flashmla.py rename to vllm/attention/ops/flashmla.py diff --git a/attention/ops/hpu_paged_attn.py b/vllm/attention/ops/hpu_paged_attn.py similarity index 100% rename from attention/ops/hpu_paged_attn.py rename to vllm/attention/ops/hpu_paged_attn.py diff --git a/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py similarity index 100% rename from attention/ops/ipex_attn.py rename to vllm/attention/ops/ipex_attn.py diff --git a/attention/ops/merge_attn_states.py b/vllm/attention/ops/merge_attn_states.py similarity index 100% rename from attention/ops/merge_attn_states.py rename to vllm/attention/ops/merge_attn_states.py diff --git a/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py similarity index 100% rename from attention/ops/nki_flash_attn.py rename to vllm/attention/ops/nki_flash_attn.py diff --git a/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py similarity index 100% rename from attention/ops/paged_attn.py rename to vllm/attention/ops/paged_attn.py diff --git a/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py similarity index 100% rename from attention/ops/prefix_prefill.py rename to vllm/attention/ops/prefix_prefill.py diff --git a/attention/ops/rocm_aiter_mla.py b/vllm/attention/ops/rocm_aiter_mla.py similarity index 100% rename from attention/ops/rocm_aiter_mla.py rename to vllm/attention/ops/rocm_aiter_mla.py diff --git a/attention/ops/rocm_aiter_paged_attn.py b/vllm/attention/ops/rocm_aiter_paged_attn.py similarity index 100% rename from attention/ops/rocm_aiter_paged_attn.py rename to vllm/attention/ops/rocm_aiter_paged_attn.py diff --git a/attention/ops/triton_decode_attention.py b/vllm/attention/ops/triton_decode_attention.py similarity index 100% rename from attention/ops/triton_decode_attention.py rename to vllm/attention/ops/triton_decode_attention.py diff --git a/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py similarity index 100% rename from attention/ops/triton_flash_attention.py rename to vllm/attention/ops/triton_flash_attention.py diff --git a/attention/ops/triton_merge_attn_states.py b/vllm/attention/ops/triton_merge_attn_states.py similarity index 100% rename from attention/ops/triton_merge_attn_states.py rename to vllm/attention/ops/triton_merge_attn_states.py diff --git a/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py similarity index 94% rename from attention/ops/triton_unified_attention.py rename to vllm/attention/ops/triton_unified_attention.py index 92c09e6..585238e 100644 --- a/attention/ops/triton_unified_attention.py +++ b/vllm/attention/ops/triton_unified_attention.py @@ -34,6 +34,7 @@ def kernel_unified_attention_2d( query_ptr, # [num_tokens, num_query_heads, head_size] key_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] value_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] + sink_ptr, # [num_query_heads] block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] seq_lens_ptr, # [num_seqs] alibi_slopes_ptr, # [num_query_heads] @@ -53,6 +54,7 @@ def kernel_unified_attention_2d( HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 USE_ALIBI_SLOPES: tl.constexpr, # bool USE_SOFTCAP: tl.constexpr, # bool + USE_SINKS: tl.constexpr, # bool SLIDING_WINDOW: tl.constexpr, # int stride_k_cache_0: tl.int64, # int stride_k_cache_1: tl.int64, # int @@ -119,7 +121,16 @@ def kernel_unified_attention_2d( block_table_offset = seq_idx * block_table_stride - M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + if not USE_SINKS: + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + else: + M = tl.load( + sink_ptr + query_offset_1, + mask=query_mask_1, + other=float("-inf"), + ).to(dtype=tl.float32) + # M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + L = tl.full([BLOCK_M], 1.0, dtype=tl.float32) acc = tl.zeros([BLOCK_M, HEAD_SIZE_PADDED], dtype=tl.float32) @@ -260,6 +271,8 @@ def unified_attention( k_descale, v_descale, alibi_slopes=None, + # Optional tensor for sinks + sinks=None, ): assert causal, "Only causal attention is supported" assert q_descale is None, "Q scales not supported" @@ -267,6 +280,10 @@ def unified_attention( block_size = v.shape[1] assert q.element_size() >= 2 or block_size >= 32, \ "Block size must be at least 32 for fp8" + + if sinks is not None: + assert sinks.shape[0] == q.shape[1], \ + "Sinks must be num_query_heads size" use_alibi_slopes = alibi_slopes is not None @@ -299,6 +316,7 @@ def unified_attention( query_ptr=q, key_cache_ptr=k, value_cache_ptr=v, + sink_ptr=sinks, block_tables_ptr=block_table, seq_lens_ptr=seqused_k, alibi_slopes_ptr=alibi_slopes, @@ -318,6 +336,7 @@ def unified_attention( HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), USE_ALIBI_SLOPES=use_alibi_slopes, USE_SOFTCAP=(softcap > 0), + USE_SINKS=(sinks is not None), SLIDING_WINDOW=(1 + window_size[0]), stride_k_cache_0=k.stride(0), stride_k_cache_1=k.stride(1), diff --git a/attention/selector.py b/vllm/attention/selector.py similarity index 100% rename from attention/selector.py rename to vllm/attention/selector.py diff --git a/attention/utils/fa_utils.py b/vllm/attention/utils/fa_utils.py similarity index 100% rename from attention/utils/fa_utils.py rename to vllm/attention/utils/fa_utils.py diff --git a/beam_search.py b/vllm/beam_search.py similarity index 100% rename from beam_search.py rename to vllm/beam_search.py diff --git a/benchmarks/__init__.py b/vllm/benchmarks/__init__.py similarity index 100% rename from benchmarks/__init__.py rename to vllm/benchmarks/__init__.py diff --git a/benchmarks/datasets.py b/vllm/benchmarks/datasets.py similarity index 100% rename from benchmarks/datasets.py rename to vllm/benchmarks/datasets.py diff --git a/benchmarks/endpoint_request_func.py b/vllm/benchmarks/endpoint_request_func.py similarity index 100% rename from benchmarks/endpoint_request_func.py rename to vllm/benchmarks/endpoint_request_func.py diff --git a/benchmarks/latency.py b/vllm/benchmarks/latency.py similarity index 100% rename from benchmarks/latency.py rename to vllm/benchmarks/latency.py diff --git a/benchmarks/serve.py b/vllm/benchmarks/serve.py similarity index 100% rename from benchmarks/serve.py rename to vllm/benchmarks/serve.py diff --git a/benchmarks/throughput.py b/vllm/benchmarks/throughput.py similarity index 100% rename from benchmarks/throughput.py rename to vllm/benchmarks/throughput.py diff --git a/benchmarks/utils.py b/vllm/benchmarks/utils.py similarity index 100% rename from benchmarks/utils.py rename to vllm/benchmarks/utils.py diff --git a/collect_env.py b/vllm/collect_env.py similarity index 100% rename from collect_env.py rename to vllm/collect_env.py diff --git a/compilation/__init__.py b/vllm/compilation/__init__.py similarity index 100% rename from compilation/__init__.py rename to vllm/compilation/__init__.py diff --git a/compilation/activation_quant_fusion.py b/vllm/compilation/activation_quant_fusion.py similarity index 100% rename from compilation/activation_quant_fusion.py rename to vllm/compilation/activation_quant_fusion.py diff --git a/compilation/backends.py b/vllm/compilation/backends.py similarity index 100% rename from compilation/backends.py rename to vllm/compilation/backends.py diff --git a/compilation/base_piecewise_backend.py b/vllm/compilation/base_piecewise_backend.py similarity index 100% rename from compilation/base_piecewise_backend.py rename to vllm/compilation/base_piecewise_backend.py diff --git a/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py similarity index 100% rename from compilation/collective_fusion.py rename to vllm/compilation/collective_fusion.py diff --git a/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py similarity index 100% rename from compilation/compiler_interface.py rename to vllm/compilation/compiler_interface.py diff --git a/compilation/counter.py b/vllm/compilation/counter.py similarity index 100% rename from compilation/counter.py rename to vllm/compilation/counter.py diff --git a/compilation/cuda_piecewise_backend.py b/vllm/compilation/cuda_piecewise_backend.py similarity index 100% rename from compilation/cuda_piecewise_backend.py rename to vllm/compilation/cuda_piecewise_backend.py diff --git a/compilation/decorators.py b/vllm/compilation/decorators.py similarity index 100% rename from compilation/decorators.py rename to vllm/compilation/decorators.py diff --git a/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py similarity index 100% rename from compilation/fix_functionalization.py rename to vllm/compilation/fix_functionalization.py diff --git a/compilation/fusion.py b/vllm/compilation/fusion.py similarity index 100% rename from compilation/fusion.py rename to vllm/compilation/fusion.py diff --git a/compilation/fx_utils.py b/vllm/compilation/fx_utils.py similarity index 100% rename from compilation/fx_utils.py rename to vllm/compilation/fx_utils.py diff --git a/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py similarity index 100% rename from compilation/inductor_pass.py rename to vllm/compilation/inductor_pass.py diff --git a/compilation/monitor.py b/vllm/compilation/monitor.py similarity index 100% rename from compilation/monitor.py rename to vllm/compilation/monitor.py diff --git a/compilation/multi_output_match.py b/vllm/compilation/multi_output_match.py similarity index 100% rename from compilation/multi_output_match.py rename to vllm/compilation/multi_output_match.py diff --git a/compilation/noop_elimination.py b/vllm/compilation/noop_elimination.py similarity index 100% rename from compilation/noop_elimination.py rename to vllm/compilation/noop_elimination.py diff --git a/compilation/pass_manager.py b/vllm/compilation/pass_manager.py similarity index 100% rename from compilation/pass_manager.py rename to vllm/compilation/pass_manager.py diff --git a/compilation/sequence_parallelism.py b/vllm/compilation/sequence_parallelism.py similarity index 100% rename from compilation/sequence_parallelism.py rename to vllm/compilation/sequence_parallelism.py diff --git a/compilation/torch25_custom_graph_pass.py b/vllm/compilation/torch25_custom_graph_pass.py similarity index 100% rename from compilation/torch25_custom_graph_pass.py rename to vllm/compilation/torch25_custom_graph_pass.py diff --git a/compilation/vllm_inductor_pass.py b/vllm/compilation/vllm_inductor_pass.py similarity index 100% rename from compilation/vllm_inductor_pass.py rename to vllm/compilation/vllm_inductor_pass.py diff --git a/compilation/wrapper.py b/vllm/compilation/wrapper.py similarity index 100% rename from compilation/wrapper.py rename to vllm/compilation/wrapper.py diff --git a/config.py b/vllm/config.py similarity index 100% rename from config.py rename to vllm/config.py diff --git a/connections.py b/vllm/connections.py similarity index 100% rename from connections.py rename to vllm/connections.py diff --git a/core/__init__.py b/vllm/core/__init__.py similarity index 100% rename from core/__init__.py rename to vllm/core/__init__.py diff --git a/core/block/__init__.py b/vllm/core/block/__init__.py similarity index 100% rename from core/block/__init__.py rename to vllm/core/block/__init__.py diff --git a/core/block/block_table.py b/vllm/core/block/block_table.py similarity index 100% rename from core/block/block_table.py rename to vllm/core/block/block_table.py diff --git a/core/block/common.py b/vllm/core/block/common.py similarity index 100% rename from core/block/common.py rename to vllm/core/block/common.py diff --git a/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py similarity index 100% rename from core/block/cpu_gpu_block_allocator.py rename to vllm/core/block/cpu_gpu_block_allocator.py diff --git a/core/block/interfaces.py b/vllm/core/block/interfaces.py similarity index 100% rename from core/block/interfaces.py rename to vllm/core/block/interfaces.py diff --git a/core/block/naive_block.py b/vllm/core/block/naive_block.py similarity index 100% rename from core/block/naive_block.py rename to vllm/core/block/naive_block.py diff --git a/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py similarity index 100% rename from core/block/prefix_caching_block.py rename to vllm/core/block/prefix_caching_block.py diff --git a/core/block/utils.py b/vllm/core/block/utils.py similarity index 100% rename from core/block/utils.py rename to vllm/core/block/utils.py diff --git a/core/block_manager.py b/vllm/core/block_manager.py similarity index 100% rename from core/block_manager.py rename to vllm/core/block_manager.py diff --git a/core/evictor.py b/vllm/core/evictor.py similarity index 100% rename from core/evictor.py rename to vllm/core/evictor.py diff --git a/core/interfaces.py b/vllm/core/interfaces.py similarity index 100% rename from core/interfaces.py rename to vllm/core/interfaces.py diff --git a/core/placeholder_block_space_manager.py b/vllm/core/placeholder_block_space_manager.py similarity index 100% rename from core/placeholder_block_space_manager.py rename to vllm/core/placeholder_block_space_manager.py diff --git a/core/scheduler.py b/vllm/core/scheduler.py similarity index 100% rename from core/scheduler.py rename to vllm/core/scheduler.py diff --git a/cumem_allocator.abi3.so b/vllm/cumem_allocator.abi3.so similarity index 100% rename from cumem_allocator.abi3.so rename to vllm/cumem_allocator.abi3.so diff --git a/device_allocator/__init__.py b/vllm/device_allocator/__init__.py similarity index 100% rename from device_allocator/__init__.py rename to vllm/device_allocator/__init__.py diff --git a/device_allocator/cumem.py b/vllm/device_allocator/cumem.py similarity index 100% rename from device_allocator/cumem.py rename to vllm/device_allocator/cumem.py diff --git a/distributed/__init__.py b/vllm/distributed/__init__.py similarity index 100% rename from distributed/__init__.py rename to vllm/distributed/__init__.py diff --git a/distributed/communication_op.py b/vllm/distributed/communication_op.py similarity index 100% rename from distributed/communication_op.py rename to vllm/distributed/communication_op.py diff --git a/distributed/device_communicators/__init__.py b/vllm/distributed/device_communicators/__init__.py similarity index 100% rename from distributed/device_communicators/__init__.py rename to vllm/distributed/device_communicators/__init__.py diff --git a/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py similarity index 100% rename from distributed/device_communicators/all2all.py rename to vllm/distributed/device_communicators/all2all.py diff --git a/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py similarity index 100% rename from distributed/device_communicators/base_device_communicator.py rename to vllm/distributed/device_communicators/base_device_communicator.py diff --git a/distributed/device_communicators/cpu_communicator.py b/vllm/distributed/device_communicators/cpu_communicator.py similarity index 100% rename from distributed/device_communicators/cpu_communicator.py rename to vllm/distributed/device_communicators/cpu_communicator.py diff --git a/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py similarity index 100% rename from distributed/device_communicators/cuda_communicator.py rename to vllm/distributed/device_communicators/cuda_communicator.py diff --git a/distributed/device_communicators/cuda_wrapper.py b/vllm/distributed/device_communicators/cuda_wrapper.py similarity index 100% rename from distributed/device_communicators/cuda_wrapper.py rename to vllm/distributed/device_communicators/cuda_wrapper.py diff --git a/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py similarity index 100% rename from distributed/device_communicators/custom_all_reduce.py rename to vllm/distributed/device_communicators/custom_all_reduce.py diff --git a/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py similarity index 100% rename from distributed/device_communicators/custom_all_reduce_utils.py rename to vllm/distributed/device_communicators/custom_all_reduce_utils.py diff --git a/distributed/device_communicators/hpu_communicator.py b/vllm/distributed/device_communicators/hpu_communicator.py similarity index 100% rename from distributed/device_communicators/hpu_communicator.py rename to vllm/distributed/device_communicators/hpu_communicator.py diff --git a/distributed/device_communicators/neuron_communicator.py b/vllm/distributed/device_communicators/neuron_communicator.py similarity index 100% rename from distributed/device_communicators/neuron_communicator.py rename to vllm/distributed/device_communicators/neuron_communicator.py diff --git a/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py similarity index 100% rename from distributed/device_communicators/pynccl.py rename to vllm/distributed/device_communicators/pynccl.py diff --git a/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py similarity index 100% rename from distributed/device_communicators/pynccl_wrapper.py rename to vllm/distributed/device_communicators/pynccl_wrapper.py diff --git a/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py similarity index 100% rename from distributed/device_communicators/shm_broadcast.py rename to vllm/distributed/device_communicators/shm_broadcast.py diff --git a/distributed/device_communicators/tpu_communicator.py b/vllm/distributed/device_communicators/tpu_communicator.py similarity index 100% rename from distributed/device_communicators/tpu_communicator.py rename to vllm/distributed/device_communicators/tpu_communicator.py diff --git a/distributed/device_communicators/xpu_communicator.py b/vllm/distributed/device_communicators/xpu_communicator.py similarity index 100% rename from distributed/device_communicators/xpu_communicator.py rename to vllm/distributed/device_communicators/xpu_communicator.py diff --git a/distributed/kv_events.py b/vllm/distributed/kv_events.py similarity index 100% rename from distributed/kv_events.py rename to vllm/distributed/kv_events.py diff --git a/distributed/kv_transfer/README.md b/vllm/distributed/kv_transfer/README.md similarity index 100% rename from distributed/kv_transfer/README.md rename to vllm/distributed/kv_transfer/README.md diff --git a/distributed/kv_transfer/__init__.py b/vllm/distributed/kv_transfer/__init__.py similarity index 100% rename from distributed/kv_transfer/__init__.py rename to vllm/distributed/kv_transfer/__init__.py diff --git a/distributed/kv_transfer/disagg_prefill_workflow.jpg b/vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg similarity index 100% rename from distributed/kv_transfer/disagg_prefill_workflow.jpg rename to vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg diff --git a/distributed/kv_transfer/kv_connector/__init__.py b/vllm/distributed/kv_transfer/kv_connector/__init__.py similarity index 100% rename from distributed/kv_transfer/kv_connector/__init__.py rename to vllm/distributed/kv_transfer/kv_connector/__init__.py diff --git a/distributed/kv_transfer/kv_connector/base.py b/vllm/distributed/kv_transfer/kv_connector/base.py similarity index 100% rename from distributed/kv_transfer/kv_connector/base.py rename to vllm/distributed/kv_transfer/kv_connector/base.py diff --git a/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py similarity index 100% rename from distributed/kv_transfer/kv_connector/factory.py rename to vllm/distributed/kv_transfer/kv_connector/factory.py diff --git a/distributed/kv_transfer/kv_connector/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py similarity index 100% rename from distributed/kv_transfer/kv_connector/lmcache_connector.py rename to vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py diff --git a/distributed/kv_transfer/kv_connector/mooncake_store_connector.py b/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py similarity index 100% rename from distributed/kv_transfer/kv_connector/mooncake_store_connector.py rename to vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py diff --git a/distributed/kv_transfer/kv_connector/simple_connector.py b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py similarity index 100% rename from distributed/kv_transfer/kv_connector/simple_connector.py rename to vllm/distributed/kv_transfer/kv_connector/simple_connector.py diff --git a/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py similarity index 100% rename from distributed/kv_transfer/kv_connector/utils.py rename to vllm/distributed/kv_transfer/kv_connector/utils.py diff --git a/distributed/kv_transfer/kv_connector/v1/__init__.py b/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py similarity index 100% rename from distributed/kv_transfer/kv_connector/v1/__init__.py rename to vllm/distributed/kv_transfer/kv_connector/v1/__init__.py diff --git a/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py similarity index 100% rename from distributed/kv_transfer/kv_connector/v1/base.py rename to vllm/distributed/kv_transfer/kv_connector/v1/base.py diff --git a/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py similarity index 100% rename from distributed/kv_transfer/kv_connector/v1/lmcache_connector.py rename to vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py diff --git a/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py similarity index 100% rename from distributed/kv_transfer/kv_connector/v1/multi_connector.py rename to vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py diff --git a/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py similarity index 100% rename from distributed/kv_transfer/kv_connector/v1/nixl_connector.py rename to vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py diff --git a/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py similarity index 100% rename from distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py rename to vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py diff --git a/distributed/kv_transfer/kv_connector_agent.py b/vllm/distributed/kv_transfer/kv_connector_agent.py similarity index 100% rename from distributed/kv_transfer/kv_connector_agent.py rename to vllm/distributed/kv_transfer/kv_connector_agent.py diff --git a/distributed/kv_transfer/kv_lookup_buffer/__init__.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py similarity index 100% rename from distributed/kv_transfer/kv_lookup_buffer/__init__.py rename to vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py diff --git a/distributed/kv_transfer/kv_lookup_buffer/base.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py similarity index 100% rename from distributed/kv_transfer/kv_lookup_buffer/base.py rename to vllm/distributed/kv_transfer/kv_lookup_buffer/base.py diff --git a/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py similarity index 100% rename from distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py rename to vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py diff --git a/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py similarity index 100% rename from distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py rename to vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py diff --git a/distributed/kv_transfer/kv_pipe/__init__.py b/vllm/distributed/kv_transfer/kv_pipe/__init__.py similarity index 100% rename from distributed/kv_transfer/kv_pipe/__init__.py rename to vllm/distributed/kv_transfer/kv_pipe/__init__.py diff --git a/distributed/kv_transfer/kv_pipe/base.py b/vllm/distributed/kv_transfer/kv_pipe/base.py similarity index 100% rename from distributed/kv_transfer/kv_pipe/base.py rename to vllm/distributed/kv_transfer/kv_pipe/base.py diff --git a/distributed/kv_transfer/kv_pipe/mooncake_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py similarity index 100% rename from distributed/kv_transfer/kv_pipe/mooncake_pipe.py rename to vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py diff --git a/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py similarity index 100% rename from distributed/kv_transfer/kv_pipe/pynccl_pipe.py rename to vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py diff --git a/distributed/kv_transfer/kv_transfer_state.py b/vllm/distributed/kv_transfer/kv_transfer_state.py similarity index 100% rename from distributed/kv_transfer/kv_transfer_state.py rename to vllm/distributed/kv_transfer/kv_transfer_state.py diff --git a/distributed/parallel_state.py b/vllm/distributed/parallel_state.py similarity index 100% rename from distributed/parallel_state.py rename to vllm/distributed/parallel_state.py diff --git a/distributed/tpu_distributed_utils.py b/vllm/distributed/tpu_distributed_utils.py similarity index 100% rename from distributed/tpu_distributed_utils.py rename to vllm/distributed/tpu_distributed_utils.py diff --git a/distributed/utils.py b/vllm/distributed/utils.py similarity index 100% rename from distributed/utils.py rename to vllm/distributed/utils.py diff --git a/engine/__init__.py b/vllm/engine/__init__.py similarity index 100% rename from engine/__init__.py rename to vllm/engine/__init__.py diff --git a/engine/arg_utils.py b/vllm/engine/arg_utils.py similarity index 100% rename from engine/arg_utils.py rename to vllm/engine/arg_utils.py diff --git a/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py similarity index 100% rename from engine/async_llm_engine.py rename to vllm/engine/async_llm_engine.py diff --git a/engine/async_timeout.py b/vllm/engine/async_timeout.py similarity index 100% rename from engine/async_timeout.py rename to vllm/engine/async_timeout.py diff --git a/engine/llm_engine.py b/vllm/engine/llm_engine.py similarity index 100% rename from engine/llm_engine.py rename to vllm/engine/llm_engine.py diff --git a/engine/metrics.py b/vllm/engine/metrics.py similarity index 100% rename from engine/metrics.py rename to vllm/engine/metrics.py diff --git a/engine/metrics_types.py b/vllm/engine/metrics_types.py similarity index 100% rename from engine/metrics_types.py rename to vllm/engine/metrics_types.py diff --git a/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py similarity index 100% rename from engine/multiprocessing/__init__.py rename to vllm/engine/multiprocessing/__init__.py diff --git a/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py similarity index 100% rename from engine/multiprocessing/client.py rename to vllm/engine/multiprocessing/client.py diff --git a/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py similarity index 100% rename from engine/multiprocessing/engine.py rename to vllm/engine/multiprocessing/engine.py diff --git a/engine/output_processor/__init__.py b/vllm/engine/output_processor/__init__.py similarity index 100% rename from engine/output_processor/__init__.py rename to vllm/engine/output_processor/__init__.py diff --git a/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py similarity index 100% rename from engine/output_processor/interfaces.py rename to vllm/engine/output_processor/interfaces.py diff --git a/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py similarity index 100% rename from engine/output_processor/multi_step.py rename to vllm/engine/output_processor/multi_step.py diff --git a/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py similarity index 100% rename from engine/output_processor/single_step.py rename to vllm/engine/output_processor/single_step.py diff --git a/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py similarity index 100% rename from engine/output_processor/stop_checker.py rename to vllm/engine/output_processor/stop_checker.py diff --git a/engine/output_processor/util.py b/vllm/engine/output_processor/util.py similarity index 100% rename from engine/output_processor/util.py rename to vllm/engine/output_processor/util.py diff --git a/engine/protocol.py b/vllm/engine/protocol.py similarity index 100% rename from engine/protocol.py rename to vllm/engine/protocol.py diff --git a/entrypoints/__init__.py b/vllm/entrypoints/__init__.py similarity index 100% rename from entrypoints/__init__.py rename to vllm/entrypoints/__init__.py diff --git a/entrypoints/api_server.py b/vllm/entrypoints/api_server.py similarity index 100% rename from entrypoints/api_server.py rename to vllm/entrypoints/api_server.py diff --git a/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py similarity index 100% rename from entrypoints/chat_utils.py rename to vllm/entrypoints/chat_utils.py diff --git a/entrypoints/cli/__init__.py b/vllm/entrypoints/cli/__init__.py similarity index 100% rename from entrypoints/cli/__init__.py rename to vllm/entrypoints/cli/__init__.py diff --git a/entrypoints/cli/benchmark/__init__.py b/vllm/entrypoints/cli/benchmark/__init__.py similarity index 100% rename from entrypoints/cli/benchmark/__init__.py rename to vllm/entrypoints/cli/benchmark/__init__.py diff --git a/entrypoints/cli/benchmark/base.py b/vllm/entrypoints/cli/benchmark/base.py similarity index 100% rename from entrypoints/cli/benchmark/base.py rename to vllm/entrypoints/cli/benchmark/base.py diff --git a/entrypoints/cli/benchmark/latency.py b/vllm/entrypoints/cli/benchmark/latency.py similarity index 100% rename from entrypoints/cli/benchmark/latency.py rename to vllm/entrypoints/cli/benchmark/latency.py diff --git a/entrypoints/cli/benchmark/main.py b/vllm/entrypoints/cli/benchmark/main.py similarity index 100% rename from entrypoints/cli/benchmark/main.py rename to vllm/entrypoints/cli/benchmark/main.py diff --git a/entrypoints/cli/benchmark/serve.py b/vllm/entrypoints/cli/benchmark/serve.py similarity index 100% rename from entrypoints/cli/benchmark/serve.py rename to vllm/entrypoints/cli/benchmark/serve.py diff --git a/entrypoints/cli/benchmark/throughput.py b/vllm/entrypoints/cli/benchmark/throughput.py similarity index 100% rename from entrypoints/cli/benchmark/throughput.py rename to vllm/entrypoints/cli/benchmark/throughput.py diff --git a/entrypoints/cli/collect_env.py b/vllm/entrypoints/cli/collect_env.py similarity index 100% rename from entrypoints/cli/collect_env.py rename to vllm/entrypoints/cli/collect_env.py diff --git a/entrypoints/cli/main.py b/vllm/entrypoints/cli/main.py similarity index 100% rename from entrypoints/cli/main.py rename to vllm/entrypoints/cli/main.py diff --git a/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py similarity index 100% rename from entrypoints/cli/openai.py rename to vllm/entrypoints/cli/openai.py diff --git a/entrypoints/cli/run_batch.py b/vllm/entrypoints/cli/run_batch.py similarity index 100% rename from entrypoints/cli/run_batch.py rename to vllm/entrypoints/cli/run_batch.py diff --git a/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py similarity index 100% rename from entrypoints/cli/serve.py rename to vllm/entrypoints/cli/serve.py diff --git a/entrypoints/cli/types.py b/vllm/entrypoints/cli/types.py similarity index 100% rename from entrypoints/cli/types.py rename to vllm/entrypoints/cli/types.py diff --git a/entrypoints/launcher.py b/vllm/entrypoints/launcher.py similarity index 100% rename from entrypoints/launcher.py rename to vllm/entrypoints/launcher.py diff --git a/entrypoints/llm.py b/vllm/entrypoints/llm.py similarity index 100% rename from entrypoints/llm.py rename to vllm/entrypoints/llm.py diff --git a/entrypoints/logger.py b/vllm/entrypoints/logger.py similarity index 100% rename from entrypoints/logger.py rename to vllm/entrypoints/logger.py diff --git a/entrypoints/openai/__init__.py b/vllm/entrypoints/openai/__init__.py similarity index 100% rename from entrypoints/openai/__init__.py rename to vllm/entrypoints/openai/__init__.py diff --git a/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py similarity index 100% rename from entrypoints/openai/api_server.py rename to vllm/entrypoints/openai/api_server.py diff --git a/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py similarity index 100% rename from entrypoints/openai/cli_args.py rename to vllm/entrypoints/openai/cli_args.py diff --git a/entrypoints/openai/logits_processors.py b/vllm/entrypoints/openai/logits_processors.py similarity index 100% rename from entrypoints/openai/logits_processors.py rename to vllm/entrypoints/openai/logits_processors.py diff --git a/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py similarity index 100% rename from entrypoints/openai/protocol.py rename to vllm/entrypoints/openai/protocol.py diff --git a/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py similarity index 100% rename from entrypoints/openai/run_batch.py rename to vllm/entrypoints/openai/run_batch.py diff --git a/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py similarity index 100% rename from entrypoints/openai/serving_chat.py rename to vllm/entrypoints/openai/serving_chat.py diff --git a/entrypoints/openai/serving_classification.py b/vllm/entrypoints/openai/serving_classification.py similarity index 100% rename from entrypoints/openai/serving_classification.py rename to vllm/entrypoints/openai/serving_classification.py diff --git a/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py similarity index 100% rename from entrypoints/openai/serving_completion.py rename to vllm/entrypoints/openai/serving_completion.py diff --git a/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py similarity index 100% rename from entrypoints/openai/serving_embedding.py rename to vllm/entrypoints/openai/serving_embedding.py diff --git a/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py similarity index 100% rename from entrypoints/openai/serving_engine.py rename to vllm/entrypoints/openai/serving_engine.py diff --git a/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py similarity index 100% rename from entrypoints/openai/serving_models.py rename to vllm/entrypoints/openai/serving_models.py diff --git a/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py similarity index 100% rename from entrypoints/openai/serving_pooling.py rename to vllm/entrypoints/openai/serving_pooling.py diff --git a/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py similarity index 100% rename from entrypoints/openai/serving_score.py rename to vllm/entrypoints/openai/serving_score.py diff --git a/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py similarity index 100% rename from entrypoints/openai/serving_tokenization.py rename to vllm/entrypoints/openai/serving_tokenization.py diff --git a/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py similarity index 100% rename from entrypoints/openai/serving_transcription.py rename to vllm/entrypoints/openai/serving_transcription.py diff --git a/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py similarity index 100% rename from entrypoints/openai/tool_parsers/__init__.py rename to vllm/entrypoints/openai/tool_parsers/__init__.py diff --git a/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py similarity index 100% rename from entrypoints/openai/tool_parsers/abstract_tool_parser.py rename to vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py diff --git a/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py similarity index 100% rename from entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py rename to vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py diff --git a/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py similarity index 100% rename from entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py rename to vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py diff --git a/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py similarity index 100% rename from entrypoints/openai/tool_parsers/granite_tool_parser.py rename to vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py diff --git a/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py similarity index 100% rename from entrypoints/openai/tool_parsers/hermes_tool_parser.py rename to vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py diff --git a/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py similarity index 100% rename from entrypoints/openai/tool_parsers/internlm2_tool_parser.py rename to vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py diff --git a/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py similarity index 100% rename from entrypoints/openai/tool_parsers/jamba_tool_parser.py rename to vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py diff --git a/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py similarity index 100% rename from entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py rename to vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py diff --git a/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py similarity index 100% rename from entrypoints/openai/tool_parsers/llama_tool_parser.py rename to vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py diff --git a/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py similarity index 100% rename from entrypoints/openai/tool_parsers/mistral_tool_parser.py rename to vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py diff --git a/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py similarity index 100% rename from entrypoints/openai/tool_parsers/phi4mini_tool_parser.py rename to vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py diff --git a/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py similarity index 100% rename from entrypoints/openai/tool_parsers/pythonic_tool_parser.py rename to vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py diff --git a/entrypoints/openai/tool_parsers/utils.py b/vllm/entrypoints/openai/tool_parsers/utils.py similarity index 100% rename from entrypoints/openai/tool_parsers/utils.py rename to vllm/entrypoints/openai/tool_parsers/utils.py diff --git a/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py similarity index 100% rename from entrypoints/score_utils.py rename to vllm/entrypoints/score_utils.py diff --git a/entrypoints/ssl.py b/vllm/entrypoints/ssl.py similarity index 100% rename from entrypoints/ssl.py rename to vllm/entrypoints/ssl.py diff --git a/entrypoints/utils.py b/vllm/entrypoints/utils.py similarity index 100% rename from entrypoints/utils.py rename to vllm/entrypoints/utils.py diff --git a/env_override.py b/vllm/env_override.py similarity index 100% rename from env_override.py rename to vllm/env_override.py diff --git a/envs.py b/vllm/envs.py similarity index 100% rename from envs.py rename to vllm/envs.py diff --git a/executor/__init__.py b/vllm/executor/__init__.py similarity index 100% rename from executor/__init__.py rename to vllm/executor/__init__.py diff --git a/executor/executor_base.py b/vllm/executor/executor_base.py similarity index 100% rename from executor/executor_base.py rename to vllm/executor/executor_base.py diff --git a/executor/mp_distributed_executor.py b/vllm/executor/mp_distributed_executor.py similarity index 100% rename from executor/mp_distributed_executor.py rename to vllm/executor/mp_distributed_executor.py diff --git a/executor/msgspec_utils.py b/vllm/executor/msgspec_utils.py similarity index 100% rename from executor/msgspec_utils.py rename to vllm/executor/msgspec_utils.py diff --git a/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py similarity index 100% rename from executor/multiproc_worker_utils.py rename to vllm/executor/multiproc_worker_utils.py diff --git a/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py similarity index 100% rename from executor/ray_distributed_executor.py rename to vllm/executor/ray_distributed_executor.py diff --git a/executor/ray_utils.py b/vllm/executor/ray_utils.py similarity index 100% rename from executor/ray_utils.py rename to vllm/executor/ray_utils.py diff --git a/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py similarity index 100% rename from executor/uniproc_executor.py rename to vllm/executor/uniproc_executor.py diff --git a/forward_context.py b/vllm/forward_context.py similarity index 100% rename from forward_context.py rename to vllm/forward_context.py diff --git a/inputs/__init__.py b/vllm/inputs/__init__.py similarity index 100% rename from inputs/__init__.py rename to vllm/inputs/__init__.py diff --git a/inputs/data.py b/vllm/inputs/data.py similarity index 100% rename from inputs/data.py rename to vllm/inputs/data.py diff --git a/inputs/parse.py b/vllm/inputs/parse.py similarity index 100% rename from inputs/parse.py rename to vllm/inputs/parse.py diff --git a/inputs/preprocess.py b/vllm/inputs/preprocess.py similarity index 100% rename from inputs/preprocess.py rename to vllm/inputs/preprocess.py diff --git a/inputs/registry.py b/vllm/inputs/registry.py similarity index 100% rename from inputs/registry.py rename to vllm/inputs/registry.py diff --git a/jsontree.py b/vllm/jsontree.py similarity index 100% rename from jsontree.py rename to vllm/jsontree.py diff --git a/logger.py b/vllm/logger.py similarity index 100% rename from logger.py rename to vllm/logger.py diff --git a/logging_utils/__init__.py b/vllm/logging_utils/__init__.py similarity index 100% rename from logging_utils/__init__.py rename to vllm/logging_utils/__init__.py diff --git a/logging_utils/dump_input.py b/vllm/logging_utils/dump_input.py similarity index 100% rename from logging_utils/dump_input.py rename to vllm/logging_utils/dump_input.py diff --git a/logging_utils/formatter.py b/vllm/logging_utils/formatter.py similarity index 100% rename from logging_utils/formatter.py rename to vllm/logging_utils/formatter.py diff --git a/logits_process.py b/vllm/logits_process.py similarity index 100% rename from logits_process.py rename to vllm/logits_process.py diff --git a/lora/__init__.py b/vllm/lora/__init__.py similarity index 100% rename from lora/__init__.py rename to vllm/lora/__init__.py diff --git a/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py similarity index 100% rename from lora/fully_sharded_layers.py rename to vllm/lora/fully_sharded_layers.py diff --git a/lora/layers.py b/vllm/lora/layers.py similarity index 100% rename from lora/layers.py rename to vllm/lora/layers.py diff --git a/lora/lora.py b/vllm/lora/lora.py similarity index 100% rename from lora/lora.py rename to vllm/lora/lora.py diff --git a/lora/models.py b/vllm/lora/models.py similarity index 100% rename from lora/models.py rename to vllm/lora/models.py diff --git a/lora/ops/__init__.py b/vllm/lora/ops/__init__.py similarity index 100% rename from lora/ops/__init__.py rename to vllm/lora/ops/__init__.py diff --git a/lora/ops/torch_ops/__init__.py b/vllm/lora/ops/torch_ops/__init__.py similarity index 100% rename from lora/ops/torch_ops/__init__.py rename to vllm/lora/ops/torch_ops/__init__.py diff --git a/lora/ops/torch_ops/lora_ops.py b/vllm/lora/ops/torch_ops/lora_ops.py similarity index 100% rename from lora/ops/torch_ops/lora_ops.py rename to vllm/lora/ops/torch_ops/lora_ops.py diff --git a/lora/ops/triton_ops/__init__.py b/vllm/lora/ops/triton_ops/__init__.py similarity index 100% rename from lora/ops/triton_ops/__init__.py rename to vllm/lora/ops/triton_ops/__init__.py diff --git a/lora/ops/triton_ops/kernel_utils.py b/vllm/lora/ops/triton_ops/kernel_utils.py similarity index 100% rename from lora/ops/triton_ops/kernel_utils.py rename to vllm/lora/ops/triton_ops/kernel_utils.py diff --git a/lora/ops/triton_ops/lora_expand_op.py b/vllm/lora/ops/triton_ops/lora_expand_op.py similarity index 100% rename from lora/ops/triton_ops/lora_expand_op.py rename to vllm/lora/ops/triton_ops/lora_expand_op.py diff --git a/lora/ops/triton_ops/lora_kernel_metadata.py b/vllm/lora/ops/triton_ops/lora_kernel_metadata.py similarity index 100% rename from lora/ops/triton_ops/lora_kernel_metadata.py rename to vllm/lora/ops/triton_ops/lora_kernel_metadata.py diff --git a/lora/ops/triton_ops/lora_shrink_op.py b/vllm/lora/ops/triton_ops/lora_shrink_op.py similarity index 100% rename from lora/ops/triton_ops/lora_shrink_op.py rename to vllm/lora/ops/triton_ops/lora_shrink_op.py diff --git a/lora/ops/triton_ops/utils.py b/vllm/lora/ops/triton_ops/utils.py similarity index 100% rename from lora/ops/triton_ops/utils.py rename to vllm/lora/ops/triton_ops/utils.py diff --git a/lora/ops/xla_ops/__init__.py b/vllm/lora/ops/xla_ops/__init__.py similarity index 100% rename from lora/ops/xla_ops/__init__.py rename to vllm/lora/ops/xla_ops/__init__.py diff --git a/lora/ops/xla_ops/lora_ops.py b/vllm/lora/ops/xla_ops/lora_ops.py similarity index 100% rename from lora/ops/xla_ops/lora_ops.py rename to vllm/lora/ops/xla_ops/lora_ops.py diff --git a/lora/peft_helper.py b/vllm/lora/peft_helper.py similarity index 100% rename from lora/peft_helper.py rename to vllm/lora/peft_helper.py diff --git a/lora/punica_wrapper/__init__.py b/vllm/lora/punica_wrapper/__init__.py similarity index 100% rename from lora/punica_wrapper/__init__.py rename to vllm/lora/punica_wrapper/__init__.py diff --git a/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py similarity index 100% rename from lora/punica_wrapper/punica_base.py rename to vllm/lora/punica_wrapper/punica_base.py diff --git a/lora/punica_wrapper/punica_cpu.py b/vllm/lora/punica_wrapper/punica_cpu.py similarity index 100% rename from lora/punica_wrapper/punica_cpu.py rename to vllm/lora/punica_wrapper/punica_cpu.py diff --git a/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py similarity index 100% rename from lora/punica_wrapper/punica_gpu.py rename to vllm/lora/punica_wrapper/punica_gpu.py diff --git a/lora/punica_wrapper/punica_hpu.py b/vllm/lora/punica_wrapper/punica_hpu.py similarity index 100% rename from lora/punica_wrapper/punica_hpu.py rename to vllm/lora/punica_wrapper/punica_hpu.py diff --git a/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py similarity index 100% rename from lora/punica_wrapper/punica_selector.py rename to vllm/lora/punica_wrapper/punica_selector.py diff --git a/lora/punica_wrapper/punica_tpu.py b/vllm/lora/punica_wrapper/punica_tpu.py similarity index 100% rename from lora/punica_wrapper/punica_tpu.py rename to vllm/lora/punica_wrapper/punica_tpu.py diff --git a/lora/punica_wrapper/utils.py b/vllm/lora/punica_wrapper/utils.py similarity index 100% rename from lora/punica_wrapper/utils.py rename to vllm/lora/punica_wrapper/utils.py diff --git a/lora/request.py b/vllm/lora/request.py similarity index 100% rename from lora/request.py rename to vllm/lora/request.py diff --git a/lora/resolver.py b/vllm/lora/resolver.py similarity index 100% rename from lora/resolver.py rename to vllm/lora/resolver.py diff --git a/lora/utils.py b/vllm/lora/utils.py similarity index 100% rename from lora/utils.py rename to vllm/lora/utils.py diff --git a/lora/worker_manager.py b/vllm/lora/worker_manager.py similarity index 100% rename from lora/worker_manager.py rename to vllm/lora/worker_manager.py diff --git a/model_executor/__init__.py b/vllm/model_executor/__init__.py similarity index 100% rename from model_executor/__init__.py rename to vllm/model_executor/__init__.py diff --git a/model_executor/custom_op.py b/vllm/model_executor/custom_op.py similarity index 100% rename from model_executor/custom_op.py rename to vllm/model_executor/custom_op.py diff --git a/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py similarity index 100% rename from model_executor/guided_decoding/__init__.py rename to vllm/model_executor/guided_decoding/__init__.py diff --git a/model_executor/guided_decoding/guidance_decoding.py b/vllm/model_executor/guided_decoding/guidance_decoding.py similarity index 100% rename from model_executor/guided_decoding/guidance_decoding.py rename to vllm/model_executor/guided_decoding/guidance_decoding.py diff --git a/model_executor/guided_decoding/guidance_logits_processors.py b/vllm/model_executor/guided_decoding/guidance_logits_processors.py similarity index 100% rename from model_executor/guided_decoding/guidance_logits_processors.py rename to vllm/model_executor/guided_decoding/guidance_logits_processors.py diff --git a/model_executor/guided_decoding/guided_fields.py b/vllm/model_executor/guided_decoding/guided_fields.py similarity index 100% rename from model_executor/guided_decoding/guided_fields.py rename to vllm/model_executor/guided_decoding/guided_fields.py diff --git a/model_executor/guided_decoding/lm_format_enforcer_decoding.py b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py similarity index 100% rename from model_executor/guided_decoding/lm_format_enforcer_decoding.py rename to vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py diff --git a/model_executor/guided_decoding/outlines_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py similarity index 100% rename from model_executor/guided_decoding/outlines_decoding.py rename to vllm/model_executor/guided_decoding/outlines_decoding.py diff --git a/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py similarity index 100% rename from model_executor/guided_decoding/outlines_logits_processors.py rename to vllm/model_executor/guided_decoding/outlines_logits_processors.py diff --git a/model_executor/guided_decoding/utils.py b/vllm/model_executor/guided_decoding/utils.py similarity index 100% rename from model_executor/guided_decoding/utils.py rename to vllm/model_executor/guided_decoding/utils.py diff --git a/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py similarity index 100% rename from model_executor/guided_decoding/xgrammar_decoding.py rename to vllm/model_executor/guided_decoding/xgrammar_decoding.py diff --git a/model_executor/layers/__init__.py b/vllm/model_executor/layers/__init__.py similarity index 100% rename from model_executor/layers/__init__.py rename to vllm/model_executor/layers/__init__.py diff --git a/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py similarity index 100% rename from model_executor/layers/activation.py rename to vllm/model_executor/layers/activation.py diff --git a/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py similarity index 100% rename from model_executor/layers/fused_moe/__init__.py rename to vllm/model_executor/layers/fused_moe/__init__.py diff --git a/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py similarity index 100% rename from model_executor/layers/fused_moe/batched_deep_gemm_moe.py rename to vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py diff --git a/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py similarity index 100% rename from model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py rename to vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py diff --git a/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json rename to vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json diff --git a/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json rename to vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json rename to vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json diff --git a/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json rename to vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json rename to vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json diff --git a/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json rename to vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json diff --git a/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json rename to vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json rename to vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json diff --git a/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json rename to vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json rename to vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json diff --git a/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json rename to vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json rename to vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json diff --git a/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json rename to vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json rename to vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json rename to vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json diff --git a/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json rename to vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json diff --git a/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json rename to vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json diff --git a/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json rename to vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json diff --git a/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json rename to vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json diff --git a/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json rename to vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json diff --git a/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json rename to vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json rename to vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json diff --git a/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json rename to vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json diff --git a/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json rename to vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json diff --git a/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json rename to vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json diff --git a/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json rename to vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json diff --git a/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json rename to vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json diff --git a/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json rename to vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json rename to vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json rename to vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json diff --git a/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json rename to vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json rename to vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json diff --git a/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json rename to vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=Device_4000.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=Device_4000.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=Device_4000.json rename to vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=Device_4000.json diff --git a/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=Device_4001.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=Device_4001.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=Device_4001.json rename to vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=Device_4001.json diff --git a/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json rename to vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json rename to vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json rename to vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json diff --git a/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json rename to vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json diff --git a/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json rename to vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json diff --git a/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json rename to vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/model_executor/layers/fused_moe/configs/E=16,N=4096,device_name=Device_4000.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=4096,device_name=Device_4000.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=16,N=4096,device_name=Device_4000.json rename to vllm/model_executor/layers/fused_moe/configs/E=16,N=4096,device_name=Device_4000.json diff --git a/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json rename to vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json diff --git a/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json rename to vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json rename to vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json diff --git a/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=Device_4000,dtype=int8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=Device_4000,dtype=int8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=160,N=192,device_name=Device_4000,dtype=int8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=Device_4000,dtype=int8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=Device_4000.json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=Device_4000.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=160,N=192,device_name=Device_4000.json rename to vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=Device_4000.json diff --git a/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=Device_4001.json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=Device_4001.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=160,N=192,device_name=Device_4001.json rename to vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=Device_4001.json diff --git a/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json rename to vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json diff --git a/model_executor/layers/fused_moe/configs/E=160,N=96,device_name=Device_4000.json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=96,device_name=Device_4000.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=160,N=96,device_name=Device_4000.json rename to vllm/model_executor/layers/fused_moe/configs/E=160,N=96,device_name=Device_4000.json diff --git a/model_executor/layers/fused_moe/configs/E=160,N=96,device_name=Device_4001.json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=96,device_name=Device_4001.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=160,N=96,device_name=Device_4001.json rename to vllm/model_executor/layers/fused_moe/configs/E=160,N=96,device_name=Device_4001.json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=Device_4000,dtype=int4_w4a16.json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=Device_4000,dtype=int4_w4a16.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=256,device_name=Device_4000,dtype=int4_w4a16.json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=Device_4000,dtype=int4_w4a16.json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=Device_4000,dtype=int4_w8a16.json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=Device_4000,dtype=int4_w8a16.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=256,device_name=Device_4000,dtype=int4_w8a16.json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=Device_4000,dtype=int4_w8a16.json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=Device_4000,dtype=int8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=Device_4000,dtype=int8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=256,device_name=Device_4000,dtype=int8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=Device_4000,dtype=int8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=Device_4000.json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=Device_4000.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=256,device_name=Device_4000.json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=Device_4000.json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=Device_4000,dtype=int8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=Device_4000,dtype=int8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=512,device_name=Device_4000,dtype=int8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=Device_4000,dtype=int8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=Device_4000.json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=Device_4000.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=512,device_name=Device_4000.json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=Device_4000.json diff --git a/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json diff --git a/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json rename to vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json diff --git a/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json rename to vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json diff --git a/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json rename to vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json diff --git a/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json rename to vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json diff --git a/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json rename to vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json rename to vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json diff --git a/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json rename to vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json rename to vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json diff --git a/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json rename to vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json diff --git a/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json rename to vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json rename to vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json diff --git a/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=Device_4000.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=Device_4000.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=64,N=640,device_name=Device_4000.json rename to vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=Device_4000.json diff --git a/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json rename to vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json rename to vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json diff --git a/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json rename to vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json rename to vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json diff --git a/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json rename to vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=Device_4000,dtype=int8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=Device_4000,dtype=int8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=Device_4000,dtype=int8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=Device_4000,dtype=int8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=Device_4000.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=Device_4000.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=Device_4000.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=Device_4000.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=Device_4000,dtype=int8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=Device_4000,dtype=int8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=Device_4000,dtype=int8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=Device_4000,dtype=int8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=Device_4000.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=Device_4000.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=Device_4000.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=Device_4000.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=Device_4001.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=Device_4001.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=Device_4001.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=Device_4001.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=Device_4000,dtype=int8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=Device_4000,dtype=int8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=Device_4000,dtype=int8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=Device_4000,dtype=int8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=Device_4000.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=Device_4000.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=Device_4000.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=Device_4000.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=Device_4001.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=Device_4001.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=Device_4001.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=Device_4001.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=Device_4000,dtype=int8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=Device_4000,dtype=int8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=Device_4000,dtype=int8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=Device_4000,dtype=int8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=Device_4000.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=Device_4000.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=Device_4000.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=Device_4000.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=Device_4001.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=Device_4001.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=Device_4001.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=Device_4001.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/H=2048,E=128,N=192,device_name=Device_4000.json b/vllm/model_executor/layers/fused_moe/configs/H=2048,E=128,N=192,device_name=Device_4000.json similarity index 100% rename from model_executor/layers/fused_moe/configs/H=2048,E=128,N=192,device_name=Device_4000.json rename to vllm/model_executor/layers/fused_moe/configs/H=2048,E=128,N=192,device_name=Device_4000.json diff --git a/model_executor/layers/fused_moe/configs/H=2048,E=128,N=384,device_name=Device_4000,dtype=int8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/H=2048,E=128,N=384,device_name=Device_4000,dtype=int8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/H=2048,E=128,N=384,device_name=Device_4000,dtype=int8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/H=2048,E=128,N=384,device_name=Device_4000,dtype=int8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/H=2048,E=128,N=384,device_name=Device_4000.json b/vllm/model_executor/layers/fused_moe/configs/H=2048,E=128,N=384,device_name=Device_4000.json similarity index 100% rename from model_executor/layers/fused_moe/configs/H=2048,E=128,N=384,device_name=Device_4000.json rename to vllm/model_executor/layers/fused_moe/configs/H=2048,E=128,N=384,device_name=Device_4000.json diff --git a/model_executor/layers/fused_moe/configs/H=4096,E=128,N=192,device_name=Device_4000,dtype=int8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/H=4096,E=128,N=192,device_name=Device_4000,dtype=int8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/H=4096,E=128,N=192,device_name=Device_4000,dtype=int8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/H=4096,E=128,N=192,device_name=Device_4000,dtype=int8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/H=4096,E=128,N=192,device_name=Device_4000.json b/vllm/model_executor/layers/fused_moe/configs/H=4096,E=128,N=192,device_name=Device_4000.json similarity index 100% rename from model_executor/layers/fused_moe/configs/H=4096,E=128,N=192,device_name=Device_4000.json rename to vllm/model_executor/layers/fused_moe/configs/H=4096,E=128,N=192,device_name=Device_4000.json diff --git a/model_executor/layers/fused_moe/configs/H=4096,E=128,N=384,device_name=Device_4000,dtype=int8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/H=4096,E=128,N=384,device_name=Device_4000,dtype=int8_w8a8.json similarity index 100% rename from model_executor/layers/fused_moe/configs/H=4096,E=128,N=384,device_name=Device_4000,dtype=int8_w8a8.json rename to vllm/model_executor/layers/fused_moe/configs/H=4096,E=128,N=384,device_name=Device_4000,dtype=int8_w8a8.json diff --git a/model_executor/layers/fused_moe/configs/H=4096,E=128,N=384,device_name=Device_4000.json b/vllm/model_executor/layers/fused_moe/configs/H=4096,E=128,N=384,device_name=Device_4000.json similarity index 100% rename from model_executor/layers/fused_moe/configs/H=4096,E=128,N=384,device_name=Device_4000.json rename to vllm/model_executor/layers/fused_moe/configs/H=4096,E=128,N=384,device_name=Device_4000.json diff --git a/model_executor/layers/fused_moe/configs/README b/vllm/model_executor/layers/fused_moe/configs/README similarity index 100% rename from model_executor/layers/fused_moe/configs/README rename to vllm/model_executor/layers/fused_moe/configs/README diff --git a/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py similarity index 100% rename from model_executor/layers/fused_moe/cutlass_moe.py rename to vllm/model_executor/layers/fused_moe/cutlass_moe.py diff --git a/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py similarity index 100% rename from model_executor/layers/fused_moe/deep_gemm_moe.py rename to vllm/model_executor/layers/fused_moe/deep_gemm_moe.py diff --git a/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py similarity index 100% rename from model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py rename to vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py diff --git a/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py similarity index 100% rename from model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py rename to vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py diff --git a/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py similarity index 100% rename from model_executor/layers/fused_moe/fused_batched_moe.py rename to vllm/model_executor/layers/fused_moe/fused_batched_moe.py diff --git a/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py similarity index 100% rename from model_executor/layers/fused_moe/fused_marlin_moe.py rename to vllm/model_executor/layers/fused_moe/fused_marlin_moe.py diff --git a/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py similarity index 95% rename from model_executor/layers/fused_moe/fused_moe.py rename to vllm/model_executor/layers/fused_moe/fused_moe.py index 9d74c68..a16b66a 100644 --- a/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -275,6 +275,7 @@ def fused_moe_kernel( a_ptr, b_ptr, c_ptr, + b_bias_ptr, a_scale_ptr, b_scale_ptr, topk_weights_ptr, @@ -303,6 +304,8 @@ def fused_moe_kernel( stride_bse, stride_bsk, stride_bsn, + stride_bbe, # bias expert stride + stride_bbn, # bias N stride # Block size for block-wise quantization group_n: tl.constexpr, group_k: tl.constexpr, @@ -321,6 +324,7 @@ def fused_moe_kernel( use_int8_w8a8: tl.constexpr, use_int8_w8a16: tl.constexpr, per_channel_quant: tl.constexpr, + HAS_BIAS: tl.constexpr, UPGRADE: tl.constexpr, UPGRADE_A_OFFS: tl.constexpr, UPGRADE_B_OFFS: tl.constexpr, @@ -447,6 +451,10 @@ def fused_moe_kernel( else: a_scale = tl.load(a_scale_ptr) b_scale = tl.load(b_scale_ptr + off_experts) + if HAS_BIAS: + # bias shape: [num_experts, N] + bias_ptrs = b_bias_ptr + off_experts * stride_bbe + offs_bn * stride_bbn + bias = tl.load(bias_ptrs, mask=(offs_bn < N), other=0.0) # ----------------------------------------------------------- # Iterate to compute a block of the C matrix. @@ -494,7 +502,8 @@ def fused_moe_kernel( # Advance the ptrs to the next K block. a_ptrs += BLOCK_SIZE_K * stride_ak * SPLIT_K b_ptrs += BLOCK_SIZE_K * stride_bk * SPLIT_K - + if HAS_BIAS: + accumulator = accumulator + bias[None, :] if MUL_ROUTED_WEIGHT: moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, @@ -548,7 +557,8 @@ def invoke_fused_moe_kernel(A: torch.Tensor, use_int4_w4a16: bool, orig_acc_dtype: torch.dtype, per_channel_quant: bool, - block_shape: Optional[list[int]] = None) -> None: + block_shape: Optional[list[int]] = None, + B_bias: Optional[torch.Tensor] = None) -> None: assert topk_weights is not None or not mul_routed_weight assert topk_weights is None or topk_weights.stride(1) == 1 assert sorted_token_ids.stride(0) == 1 @@ -580,7 +590,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor, A.shape[0] * top_k * config['BLOCK_SIZE_M']) grid = lambda META: (triton.cdiv(EM, META['BLOCK_SIZE_M']) * triton.cdiv( B.shape[1], META['BLOCK_SIZE_N']), META['SPLIT_K']) - + HAS_BIAS = B_bias is not None if (use_int8_w8a16 or use_int4_w4a16) and \ block_shape is not None and block_shape[1] > 0: assert B_scale is not None and B_scale.ndim == 3 @@ -592,19 +602,19 @@ def invoke_fused_moe_kernel(A: torch.Tensor, num_experts=B.shape[0], bit=4 if use_int4_w4a16 else 8) # TODO: missing config for BLOCK_SIZE_K - # config = config.copy() - # config.update( - # get_moe_wna16_block_config(config=config, - # use_moe_wna16_cuda=use_moe_wna16_cuda, - # num_valid_tokens=num_tokens, - # size_k=A.shape[1], - # size_n=B.shape[1], - # num_experts=B.shape[1], - # group_size=block_shape[1], - # real_top_k=top_k, - # block_size_m=config["BLOCK_SIZE_M"])) + config = config.copy() + config.update( + get_moe_wna16_block_config(config=config, + use_moe_wna16_cuda=use_moe_wna16_cuda, + num_valid_tokens=num_tokens, + size_k=A.shape[1], + size_n=B.shape[1], + num_experts=B.shape[1], + group_size=block_shape[1], + real_top_k=top_k, + block_size_m=config["BLOCK_SIZE_M"])) - if False and use_moe_wna16_cuda: + if use_moe_wna16_cuda: bit = 4 if use_int4_w4a16 else 8 ops.moe_wna16_gemm(A, C, B, B_scale, B_zp, topk_weights if mul_routed_weight else None, @@ -661,6 +671,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor, A, B, C, + B_bias, A_scale, B_scale, topk_weights, @@ -689,6 +700,8 @@ def invoke_fused_moe_kernel(A: torch.Tensor, if B_scale is not None and B_scale.ndim == 3 else 0, B_scale.stride(1) if B_scale is not None and B_scale.ndim >= 2 else 0, + B_bias.stride(0) if B_bias is not None else 0, + B_bias.stride(1) if B_bias is not None else 0, 0 if block_shape is None else block_shape[0], 0 if block_shape is None else block_shape[1], MUL_ROUTED_WEIGHT=mul_routed_weight, @@ -699,6 +712,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor, use_int8_w8a8=use_int8_w8a8, use_int8_w8a16=use_int8_w8a16, per_channel_quant=per_channel_quant, + HAS_BIAS=HAS_BIAS, BLOCK_SIZE_K=BLOCK_SIZE_K, FAST_F32_TO_BF16 = True, **config, @@ -1103,13 +1117,15 @@ def inplace_fused_experts(hidden_states: torch.Tensor, w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[List[int]] = None) -> None: + block_shape: Optional[List[int]] = None, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None) -> None: fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True, activation, apply_router_weight_on_input, use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, use_int4_w4a16, per_channel_quant, global_num_experts, expert_map, w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, a2_scale, - block_shape) + block_shape, w1_bias, w2_bias) def inplace_fused_experts_fake( @@ -1133,7 +1149,9 @@ def inplace_fused_experts_fake( w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[List[int]] = None) -> None: + block_shape: Optional[List[int]] = None, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None) -> None: pass @@ -1167,14 +1185,16 @@ def outplace_fused_experts( w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[List[int]] = None) -> torch.Tensor: + block_shape: Optional[List[int]] = None, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None) -> torch.Tensor: return fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, False, activation, apply_router_weight_on_input, use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, use_int4_w4a16, per_channel_quant, global_num_experts, expert_map, w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, a2_scale, - block_shape) + block_shape, w1_bias, w2_bias) def outplace_fused_experts_fake( @@ -1197,7 +1217,9 @@ def outplace_fused_experts_fake( w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[List[int]] = None) -> torch.Tensor: + block_shape: Optional[List[int]] = None, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None) -> torch.Tensor: return torch.empty_like(hidden_states) @@ -1248,7 +1270,9 @@ def fused_experts(hidden_states: torch.Tensor, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None, - allow_deep_gemm: bool = False) -> torch.Tensor: + allow_deep_gemm: bool = False, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None) -> torch.Tensor: # For now, disable DeepGemm for small N (<= 512) until better # permute/unpermute ops are available. N = w1.shape[1] @@ -1293,7 +1317,10 @@ def fused_experts(hidden_states: torch.Tensor, w2_zp=w2_zp, a1_scale=a1_scale, a2_scale=a2_scale, - block_shape=block_shape) + block_shape=block_shape, + w1_bias=w1_bias, + w2_bias=w2_bias, + ) def fused_experts_impl( @@ -1319,6 +1346,8 @@ def fused_experts_impl( a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: # Check constraints. if use_int4_w4a16: @@ -1498,7 +1527,19 @@ def fused_experts_impl( use_int4_w4a16=use_int4_w4a16, orig_acc_dtype=hidden_states.dtype, per_channel_quant=per_channel_quant, - block_shape=block_shape) + block_shape=block_shape, + B_bias=w1_bias) + + # TODO fused kernel + def swiglu_oai(gate_up): + alpha = 1.702 + limit = 7.0 + gate, up = gate_up[..., ::2], gate_up[..., 1::2] + gate = gate.clamp(min=None, max=limit) + up = up.clamp(min=-limit, max=limit) + glu = gate * torch.sigmoid(gate * alpha) + gated_output = (up + 1) * glu + return gated_output if activation == "silu": torch.ops._C.silu_and_mul(intermediate_cache2, @@ -1506,6 +1547,8 @@ def fused_experts_impl( elif activation == "gelu": torch.ops._C.gelu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) + elif activation == "swiglu_oai": + intermediate_cache2 = swiglu_oai(intermediate_cache1.view(-1, N)) else: raise ValueError(f"Unsupported FusedMoe activation: {activation}") @@ -1543,7 +1586,8 @@ def fused_experts_impl( use_int4_w4a16=use_int4_w4a16, orig_acc_dtype=hidden_states.dtype, per_channel_quant=per_channel_quant, - block_shape=block_shape) + block_shape=block_shape, + B_bias=w2_bias) ops.moe_sum(intermediate_cache3.view(*intermediate_cache3.shape), out_hidden_states[begin_chunk_idx:end_chunk_idx]) @@ -1578,6 +1622,8 @@ def fused_moe( a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: """ This function computes a Mixture of Experts (MoE) layer using two sets of @@ -1661,7 +1707,9 @@ def fused_moe( w2_zp=w2_zp, a1_scale=a1_scale, a2_scale=a2_scale, - block_shape=block_shape) + block_shape=block_shape, + w1_bias=w1_bias, + w2_bias=w2_bias) class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): @@ -1805,7 +1853,9 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): use_int8_w8a16=self.use_int8_w8a16, use_int4_w4a16=self.use_int4_w4a16, per_channel_quant=self.per_channel_quant, - block_shape=self.block_shape) + block_shape=self.block_shape, + B_bias=None # TODO support B_bias + ) self.activation(activation, intermediate_cache2, intermediate_cache1.view(-1, N)) @@ -1835,7 +1885,9 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): use_int8_w8a16=self.use_int8_w8a16, use_int4_w4a16=self.use_int4_w4a16, per_channel_quant=self.per_channel_quant, - block_shape=self.block_shape) + block_shape=self.block_shape, + B_bias=None # TODO support B_bias + ) return intermediate_cache3 diff --git a/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py similarity index 98% rename from model_executor/layers/fused_moe/layer.py rename to vllm/model_executor/layers/fused_moe/layer.py index 074e690..337e0c0 100644 --- a/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -226,6 +226,8 @@ class MoEConfig: max_num_tokens: int = MOE_DP_CHUNK_SIZE + has_bias: bool = False + @property def tp_size(self): return self.moe_parallel_config.tp_size @@ -443,6 +445,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): self.fused_experts = fused_experts # type: ignore self.topk_indices_dtype = None self.moe = moe + self.has_bias = self.moe.has_bias self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled() if self.rocm_aiter_moe_enabled: @@ -502,6 +505,14 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): requires_grad=False) layer.register_parameter("w13_weight", w13_weight) set_weight_attrs(w13_weight, extra_weight_attrs) + if self.has_bias: + w13_bias = torch.nn.Parameter(torch.zeros( + num_experts, + 2 * intermediate_size_per_partition, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w13_bias", w13_bias) + set_weight_attrs(w13_bias, extra_weight_attrs) # down_proj (row parallel) w2_weight = torch.nn.Parameter(torch.empty( @@ -512,6 +523,13 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): requires_grad=False) layer.register_parameter("w2_weight", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) + if self.has_bias: + w2_bias = torch.nn.Parameter(torch.zeros(num_experts, + hidden_size, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w2_bias", w2_bias) + set_weight_attrs(w2_bias, extra_weight_attrs) def _maybe_pad_weight(self, weight: torch.Tensor) -> torch.Tensor: # Pad the weight tensor. This is an optimization on ROCm platform, which @@ -634,6 +652,8 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, + w1_bias=layer.w13_bias if self.has_bias else None, + w2_bias=layer.w2_bias if self.has_bias else None, topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, @@ -840,6 +860,7 @@ class FusedMoE(torch.nn.Module): e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, activation: str = "silu", + has_bias: bool = False, ): super().__init__() if params_dtype is None: @@ -920,6 +941,7 @@ class FusedMoE(torch.nn.Module): in_dtype=params_dtype, quant_dtype=quant_dtype, max_num_tokens=MOE_DP_CHUNK_SIZE, + has_bias=has_bias, ) self.moe_config = moe self.quant_config = quant_config diff --git a/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py similarity index 100% rename from model_executor/layers/fused_moe/modular_kernel.py rename to vllm/model_executor/layers/fused_moe/modular_kernel.py diff --git a/model_executor/layers/fused_moe/moe_align_block_size.py b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py similarity index 100% rename from model_executor/layers/fused_moe/moe_align_block_size.py rename to vllm/model_executor/layers/fused_moe/moe_align_block_size.py diff --git a/model_executor/layers/fused_moe/moe_pallas.py b/vllm/model_executor/layers/fused_moe/moe_pallas.py similarity index 100% rename from model_executor/layers/fused_moe/moe_pallas.py rename to vllm/model_executor/layers/fused_moe/moe_pallas.py diff --git a/model_executor/layers/fused_moe/moe_permute_unpermute.py b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py similarity index 100% rename from model_executor/layers/fused_moe/moe_permute_unpermute.py rename to vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py diff --git a/model_executor/layers/fused_moe/moe_torch_iterative.py b/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py similarity index 100% rename from model_executor/layers/fused_moe/moe_torch_iterative.py rename to vllm/model_executor/layers/fused_moe/moe_torch_iterative.py diff --git a/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py similarity index 100% rename from model_executor/layers/fused_moe/pplx_prepare_finalize.py rename to vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py diff --git a/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py similarity index 100% rename from model_executor/layers/fused_moe/prepare_finalize.py rename to vllm/model_executor/layers/fused_moe/prepare_finalize.py diff --git a/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py similarity index 100% rename from model_executor/layers/fused_moe/rocm_aiter_fused_moe.py rename to vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py diff --git a/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py similarity index 100% rename from model_executor/layers/fused_moe/triton_deep_gemm_moe.py rename to vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py diff --git a/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py similarity index 100% rename from model_executor/layers/fused_moe/utils.py rename to vllm/model_executor/layers/fused_moe/utils.py diff --git a/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py similarity index 100% rename from model_executor/layers/layernorm.py rename to vllm/model_executor/layers/layernorm.py diff --git a/model_executor/layers/lightning_attn.py b/vllm/model_executor/layers/lightning_attn.py similarity index 100% rename from model_executor/layers/lightning_attn.py rename to vllm/model_executor/layers/lightning_attn.py diff --git a/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py similarity index 100% rename from model_executor/layers/linear.py rename to vllm/model_executor/layers/linear.py diff --git a/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py similarity index 100% rename from model_executor/layers/logits_processor.py rename to vllm/model_executor/layers/logits_processor.py diff --git a/model_executor/layers/mamba/__init__.py b/vllm/model_executor/layers/mamba/__init__.py similarity index 100% rename from model_executor/layers/mamba/__init__.py rename to vllm/model_executor/layers/mamba/__init__.py diff --git a/model_executor/layers/mamba/mamba2_metadata.py b/vllm/model_executor/layers/mamba/mamba2_metadata.py similarity index 100% rename from model_executor/layers/mamba/mamba2_metadata.py rename to vllm/model_executor/layers/mamba/mamba2_metadata.py diff --git a/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py similarity index 100% rename from model_executor/layers/mamba/mamba_mixer.py rename to vllm/model_executor/layers/mamba/mamba_mixer.py diff --git a/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py similarity index 100% rename from model_executor/layers/mamba/mamba_mixer2.py rename to vllm/model_executor/layers/mamba/mamba_mixer2.py diff --git a/model_executor/layers/mamba/ops/__init__.py b/vllm/model_executor/layers/mamba/ops/__init__.py similarity index 100% rename from model_executor/layers/mamba/ops/__init__.py rename to vllm/model_executor/layers/mamba/ops/__init__.py diff --git a/model_executor/layers/mamba/ops/causal_conv1d.py b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py similarity index 100% rename from model_executor/layers/mamba/ops/causal_conv1d.py rename to vllm/model_executor/layers/mamba/ops/causal_conv1d.py diff --git a/model_executor/layers/mamba/ops/mamba_ssm.py b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py similarity index 100% rename from model_executor/layers/mamba/ops/mamba_ssm.py rename to vllm/model_executor/layers/mamba/ops/mamba_ssm.py diff --git a/model_executor/layers/mamba/ops/ssd_bmm.py b/vllm/model_executor/layers/mamba/ops/ssd_bmm.py similarity index 100% rename from model_executor/layers/mamba/ops/ssd_bmm.py rename to vllm/model_executor/layers/mamba/ops/ssd_bmm.py diff --git a/model_executor/layers/mamba/ops/ssd_chunk_scan.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py similarity index 100% rename from model_executor/layers/mamba/ops/ssd_chunk_scan.py rename to vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py diff --git a/model_executor/layers/mamba/ops/ssd_chunk_state.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py similarity index 100% rename from model_executor/layers/mamba/ops/ssd_chunk_state.py rename to vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py diff --git a/model_executor/layers/mamba/ops/ssd_combined.py b/vllm/model_executor/layers/mamba/ops/ssd_combined.py similarity index 100% rename from model_executor/layers/mamba/ops/ssd_combined.py rename to vllm/model_executor/layers/mamba/ops/ssd_combined.py diff --git a/model_executor/layers/mamba/ops/ssd_state_passing.py b/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py similarity index 100% rename from model_executor/layers/mamba/ops/ssd_state_passing.py rename to vllm/model_executor/layers/mamba/ops/ssd_state_passing.py diff --git a/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py similarity index 100% rename from model_executor/layers/pooler.py rename to vllm/model_executor/layers/pooler.py diff --git a/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py similarity index 100% rename from model_executor/layers/quantization/__init__.py rename to vllm/model_executor/layers/quantization/__init__.py diff --git a/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py similarity index 100% rename from model_executor/layers/quantization/aqlm.py rename to vllm/model_executor/layers/quantization/aqlm.py diff --git a/model_executor/layers/quantization/auto_round.py b/vllm/model_executor/layers/quantization/auto_round.py similarity index 100% rename from model_executor/layers/quantization/auto_round.py rename to vllm/model_executor/layers/quantization/auto_round.py diff --git a/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py similarity index 100% rename from model_executor/layers/quantization/awq.py rename to vllm/model_executor/layers/quantization/awq.py diff --git a/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py similarity index 100% rename from model_executor/layers/quantization/awq_marlin.py rename to vllm/model_executor/layers/quantization/awq_marlin.py diff --git a/model_executor/layers/quantization/awq_triton.py b/vllm/model_executor/layers/quantization/awq_triton.py similarity index 100% rename from model_executor/layers/quantization/awq_triton.py rename to vllm/model_executor/layers/quantization/awq_triton.py diff --git a/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py similarity index 100% rename from model_executor/layers/quantization/base_config.py rename to vllm/model_executor/layers/quantization/base_config.py diff --git a/model_executor/layers/quantization/bitblas.py b/vllm/model_executor/layers/quantization/bitblas.py similarity index 100% rename from model_executor/layers/quantization/bitblas.py rename to vllm/model_executor/layers/quantization/bitblas.py diff --git a/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py similarity index 100% rename from model_executor/layers/quantization/bitsandbytes.py rename to vllm/model_executor/layers/quantization/bitsandbytes.py diff --git a/model_executor/layers/quantization/compressed_tensors/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py similarity index 100% rename from model_executor/layers/quantization/compressed_tensors/__init__.py rename to vllm/model_executor/layers/quantization/compressed_tensors/__init__.py diff --git a/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py similarity index 100% rename from model_executor/layers/quantization/compressed_tensors/compressed_tensors.py rename to vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py diff --git a/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py similarity index 100% rename from model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py rename to vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py similarity index 100% rename from model_executor/layers/quantization/compressed_tensors/schemes/__init__.py rename to vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py similarity index 100% rename from model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py rename to vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py similarity index 100% rename from model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py rename to vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py similarity index 100% rename from model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py rename to vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py similarity index 100% rename from model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py rename to vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py similarity index 100% rename from model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py rename to vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py similarity index 100% rename from model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py rename to vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py similarity index 100% rename from model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py rename to vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py similarity index 100% rename from model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py rename to vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py similarity index 100% rename from model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py rename to vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py diff --git a/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py similarity index 100% rename from model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py rename to vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py diff --git a/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py similarity index 100% rename from model_executor/layers/quantization/compressed_tensors/utils.py rename to vllm/model_executor/layers/quantization/compressed_tensors/utils.py diff --git a/model_executor/layers/quantization/deepspeedfp.py b/vllm/model_executor/layers/quantization/deepspeedfp.py similarity index 100% rename from model_executor/layers/quantization/deepspeedfp.py rename to vllm/model_executor/layers/quantization/deepspeedfp.py diff --git a/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py similarity index 100% rename from model_executor/layers/quantization/experts_int8.py rename to vllm/model_executor/layers/quantization/experts_int8.py diff --git a/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py similarity index 100% rename from model_executor/layers/quantization/fbgemm_fp8.py rename to vllm/model_executor/layers/quantization/fbgemm_fp8.py diff --git a/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py similarity index 100% rename from model_executor/layers/quantization/fp8.py rename to vllm/model_executor/layers/quantization/fp8.py diff --git a/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py similarity index 100% rename from model_executor/layers/quantization/gguf.py rename to vllm/model_executor/layers/quantization/gguf.py diff --git a/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py similarity index 100% rename from model_executor/layers/quantization/gptq.py rename to vllm/model_executor/layers/quantization/gptq.py diff --git a/model_executor/layers/quantization/gptq_bitblas.py b/vllm/model_executor/layers/quantization/gptq_bitblas.py similarity index 100% rename from model_executor/layers/quantization/gptq_bitblas.py rename to vllm/model_executor/layers/quantization/gptq_bitblas.py diff --git a/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py similarity index 100% rename from model_executor/layers/quantization/gptq_marlin.py rename to vllm/model_executor/layers/quantization/gptq_marlin.py diff --git a/model_executor/layers/quantization/gptq_marlin_24.py b/vllm/model_executor/layers/quantization/gptq_marlin_24.py similarity index 100% rename from model_executor/layers/quantization/gptq_marlin_24.py rename to vllm/model_executor/layers/quantization/gptq_marlin_24.py diff --git a/model_executor/layers/quantization/hqq_marlin.py b/vllm/model_executor/layers/quantization/hqq_marlin.py similarity index 100% rename from model_executor/layers/quantization/hqq_marlin.py rename to vllm/model_executor/layers/quantization/hqq_marlin.py diff --git a/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py similarity index 100% rename from model_executor/layers/quantization/ipex_quant.py rename to vllm/model_executor/layers/quantization/ipex_quant.py diff --git a/model_executor/layers/quantization/kernels/__init__.py b/vllm/model_executor/layers/quantization/kernels/__init__.py similarity index 100% rename from model_executor/layers/quantization/kernels/__init__.py rename to vllm/model_executor/layers/quantization/kernels/__init__.py diff --git a/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py similarity index 100% rename from model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py rename to vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py diff --git a/model_executor/layers/quantization/kernels/mixed_precision/__init__.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py similarity index 100% rename from model_executor/layers/quantization/kernels/mixed_precision/__init__.py rename to vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py diff --git a/model_executor/layers/quantization/kernels/mixed_precision/allspark.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py similarity index 100% rename from model_executor/layers/quantization/kernels/mixed_precision/allspark.py rename to vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py diff --git a/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py similarity index 100% rename from model_executor/layers/quantization/kernels/mixed_precision/bitblas.py rename to vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py diff --git a/model_executor/layers/quantization/kernels/mixed_precision/exllama.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py similarity index 100% rename from model_executor/layers/quantization/kernels/mixed_precision/exllama.py rename to vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py diff --git a/model_executor/layers/quantization/kernels/mixed_precision/machete.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py similarity index 100% rename from model_executor/layers/quantization/kernels/mixed_precision/machete.py rename to vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py diff --git a/model_executor/layers/quantization/kernels/mixed_precision/marlin.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py similarity index 100% rename from model_executor/layers/quantization/kernels/mixed_precision/marlin.py rename to vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py diff --git a/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py similarity index 100% rename from model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py rename to vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py diff --git a/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py similarity index 100% rename from model_executor/layers/quantization/kernels/scaled_mm/__init__.py rename to vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py diff --git a/model_executor/layers/quantization/kernels/scaled_mm/aiter.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py similarity index 100% rename from model_executor/layers/quantization/kernels/scaled_mm/aiter.py rename to vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py diff --git a/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py similarity index 100% rename from model_executor/layers/quantization/kernels/scaled_mm/cutlass.py rename to vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py diff --git a/model_executor/layers/quantization/kernels/scaled_mm/triton.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py similarity index 100% rename from model_executor/layers/quantization/kernels/scaled_mm/triton.py rename to vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py diff --git a/model_executor/layers/quantization/kernels/scaled_mm/xla.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py similarity index 100% rename from model_executor/layers/quantization/kernels/scaled_mm/xla.py rename to vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py diff --git a/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py similarity index 100% rename from model_executor/layers/quantization/kv_cache.py rename to vllm/model_executor/layers/quantization/kv_cache.py diff --git a/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py similarity index 100% rename from model_executor/layers/quantization/marlin.py rename to vllm/model_executor/layers/quantization/marlin.py diff --git a/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py similarity index 100% rename from model_executor/layers/quantization/modelopt.py rename to vllm/model_executor/layers/quantization/modelopt.py diff --git a/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py similarity index 100% rename from model_executor/layers/quantization/moe_wna16.py rename to vllm/model_executor/layers/quantization/moe_wna16.py diff --git a/model_executor/layers/quantization/neuron_quant.py b/vllm/model_executor/layers/quantization/neuron_quant.py similarity index 100% rename from model_executor/layers/quantization/neuron_quant.py rename to vllm/model_executor/layers/quantization/neuron_quant.py diff --git a/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py similarity index 100% rename from model_executor/layers/quantization/ptpc_fp8.py rename to vllm/model_executor/layers/quantization/ptpc_fp8.py diff --git a/model_executor/layers/quantization/qqq.py b/vllm/model_executor/layers/quantization/qqq.py similarity index 100% rename from model_executor/layers/quantization/qqq.py rename to vllm/model_executor/layers/quantization/qqq.py diff --git a/model_executor/layers/quantization/quark/__init__.py b/vllm/model_executor/layers/quantization/quark/__init__.py similarity index 100% rename from model_executor/layers/quantization/quark/__init__.py rename to vllm/model_executor/layers/quantization/quark/__init__.py diff --git a/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py similarity index 100% rename from model_executor/layers/quantization/quark/quark.py rename to vllm/model_executor/layers/quantization/quark/quark.py diff --git a/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py similarity index 100% rename from model_executor/layers/quantization/quark/quark_moe.py rename to vllm/model_executor/layers/quantization/quark/quark_moe.py diff --git a/model_executor/layers/quantization/quark/schemes/__init__.py b/vllm/model_executor/layers/quantization/quark/schemes/__init__.py similarity index 100% rename from model_executor/layers/quantization/quark/schemes/__init__.py rename to vllm/model_executor/layers/quantization/quark/schemes/__init__.py diff --git a/model_executor/layers/quantization/quark/schemes/quark_scheme.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py similarity index 100% rename from model_executor/layers/quantization/quark/schemes/quark_scheme.py rename to vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py diff --git a/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py similarity index 100% rename from model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py rename to vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py diff --git a/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py similarity index 100% rename from model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py rename to vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py diff --git a/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py similarity index 100% rename from model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py rename to vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py diff --git a/model_executor/layers/quantization/quark/utils.py b/vllm/model_executor/layers/quantization/quark/utils.py similarity index 100% rename from model_executor/layers/quantization/quark/utils.py rename to vllm/model_executor/layers/quantization/quark/utils.py diff --git a/model_executor/layers/quantization/schema.py b/vllm/model_executor/layers/quantization/schema.py similarity index 100% rename from model_executor/layers/quantization/schema.py rename to vllm/model_executor/layers/quantization/schema.py diff --git a/model_executor/layers/quantization/torchao.py b/vllm/model_executor/layers/quantization/torchao.py similarity index 100% rename from model_executor/layers/quantization/torchao.py rename to vllm/model_executor/layers/quantization/torchao.py diff --git a/model_executor/layers/quantization/tpu_int8.py b/vllm/model_executor/layers/quantization/tpu_int8.py similarity index 100% rename from model_executor/layers/quantization/tpu_int8.py rename to vllm/model_executor/layers/quantization/tpu_int8.py diff --git a/model_executor/layers/quantization/utils/__init__.py b/vllm/model_executor/layers/quantization/utils/__init__.py similarity index 100% rename from model_executor/layers/quantization/utils/__init__.py rename to vllm/model_executor/layers/quantization/utils/__init__.py diff --git a/model_executor/layers/quantization/utils/allspark_utils.py b/vllm/model_executor/layers/quantization/utils/allspark_utils.py similarity index 100% rename from model_executor/layers/quantization/utils/allspark_utils.py rename to vllm/model_executor/layers/quantization/utils/allspark_utils.py diff --git a/model_executor/layers/quantization/utils/bitblas_utils.py b/vllm/model_executor/layers/quantization/utils/bitblas_utils.py similarity index 100% rename from model_executor/layers/quantization/utils/bitblas_utils.py rename to vllm/model_executor/layers/quantization/utils/bitblas_utils.py diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py similarity index 100% rename from model_executor/layers/quantization/utils/fp8_utils.py rename to vllm/model_executor/layers/quantization/utils/fp8_utils.py diff --git a/model_executor/layers/quantization/utils/gptq_utils.py b/vllm/model_executor/layers/quantization/utils/gptq_utils.py similarity index 100% rename from model_executor/layers/quantization/utils/gptq_utils.py rename to vllm/model_executor/layers/quantization/utils/gptq_utils.py diff --git a/model_executor/layers/quantization/utils/int8_utils.py b/vllm/model_executor/layers/quantization/utils/int8_utils.py similarity index 100% rename from model_executor/layers/quantization/utils/int8_utils.py rename to vllm/model_executor/layers/quantization/utils/int8_utils.py diff --git a/model_executor/layers/quantization/utils/layer_utils.py b/vllm/model_executor/layers/quantization/utils/layer_utils.py similarity index 100% rename from model_executor/layers/quantization/utils/layer_utils.py rename to vllm/model_executor/layers/quantization/utils/layer_utils.py diff --git a/model_executor/layers/quantization/utils/machete_utils.py b/vllm/model_executor/layers/quantization/utils/machete_utils.py similarity index 100% rename from model_executor/layers/quantization/utils/machete_utils.py rename to vllm/model_executor/layers/quantization/utils/machete_utils.py diff --git a/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py similarity index 100% rename from model_executor/layers/quantization/utils/marlin_utils.py rename to vllm/model_executor/layers/quantization/utils/marlin_utils.py diff --git a/model_executor/layers/quantization/utils/marlin_utils_fp4.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py similarity index 100% rename from model_executor/layers/quantization/utils/marlin_utils_fp4.py rename to vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py diff --git a/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py similarity index 100% rename from model_executor/layers/quantization/utils/marlin_utils_fp8.py rename to vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py diff --git a/model_executor/layers/quantization/utils/marlin_utils_test.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py similarity index 100% rename from model_executor/layers/quantization/utils/marlin_utils_test.py rename to vllm/model_executor/layers/quantization/utils/marlin_utils_test.py diff --git a/model_executor/layers/quantization/utils/marlin_utils_test_24.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py similarity index 100% rename from model_executor/layers/quantization/utils/marlin_utils_test_24.py rename to vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py diff --git a/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py similarity index 100% rename from model_executor/layers/quantization/utils/marlin_utils_test_qqq.py rename to vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py diff --git a/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py similarity index 100% rename from model_executor/layers/quantization/utils/mxfp4_utils.py rename to vllm/model_executor/layers/quantization/utils/mxfp4_utils.py diff --git a/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py b/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py similarity index 100% rename from model_executor/layers/quantization/utils/nvfp4_emulation_utils.py rename to vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py diff --git a/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py similarity index 100% rename from model_executor/layers/quantization/utils/quant_utils.py rename to vllm/model_executor/layers/quantization/utils/quant_utils.py diff --git a/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py similarity index 100% rename from model_executor/layers/quantization/utils/w8a8_utils.py rename to vllm/model_executor/layers/quantization/utils/w8a8_utils.py diff --git a/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py similarity index 100% rename from model_executor/layers/rejection_sampler.py rename to vllm/model_executor/layers/rejection_sampler.py diff --git a/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py similarity index 100% rename from model_executor/layers/resampler.py rename to vllm/model_executor/layers/resampler.py diff --git a/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py similarity index 100% rename from model_executor/layers/rotary_embedding.py rename to vllm/model_executor/layers/rotary_embedding.py diff --git a/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py similarity index 100% rename from model_executor/layers/sampler.py rename to vllm/model_executor/layers/sampler.py diff --git a/model_executor/layers/spec_decode_base_sampler.py b/vllm/model_executor/layers/spec_decode_base_sampler.py similarity index 100% rename from model_executor/layers/spec_decode_base_sampler.py rename to vllm/model_executor/layers/spec_decode_base_sampler.py diff --git a/model_executor/layers/typical_acceptance_sampler.py b/vllm/model_executor/layers/typical_acceptance_sampler.py similarity index 100% rename from model_executor/layers/typical_acceptance_sampler.py rename to vllm/model_executor/layers/typical_acceptance_sampler.py diff --git a/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py similarity index 100% rename from model_executor/layers/utils.py rename to vllm/model_executor/layers/utils.py diff --git a/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py similarity index 100% rename from model_executor/layers/vocab_parallel_embedding.py rename to vllm/model_executor/layers/vocab_parallel_embedding.py diff --git a/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py similarity index 100% rename from model_executor/model_loader/__init__.py rename to vllm/model_executor/model_loader/__init__.py diff --git a/model_executor/model_loader/base_loader.py b/vllm/model_executor/model_loader/base_loader.py similarity index 100% rename from model_executor/model_loader/base_loader.py rename to vllm/model_executor/model_loader/base_loader.py diff --git a/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py similarity index 100% rename from model_executor/model_loader/bitsandbytes_loader.py rename to vllm/model_executor/model_loader/bitsandbytes_loader.py diff --git a/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py similarity index 100% rename from model_executor/model_loader/default_loader.py rename to vllm/model_executor/model_loader/default_loader.py diff --git a/model_executor/model_loader/dummy_loader.py b/vllm/model_executor/model_loader/dummy_loader.py similarity index 100% rename from model_executor/model_loader/dummy_loader.py rename to vllm/model_executor/model_loader/dummy_loader.py diff --git a/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py similarity index 100% rename from model_executor/model_loader/gguf_loader.py rename to vllm/model_executor/model_loader/gguf_loader.py diff --git a/model_executor/model_loader/neuron.py b/vllm/model_executor/model_loader/neuron.py similarity index 100% rename from model_executor/model_loader/neuron.py rename to vllm/model_executor/model_loader/neuron.py diff --git a/model_executor/model_loader/neuronx_distributed.py b/vllm/model_executor/model_loader/neuronx_distributed.py similarity index 100% rename from model_executor/model_loader/neuronx_distributed.py rename to vllm/model_executor/model_loader/neuronx_distributed.py diff --git a/model_executor/model_loader/runai_streamer_loader.py b/vllm/model_executor/model_loader/runai_streamer_loader.py similarity index 100% rename from model_executor/model_loader/runai_streamer_loader.py rename to vllm/model_executor/model_loader/runai_streamer_loader.py diff --git a/model_executor/model_loader/sharded_state_loader.py b/vllm/model_executor/model_loader/sharded_state_loader.py similarity index 100% rename from model_executor/model_loader/sharded_state_loader.py rename to vllm/model_executor/model_loader/sharded_state_loader.py diff --git a/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py similarity index 100% rename from model_executor/model_loader/tensorizer.py rename to vllm/model_executor/model_loader/tensorizer.py diff --git a/model_executor/model_loader/tensorizer_loader.py b/vllm/model_executor/model_loader/tensorizer_loader.py similarity index 100% rename from model_executor/model_loader/tensorizer_loader.py rename to vllm/model_executor/model_loader/tensorizer_loader.py diff --git a/model_executor/model_loader/tpu.py b/vllm/model_executor/model_loader/tpu.py similarity index 100% rename from model_executor/model_loader/tpu.py rename to vllm/model_executor/model_loader/tpu.py diff --git a/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py similarity index 100% rename from model_executor/model_loader/utils.py rename to vllm/model_executor/model_loader/utils.py diff --git a/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py similarity index 100% rename from model_executor/model_loader/weight_utils.py rename to vllm/model_executor/model_loader/weight_utils.py diff --git a/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py similarity index 100% rename from model_executor/models/__init__.py rename to vllm/model_executor/models/__init__.py diff --git a/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py similarity index 100% rename from model_executor/models/adapters.py rename to vllm/model_executor/models/adapters.py diff --git a/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py similarity index 100% rename from model_executor/models/aimv2.py rename to vllm/model_executor/models/aimv2.py diff --git a/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py similarity index 100% rename from model_executor/models/arctic.py rename to vllm/model_executor/models/arctic.py diff --git a/model_executor/models/aria.py b/vllm/model_executor/models/aria.py similarity index 100% rename from model_executor/models/aria.py rename to vllm/model_executor/models/aria.py diff --git a/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py similarity index 100% rename from model_executor/models/aya_vision.py rename to vllm/model_executor/models/aya_vision.py diff --git a/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py similarity index 100% rename from model_executor/models/baichuan.py rename to vllm/model_executor/models/baichuan.py diff --git a/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py similarity index 100% rename from model_executor/models/bamba.py rename to vllm/model_executor/models/bamba.py diff --git a/model_executor/models/bart.py b/vllm/model_executor/models/bart.py similarity index 100% rename from model_executor/models/bart.py rename to vllm/model_executor/models/bart.py diff --git a/model_executor/models/bert.py b/vllm/model_executor/models/bert.py similarity index 100% rename from model_executor/models/bert.py rename to vllm/model_executor/models/bert.py diff --git a/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py similarity index 100% rename from model_executor/models/bert_with_rope.py rename to vllm/model_executor/models/bert_with_rope.py diff --git a/model_executor/models/blip.py b/vllm/model_executor/models/blip.py similarity index 100% rename from model_executor/models/blip.py rename to vllm/model_executor/models/blip.py diff --git a/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py similarity index 100% rename from model_executor/models/blip2.py rename to vllm/model_executor/models/blip2.py diff --git a/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py similarity index 100% rename from model_executor/models/bloom.py rename to vllm/model_executor/models/bloom.py diff --git a/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py similarity index 100% rename from model_executor/models/chameleon.py rename to vllm/model_executor/models/chameleon.py diff --git a/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py similarity index 100% rename from model_executor/models/chatglm.py rename to vllm/model_executor/models/chatglm.py diff --git a/model_executor/models/clip.py b/vllm/model_executor/models/clip.py similarity index 100% rename from model_executor/models/clip.py rename to vllm/model_executor/models/clip.py diff --git a/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py similarity index 100% rename from model_executor/models/commandr.py rename to vllm/model_executor/models/commandr.py diff --git a/model_executor/models/constant_size_cache.py b/vllm/model_executor/models/constant_size_cache.py similarity index 100% rename from model_executor/models/constant_size_cache.py rename to vllm/model_executor/models/constant_size_cache.py diff --git a/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py similarity index 100% rename from model_executor/models/dbrx.py rename to vllm/model_executor/models/dbrx.py diff --git a/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py similarity index 100% rename from model_executor/models/deepseek.py rename to vllm/model_executor/models/deepseek.py diff --git a/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py similarity index 100% rename from model_executor/models/deepseek_mtp.py rename to vllm/model_executor/models/deepseek_mtp.py diff --git a/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py similarity index 100% rename from model_executor/models/deepseek_v2.py rename to vllm/model_executor/models/deepseek_v2.py diff --git a/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py similarity index 100% rename from model_executor/models/deepseek_vl2.py rename to vllm/model_executor/models/deepseek_vl2.py diff --git a/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py similarity index 100% rename from model_executor/models/eagle.py rename to vllm/model_executor/models/eagle.py diff --git a/model_executor/models/ernie45.py b/vllm/model_executor/models/ernie45.py similarity index 100% rename from model_executor/models/ernie45.py rename to vllm/model_executor/models/ernie45.py diff --git a/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py similarity index 100% rename from model_executor/models/ernie45_moe.py rename to vllm/model_executor/models/ernie45_moe.py diff --git a/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py similarity index 100% rename from model_executor/models/exaone.py rename to vllm/model_executor/models/exaone.py diff --git a/model_executor/models/fairseq2_llama.py b/vllm/model_executor/models/fairseq2_llama.py similarity index 100% rename from model_executor/models/fairseq2_llama.py rename to vllm/model_executor/models/fairseq2_llama.py diff --git a/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py similarity index 100% rename from model_executor/models/falcon.py rename to vllm/model_executor/models/falcon.py diff --git a/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py similarity index 100% rename from model_executor/models/falcon_h1.py rename to vllm/model_executor/models/falcon_h1.py diff --git a/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py similarity index 100% rename from model_executor/models/florence2.py rename to vllm/model_executor/models/florence2.py diff --git a/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py similarity index 100% rename from model_executor/models/fuyu.py rename to vllm/model_executor/models/fuyu.py diff --git a/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py similarity index 100% rename from model_executor/models/gemma.py rename to vllm/model_executor/models/gemma.py diff --git a/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py similarity index 100% rename from model_executor/models/gemma2.py rename to vllm/model_executor/models/gemma2.py diff --git a/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py similarity index 100% rename from model_executor/models/gemma3.py rename to vllm/model_executor/models/gemma3.py diff --git a/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py similarity index 100% rename from model_executor/models/gemma3_mm.py rename to vllm/model_executor/models/gemma3_mm.py diff --git a/model_executor/models/glm.py b/vllm/model_executor/models/glm.py similarity index 100% rename from model_executor/models/glm.py rename to vllm/model_executor/models/glm.py diff --git a/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py similarity index 100% rename from model_executor/models/glm4.py rename to vllm/model_executor/models/glm4.py diff --git a/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py similarity index 100% rename from model_executor/models/glm4v.py rename to vllm/model_executor/models/glm4v.py diff --git a/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py similarity index 100% rename from model_executor/models/gpt2.py rename to vllm/model_executor/models/gpt2.py diff --git a/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py similarity index 100% rename from model_executor/models/gpt_bigcode.py rename to vllm/model_executor/models/gpt_bigcode.py diff --git a/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py similarity index 100% rename from model_executor/models/gpt_j.py rename to vllm/model_executor/models/gpt_j.py diff --git a/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py similarity index 100% rename from model_executor/models/gpt_neox.py rename to vllm/model_executor/models/gpt_neox.py diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py new file mode 100644 index 0000000..7d4293a --- /dev/null +++ b/vllm/model_executor/models/gpt_oss.py @@ -0,0 +1,618 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterable +from typing import Optional + +import torch +import torch.distributed as dist +from torch import nn +from transformers import GptOssConfig + +from vllm import envs +from vllm.attention import Attention, AttentionType +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig +from vllm.distributed import (get_ep_group, get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size) +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors +from vllm.utils import cdiv + +from .utils import extract_layer_index, maybe_prefix + + +class OAIAttention(nn.Module): + + def __init__( + self, + config: GptOssConfig, + quant_config: Optional[QuantizationConfig] = None, + cache_config: Optional[CacheConfig] = None, + prefix: str = "", + ): + super().__init__() + self.layer_idx = extract_layer_index(prefix) + self.head_dim = config.head_dim + self.num_attention_heads = config.num_attention_heads + self.num_key_value_heads = config.num_key_value_heads + self.hidden_size = config.hidden_size + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=config.max_position_embeddings, + base=config.rope_theta, + dtype=torch.float32, + rope_scaling={ + "rope_type": + "yarn", + "factor": + config.rope_scaling["factor"], + "original_max_position_embeddings": + config.rope_scaling["original_max_position_embeddings"], + "beta_fast": + config.rope_scaling["beta_fast"], + "beta_slow": + config.rope_scaling["beta_slow"], + }, + is_neox_style=True, + ) + + tp_size = get_tensor_model_parallel_world_size() + + # attention_sink_dtype = (torch.float32 if envs.VLLM_USE_TRTLLM_ATTENTION + # else torch.bfloat16) + attention_sink_dtype = torch.bfloat16 + self.sinks = torch.nn.Parameter( + torch.empty(config.num_attention_heads // tp_size, + dtype=attention_sink_dtype, + requires_grad=False)) + + self.norm = RMSNorm(config.hidden_size, eps=1e-5) + + self.q_size = self.num_attention_heads * self.head_dim // tp_size + self.kv_size = self.num_key_value_heads * self.head_dim // tp_size + self.scaling = self.head_dim**-0.5 + self.rope_theta = config.rope_theta + + self.qkv = QKVParallelLinear( + hidden_size=self.hidden_size, + head_size=self.head_dim, + total_num_heads=self.num_attention_heads, + total_num_kv_heads=self.num_key_value_heads, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + + self.o_proj = RowParallelLinear( + input_size=self.num_attention_heads * self.head_dim, + output_size=self.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + self.num_local_attention_heads = config.num_attention_heads // tp_size + self.num_local_key_value_heads = config.num_key_value_heads // tp_size + + # Only apply sliding window to every other layer + sliding_window = (config.sliding_window if self.layer_idx % + 2 == 0 else None) + self.attn = Attention( + self.num_local_attention_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_local_key_value_heads, + cache_config=cache_config, + quant_config=quant_config, + per_layer_sliding_window=sliding_window, + attn_type=AttentionType.DECODER, + prefix=f"{prefix}.attn", + sinks=self.sinks, + ) + + def forward(self, hidden_states: torch.Tensor, + positions: torch.Tensor) -> torch.Tensor: + t = self.norm(hidden_states) + + qkv, _ = self.qkv(t) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + v = v.contiguous() + attn_output = self.attn(q, k, v) + output, _ = self.o_proj(attn_output) + + return output + hidden_states + + +class MLPBlock(torch.nn.Module): + + def __init__( + self, + config: GptOssConfig, + layer_idx: int, + quant_config: QuantizationConfig, + prefix: str = "", + ): + super().__init__() + self.layer_idx = layer_idx + self.num_experts = config.num_local_experts + self.experts_per_token = config.num_experts_per_tok + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + self.norm = RMSNorm(config.hidden_size, eps=1e-5) + self.router = torch.nn.Linear(config.hidden_size, + config.num_local_experts, + dtype=torch.bfloat16) + assert config.intermediate_size % self.world_size == 0 + self.experts = FusedMoE(num_experts=config.num_local_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + reduce_results=True, + renormalize=True, + quant_config=quant_config, + prefix=f"{prefix}.experts", + apply_router_weight_on_input=False, + has_bias=True, + activation="swiglu_oai") + + def forward(self, x: torch.Tensor) -> torch.Tensor: + t = self.norm(x) + g = self.router(t) + t = self.experts(hidden_states=t, router_logits=g) + return x + t + + +class TransformerBlock(torch.nn.Module): + + def __init__( + self, + config: GptOssConfig, + quant_config: QuantizationConfig, + prefix: str = "", + ): + super().__init__() + self.layer_idx = extract_layer_index(prefix) + self.attn = OAIAttention(config, prefix=f"{prefix}.attn") + self.mlp = MLPBlock(config, + self.layer_idx, + quant_config=quant_config, + prefix=f"{prefix}.mlp") + + def forward(self, hidden_states: torch.Tensor, + positions: torch.Tensor) -> torch.Tensor: + attn_output = self.attn(hidden_states, positions) + output = self.mlp(attn_output) + return output + + +@support_torch_compile +class GptOssModel(nn.Module): + + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + ): + super().__init__() + self.config = vllm_config.model_config.hf_config + self.quant_config = vllm_config.quant_config + self.config.hidden_size = self.config.hidden_size + self.embedding = VocabParallelEmbedding( + self.config.vocab_size, + self.config.hidden_size, + ) + self.layers = torch.nn.ModuleList([ + TransformerBlock( + self.config, + quant_config=self.quant_config, + prefix=maybe_prefix(prefix, f"block.{layer_idx}"), + ) for layer_idx in range(self.config.num_hidden_layers) + ]) + self.norm = RMSNorm(self.config.hidden_size, eps=1e-5) + + def forward(self, input_ids: torch.Tensor, + positions: torch.Tensor) -> torch.Tensor: + x = self.embedding(input_ids) + for layer in self.layers: + x = layer(x, positions) + x = self.norm(x) + return x + + +class GptOssForCausalLM(nn.Module): + + def __init__( + self, + vllm_config: VllmConfig, + prefix: str = "", + ): + super().__init__() + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config.hf_config + self.model = GptOssModel( + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model"), + ) + self.lm_head = ParallelLMHead( + self.model_config.vocab_size, + self.model_config.hidden_size, + ) + self.logits_processor = LogitsProcessor(self.model_config.vocab_size) + + def forward(self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None) -> torch.Tensor: + assert intermediate_tensors is None + assert inputs_embeds is None + return self.model(input_ids, positions) + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def _load_weights_mxfp4( + self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + rename_mapping = { + "self_attn": "attn", + "input_layernorm.weight": "attn.norm.weight", + "post_attention_layernorm.weight": "mlp.norm.weight", + "embed_tokens": "embedding", + } + + def maybe_rename(name: str) -> str: + for remap_name, new_name in rename_mapping.items(): + if remap_name in name: + return name.replace(remap_name, new_name) + return name + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + mxfp4_block = 32 + + tp_rank = get_tensor_model_parallel_rank() + tp_size = get_tensor_model_parallel_world_size() + intermediate_size = self.model_config.intermediate_size + intermediate_size_block = intermediate_size // mxfp4_block + per_rank_intermediate_size_block = cdiv(intermediate_size_block, + tp_size) + per_rank_intermediate_size = (per_rank_intermediate_size_block * + mxfp4_block) + + # Calculate common slicing bounds for current rank + tp_rank_start = tp_rank * per_rank_intermediate_size + tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, + intermediate_size) + + # Attention heads per rank + heads_per_rank = self.model_config.num_attention_heads // tp_size + head_start = tp_rank * heads_per_rank + + use_ep = self.vllm_config.parallel_config.enable_expert_parallel + ep_size = get_ep_group().world_size + ep_rank = get_ep_group().rank + num_experts = self.model_config.num_local_experts + experts_per_rank = num_experts // ep_size + ep_rank_start = ep_rank * experts_per_rank + ep_rank_end = (ep_rank + 1) * experts_per_rank + + for name, weight in weights: + # FIXME(woosuk): Remove this after testing. + weight = weight.cuda() + + if "gate_up_proj_blocks" in name: + # Handle MLP gate and up projection weights + new_name = name.replace("gate_up_proj_blocks", "w13_weight") + + # flat weight from (E, 2 * N, block_size, entry_per_block) + # to (E, 2 * N, -1), shouldn't trigger copy for contiguous + weight = weight.view(num_experts, 2 * intermediate_size, + -1).contiguous() + + # Extract gate and up projection parts + # since the weight is shuffled, we can slice directly + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, + 2 * tp_rank_start:2 * tp_rank_end, + ...] + + param = params_dict[new_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=new_name, + shard_id=None, + expert_id=None) + loaded_params.add(new_name) + + elif "down_proj_blocks" in name: + # Handle MLP down projection weights + new_name = name.replace("down_proj_blocks", "w2_weight") + # same flatten here, but since 2 mx4 value are packed in 1 + # uint8, divide by 2 + weight = weight.view(num_experts, -1, + intermediate_size // 2).contiguous() + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[..., + tp_rank_start // 2:tp_rank_end // 2] + + param = params_dict[new_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=new_name, + shard_id=None, + expert_id=None) + loaded_params.add(new_name) + + elif "gate_up_proj_scales" in name: + # Handle MLP gate and up projection weights scale + new_name = name.replace("gate_up_proj_scales", + "w13_weight_scale") + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, + 2 * tp_rank_start:2 * tp_rank_end, + ...] + + param = params_dict[new_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=new_name, + shard_id=None, + expert_id=None) + loaded_params.add(new_name) + + elif "down_proj_scales" in name: + # Handle MLP down projection weights + new_name = name.replace("down_proj_scales", "w2_weight_scale") + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[..., tp_rank_start // + mxfp4_block:tp_rank_end // + mxfp4_block] + + param = params_dict[new_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=new_name, + shard_id=None, + expert_id=None) + loaded_params.add(new_name) + elif "gate_up_proj_bias" in name: + # Handle MLP gate and up projection biases + new_name = name.replace("gate_up_proj_bias", "w13_bias") + + # Extract gate and up projection bias parts + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, + 2 * tp_rank_start:2 * tp_rank_end] + + param = params_dict[new_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=new_name, + shard_id=None, + expert_id=None) + loaded_params.add(new_name) + + elif "down_proj_bias" in name: + # Handle MLP down projection bias + new_name = name.replace("down_proj_bias", "w2_bias") + param = params_dict[new_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if use_ep: + weight = weight[ep_rank_start:ep_rank_end, ...] + else: + # (only load on rank 0 to avoid duplication) + if tp_rank != 0: + weight.zero_() + weight_loader(param, + weight, + weight_name=new_name, + shard_id=None, + expert_id=None) + loaded_params.add(new_name) + elif "sinks" in name: + # Handle attention sinks (distributed across ranks) + name = name.replace("self_attn", "attn") + param = params_dict[name] + narrow_weight = weight.narrow(0, head_start, heads_per_rank) + param.data.copy_(narrow_weight) + loaded_params.add(name) + elif "q_proj" in name or "k_proj" in name or "v_proj" in name: + shard_id = ("q" if "q_proj" in name else + "k" if "k_proj" in name else "v") + name = name.replace("self_attn", "attn") + param_name = name.replace(f"{shard_id}_proj", "qkv") + param = params_dict[param_name] + weight_loader = param.weight_loader + weight_loader(param, weight, loaded_shard_id=shard_id) + loaded_params.add(param_name) + else: + # Handle all other weights with potential renaming + renamed_name = maybe_rename(name) + if renamed_name not in params_dict: + continue + param = params_dict[renamed_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, weight) + loaded_params.add(renamed_name) + + return loaded_params + + def _load_weights_other( + self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + rename_mapping = { + "self_attn": "attn", + "input_layernorm.weight": "attn.norm.weight", + "post_attention_layernorm.weight": "mlp.norm.weight", + "embed_tokens": "embedding", + } + + def maybe_rename(name: str) -> str: + for remap_name, new_name in rename_mapping.items(): + if remap_name in name: + return name.replace(remap_name, new_name) + return name + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + + tp_rank = get_tensor_model_parallel_rank() + tp_size = get_tensor_model_parallel_world_size() + intermediate_size = self.model_config.intermediate_size + + per_rank_intermediate_size = cdiv(intermediate_size, tp_size) + # Calculate common slicing bounds for current rank + tp_rank_start = tp_rank * per_rank_intermediate_size + tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, + intermediate_size) + + # Attention heads per rank + heads_per_rank = self.model_config.num_attention_heads // tp_size + head_start = tp_rank * heads_per_rank + + use_ep = self.vllm_config.parallel_config.enable_expert_parallel + ep_size = get_ep_group().world_size + ep_rank = get_ep_group().rank + num_experts = self.model_config.num_local_experts + experts_per_rank = num_experts // ep_size + ep_rank_start = ep_rank * experts_per_rank + ep_rank_end = (ep_rank + 1) * experts_per_rank + + for name, weight in weights: + if ".experts.gate_up_proj" in name and "bias" not in name: + # Handle MLP gate and up projection weights + new_name = name.replace(".experts.gate_up_proj", + ".experts.w13_weight") + + # Extract gate and up projection parts + # since the weight is shuffled, we can slice directly + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, :, + 2 * tp_rank_start:2 * tp_rank_end] + + narrow_weight = narrow_weight.permute(0, 2, 1).contiguous() + param = params_dict[new_name] + + param.copy_(narrow_weight) + loaded_params.add(new_name) + + elif ".experts.down_proj" in name and "bias" not in name: + # Handle MLP down projection weights + new_name = name.replace(".experts.down_proj", + ".experts.w2_weight") + + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, tp_rank_start:tp_rank_end, :] + narrow_weight = narrow_weight.permute(0, 2, 1).contiguous() + param = params_dict[new_name] + + param.copy_(narrow_weight) + loaded_params.add(new_name) + + elif "gate_up_proj_bias" in name: + # Handle MLP gate and up projection biases + new_name = name.replace("gate_up_proj_bias", "w13_bias") + + # Extract gate and up projection bias parts + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, + 2 * tp_rank_start:2 * tp_rank_end] + + param = params_dict[new_name] + + param.copy_(narrow_weight) + loaded_params.add(new_name) + + elif "down_proj_bias" in name: + # Handle MLP down projection bias + new_name = name.replace("down_proj_bias", "w2_bias") + + if use_ep: + weight = weight[ep_rank_start:ep_rank_end, ...] + else: + # (only load on rank 0 to avoid duplication) + if tp_rank != 0: + weight.zero_() + param = params_dict[new_name] + param.copy_(weight) + loaded_params.add(new_name) + elif "sinks" in name: + # Handle attention sinks (distributed across ranks) + name = name.replace("self_attn", "attn") + param = params_dict[name] + narrow_weight = weight.narrow(0, head_start, heads_per_rank) + param.data.copy_(narrow_weight) + loaded_params.add(name) + elif "q_proj" in name or "k_proj" in name or "v_proj" in name: + shard_id = ("q" if "q_proj" in name else + "k" if "k_proj" in name else "v") + name = name.replace("self_attn", "attn") + param_name = name.replace(f"{shard_id}_proj", "qkv") + param = params_dict[param_name] + weight_loader = param.weight_loader + weight_loader(param, weight, loaded_shard_id=shard_id) + loaded_params.add(param_name) + else: + # Handle all other weights with potential renaming + + renamed_name = maybe_rename(name) + if renamed_name not in params_dict: + continue + param = params_dict[renamed_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, weight) + loaded_params.add(renamed_name) + + return loaded_params + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + quant_method = (self.model_config.quantization_config['quant_method'] + if hasattr(self.model_config, "quantization_config") + else None) + if quant_method == "mxfp4": + return self._load_weights_mxfp4(weights) + else: + return self._load_weights_other(weights) diff --git a/model_executor/models/granite.py b/vllm/model_executor/models/granite.py similarity index 100% rename from model_executor/models/granite.py rename to vllm/model_executor/models/granite.py diff --git a/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py similarity index 100% rename from model_executor/models/granite_speech.py rename to vllm/model_executor/models/granite_speech.py diff --git a/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py similarity index 100% rename from model_executor/models/granitemoe.py rename to vllm/model_executor/models/granitemoe.py diff --git a/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py similarity index 100% rename from model_executor/models/granitemoehybrid.py rename to vllm/model_executor/models/granitemoehybrid.py diff --git a/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py similarity index 100% rename from model_executor/models/granitemoeshared.py rename to vllm/model_executor/models/granitemoeshared.py diff --git a/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py similarity index 100% rename from model_executor/models/gritlm.py rename to vllm/model_executor/models/gritlm.py diff --git a/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py similarity index 100% rename from model_executor/models/grok1.py rename to vllm/model_executor/models/grok1.py diff --git a/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py similarity index 100% rename from model_executor/models/h2ovl.py rename to vllm/model_executor/models/h2ovl.py diff --git a/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py similarity index 100% rename from model_executor/models/idefics2_vision_model.py rename to vllm/model_executor/models/idefics2_vision_model.py diff --git a/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py similarity index 100% rename from model_executor/models/idefics3.py rename to vllm/model_executor/models/idefics3.py diff --git a/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py similarity index 100% rename from model_executor/models/interfaces.py rename to vllm/model_executor/models/interfaces.py diff --git a/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py similarity index 100% rename from model_executor/models/interfaces_base.py rename to vllm/model_executor/models/interfaces_base.py diff --git a/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py similarity index 100% rename from model_executor/models/intern_vit.py rename to vllm/model_executor/models/intern_vit.py diff --git a/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py similarity index 100% rename from model_executor/models/internlm2.py rename to vllm/model_executor/models/internlm2.py diff --git a/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py similarity index 100% rename from model_executor/models/internlm2_ve.py rename to vllm/model_executor/models/internlm2_ve.py diff --git a/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py similarity index 100% rename from model_executor/models/internvl.py rename to vllm/model_executor/models/internvl.py diff --git a/model_executor/models/jais.py b/vllm/model_executor/models/jais.py similarity index 100% rename from model_executor/models/jais.py rename to vllm/model_executor/models/jais.py diff --git a/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py similarity index 100% rename from model_executor/models/jamba.py rename to vllm/model_executor/models/jamba.py diff --git a/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py similarity index 100% rename from model_executor/models/kimi_vl.py rename to vllm/model_executor/models/kimi_vl.py diff --git a/model_executor/models/llama.py b/vllm/model_executor/models/llama.py similarity index 100% rename from model_executor/models/llama.py rename to vllm/model_executor/models/llama.py diff --git a/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py similarity index 100% rename from model_executor/models/llama4.py rename to vllm/model_executor/models/llama4.py diff --git a/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py similarity index 100% rename from model_executor/models/llama_eagle.py rename to vllm/model_executor/models/llama_eagle.py diff --git a/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py similarity index 100% rename from model_executor/models/llama_eagle3.py rename to vllm/model_executor/models/llama_eagle3.py diff --git a/model_executor/models/llava.py b/vllm/model_executor/models/llava.py similarity index 100% rename from model_executor/models/llava.py rename to vllm/model_executor/models/llava.py diff --git a/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py similarity index 100% rename from model_executor/models/llava_next.py rename to vllm/model_executor/models/llava_next.py diff --git a/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py similarity index 100% rename from model_executor/models/llava_next_video.py rename to vllm/model_executor/models/llava_next_video.py diff --git a/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py similarity index 100% rename from model_executor/models/llava_onevision.py rename to vllm/model_executor/models/llava_onevision.py diff --git a/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py similarity index 100% rename from model_executor/models/mamba.py rename to vllm/model_executor/models/mamba.py diff --git a/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py similarity index 100% rename from model_executor/models/mamba2.py rename to vllm/model_executor/models/mamba2.py diff --git a/model_executor/models/mamba_cache.py b/vllm/model_executor/models/mamba_cache.py similarity index 100% rename from model_executor/models/mamba_cache.py rename to vllm/model_executor/models/mamba_cache.py diff --git a/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py similarity index 100% rename from model_executor/models/medusa.py rename to vllm/model_executor/models/medusa.py diff --git a/model_executor/models/mimo.py b/vllm/model_executor/models/mimo.py similarity index 100% rename from model_executor/models/mimo.py rename to vllm/model_executor/models/mimo.py diff --git a/model_executor/models/mimo_mtp.py b/vllm/model_executor/models/mimo_mtp.py similarity index 100% rename from model_executor/models/mimo_mtp.py rename to vllm/model_executor/models/mimo_mtp.py diff --git a/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py similarity index 100% rename from model_executor/models/minicpm.py rename to vllm/model_executor/models/minicpm.py diff --git a/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py similarity index 100% rename from model_executor/models/minicpm3.py rename to vllm/model_executor/models/minicpm3.py diff --git a/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py similarity index 100% rename from model_executor/models/minicpm_eagle.py rename to vllm/model_executor/models/minicpm_eagle.py diff --git a/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py similarity index 100% rename from model_executor/models/minicpmo.py rename to vllm/model_executor/models/minicpmo.py diff --git a/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py similarity index 100% rename from model_executor/models/minicpmv.py rename to vllm/model_executor/models/minicpmv.py diff --git a/model_executor/models/minimax_cache.py b/vllm/model_executor/models/minimax_cache.py similarity index 100% rename from model_executor/models/minimax_cache.py rename to vllm/model_executor/models/minimax_cache.py diff --git a/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py similarity index 100% rename from model_executor/models/minimax_text_01.py rename to vllm/model_executor/models/minimax_text_01.py diff --git a/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py similarity index 100% rename from model_executor/models/minimax_vl_01.py rename to vllm/model_executor/models/minimax_vl_01.py diff --git a/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py similarity index 100% rename from model_executor/models/mistral3.py rename to vllm/model_executor/models/mistral3.py diff --git a/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py similarity index 100% rename from model_executor/models/mixtral.py rename to vllm/model_executor/models/mixtral.py diff --git a/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py similarity index 100% rename from model_executor/models/mixtral_quant.py rename to vllm/model_executor/models/mixtral_quant.py diff --git a/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py similarity index 100% rename from model_executor/models/mllama.py rename to vllm/model_executor/models/mllama.py diff --git a/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py similarity index 100% rename from model_executor/models/mllama4.py rename to vllm/model_executor/models/mllama4.py diff --git a/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py similarity index 100% rename from model_executor/models/mlp_speculator.py rename to vllm/model_executor/models/mlp_speculator.py diff --git a/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py similarity index 100% rename from model_executor/models/modernbert.py rename to vllm/model_executor/models/modernbert.py diff --git a/model_executor/models/module_mapping.py b/vllm/model_executor/models/module_mapping.py similarity index 100% rename from model_executor/models/module_mapping.py rename to vllm/model_executor/models/module_mapping.py diff --git a/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py similarity index 100% rename from model_executor/models/molmo.py rename to vllm/model_executor/models/molmo.py diff --git a/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py similarity index 100% rename from model_executor/models/moonvit.py rename to vllm/model_executor/models/moonvit.py diff --git a/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py similarity index 100% rename from model_executor/models/mpt.py rename to vllm/model_executor/models/mpt.py diff --git a/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py similarity index 100% rename from model_executor/models/nemotron.py rename to vllm/model_executor/models/nemotron.py diff --git a/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py similarity index 100% rename from model_executor/models/nemotron_h.py rename to vllm/model_executor/models/nemotron_h.py diff --git a/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py similarity index 100% rename from model_executor/models/nemotron_nas.py rename to vllm/model_executor/models/nemotron_nas.py diff --git a/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py similarity index 100% rename from model_executor/models/nvlm_d.py rename to vllm/model_executor/models/nvlm_d.py diff --git a/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py similarity index 100% rename from model_executor/models/olmo.py rename to vllm/model_executor/models/olmo.py diff --git a/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py similarity index 100% rename from model_executor/models/olmo2.py rename to vllm/model_executor/models/olmo2.py diff --git a/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py similarity index 100% rename from model_executor/models/olmoe.py rename to vllm/model_executor/models/olmoe.py diff --git a/model_executor/models/opt.py b/vllm/model_executor/models/opt.py similarity index 100% rename from model_executor/models/opt.py rename to vllm/model_executor/models/opt.py diff --git a/model_executor/models/orion.py b/vllm/model_executor/models/orion.py similarity index 100% rename from model_executor/models/orion.py rename to vllm/model_executor/models/orion.py diff --git a/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py similarity index 100% rename from model_executor/models/ovis.py rename to vllm/model_executor/models/ovis.py diff --git a/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py similarity index 100% rename from model_executor/models/paligemma.py rename to vllm/model_executor/models/paligemma.py diff --git a/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py similarity index 100% rename from model_executor/models/persimmon.py rename to vllm/model_executor/models/persimmon.py diff --git a/model_executor/models/phi.py b/vllm/model_executor/models/phi.py similarity index 100% rename from model_executor/models/phi.py rename to vllm/model_executor/models/phi.py diff --git a/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py similarity index 100% rename from model_executor/models/phi3.py rename to vllm/model_executor/models/phi3.py diff --git a/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py similarity index 100% rename from model_executor/models/phi3_small.py rename to vllm/model_executor/models/phi3_small.py diff --git a/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py similarity index 100% rename from model_executor/models/phi3v.py rename to vllm/model_executor/models/phi3v.py diff --git a/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py similarity index 100% rename from model_executor/models/phi4mm.py rename to vllm/model_executor/models/phi4mm.py diff --git a/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py similarity index 100% rename from model_executor/models/phi4mm_audio.py rename to vllm/model_executor/models/phi4mm_audio.py diff --git a/model_executor/models/phi4mm_utils.py b/vllm/model_executor/models/phi4mm_utils.py similarity index 100% rename from model_executor/models/phi4mm_utils.py rename to vllm/model_executor/models/phi4mm_utils.py diff --git a/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py similarity index 100% rename from model_executor/models/phimoe.py rename to vllm/model_executor/models/phimoe.py diff --git a/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py similarity index 100% rename from model_executor/models/pixtral.py rename to vllm/model_executor/models/pixtral.py diff --git a/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py similarity index 100% rename from model_executor/models/plamo2.py rename to vllm/model_executor/models/plamo2.py diff --git a/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py similarity index 100% rename from model_executor/models/prithvi_geospatial_mae.py rename to vllm/model_executor/models/prithvi_geospatial_mae.py diff --git a/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py similarity index 100% rename from model_executor/models/qwen.py rename to vllm/model_executor/models/qwen.py diff --git a/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py similarity index 100% rename from model_executor/models/qwen2.py rename to vllm/model_executor/models/qwen2.py diff --git a/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py similarity index 100% rename from model_executor/models/qwen2_5_omni_thinker.py rename to vllm/model_executor/models/qwen2_5_omni_thinker.py diff --git a/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py similarity index 100% rename from model_executor/models/qwen2_5_vl.py rename to vllm/model_executor/models/qwen2_5_vl.py diff --git a/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py similarity index 100% rename from model_executor/models/qwen2_audio.py rename to vllm/model_executor/models/qwen2_audio.py diff --git a/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py similarity index 100% rename from model_executor/models/qwen2_moe.py rename to vllm/model_executor/models/qwen2_moe.py diff --git a/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py similarity index 100% rename from model_executor/models/qwen2_rm.py rename to vllm/model_executor/models/qwen2_rm.py diff --git a/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py similarity index 100% rename from model_executor/models/qwen2_vl.py rename to vllm/model_executor/models/qwen2_vl.py diff --git a/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py similarity index 100% rename from model_executor/models/qwen3.py rename to vllm/model_executor/models/qwen3.py diff --git a/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py similarity index 100% rename from model_executor/models/qwen3_moe.py rename to vllm/model_executor/models/qwen3_moe.py diff --git a/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py similarity index 100% rename from model_executor/models/qwen_vl.py rename to vllm/model_executor/models/qwen_vl.py diff --git a/model_executor/models/registry.py b/vllm/model_executor/models/registry.py similarity index 99% rename from model_executor/models/registry.py rename to vllm/model_executor/models/registry.py index 6703c35..d8152fc 100644 --- a/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -61,6 +61,7 @@ _TEXT_GENERATION_MODELS = { "Gemma3ForCausalLM": ("gemma3", "Gemma3ForCausalLM"), "GlmForCausalLM": ("glm", "GlmForCausalLM"), "Glm4ForCausalLM": ("glm4", "Glm4ForCausalLM"), + "GptOssForCausalLM": ("gpt_oss", "GptOssForCausalLM"), "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"), "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"), "GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"), diff --git a/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py similarity index 100% rename from model_executor/models/roberta.py rename to vllm/model_executor/models/roberta.py diff --git a/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py similarity index 100% rename from model_executor/models/siglip.py rename to vllm/model_executor/models/siglip.py diff --git a/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py similarity index 100% rename from model_executor/models/skyworkr1v.py rename to vllm/model_executor/models/skyworkr1v.py diff --git a/model_executor/models/smolvlm.py b/vllm/model_executor/models/smolvlm.py similarity index 100% rename from model_executor/models/smolvlm.py rename to vllm/model_executor/models/smolvlm.py diff --git a/model_executor/models/solar.py b/vllm/model_executor/models/solar.py similarity index 100% rename from model_executor/models/solar.py rename to vllm/model_executor/models/solar.py diff --git a/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py similarity index 100% rename from model_executor/models/stablelm.py rename to vllm/model_executor/models/stablelm.py diff --git a/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py similarity index 100% rename from model_executor/models/starcoder2.py rename to vllm/model_executor/models/starcoder2.py diff --git a/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py similarity index 100% rename from model_executor/models/tarsier.py rename to vllm/model_executor/models/tarsier.py diff --git a/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py similarity index 100% rename from model_executor/models/telechat2.py rename to vllm/model_executor/models/telechat2.py diff --git a/model_executor/models/teleflm.py b/vllm/model_executor/models/teleflm.py similarity index 100% rename from model_executor/models/teleflm.py rename to vllm/model_executor/models/teleflm.py diff --git a/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py similarity index 100% rename from model_executor/models/transformers.py rename to vllm/model_executor/models/transformers.py diff --git a/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py similarity index 100% rename from model_executor/models/ultravox.py rename to vllm/model_executor/models/ultravox.py diff --git a/model_executor/models/utils.py b/vllm/model_executor/models/utils.py similarity index 100% rename from model_executor/models/utils.py rename to vllm/model_executor/models/utils.py diff --git a/model_executor/models/vision.py b/vllm/model_executor/models/vision.py similarity index 100% rename from model_executor/models/vision.py rename to vllm/model_executor/models/vision.py diff --git a/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py similarity index 100% rename from model_executor/models/whisper.py rename to vllm/model_executor/models/whisper.py diff --git a/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py similarity index 100% rename from model_executor/models/zamba2.py rename to vllm/model_executor/models/zamba2.py diff --git a/model_executor/parameter.py b/vllm/model_executor/parameter.py similarity index 100% rename from model_executor/parameter.py rename to vllm/model_executor/parameter.py diff --git a/model_executor/pooling_metadata.py b/vllm/model_executor/pooling_metadata.py similarity index 100% rename from model_executor/pooling_metadata.py rename to vllm/model_executor/pooling_metadata.py diff --git a/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py similarity index 100% rename from model_executor/sampling_metadata.py rename to vllm/model_executor/sampling_metadata.py diff --git a/model_executor/utils.py b/vllm/model_executor/utils.py similarity index 100% rename from model_executor/utils.py rename to vllm/model_executor/utils.py diff --git a/multimodal/__init__.py b/vllm/multimodal/__init__.py similarity index 100% rename from multimodal/__init__.py rename to vllm/multimodal/__init__.py diff --git a/multimodal/audio.py b/vllm/multimodal/audio.py similarity index 100% rename from multimodal/audio.py rename to vllm/multimodal/audio.py diff --git a/multimodal/base.py b/vllm/multimodal/base.py similarity index 100% rename from multimodal/base.py rename to vllm/multimodal/base.py diff --git a/multimodal/hasher.py b/vllm/multimodal/hasher.py similarity index 100% rename from multimodal/hasher.py rename to vllm/multimodal/hasher.py diff --git a/multimodal/image.py b/vllm/multimodal/image.py similarity index 100% rename from multimodal/image.py rename to vllm/multimodal/image.py diff --git a/multimodal/inputs.py b/vllm/multimodal/inputs.py similarity index 100% rename from multimodal/inputs.py rename to vllm/multimodal/inputs.py diff --git a/multimodal/parse.py b/vllm/multimodal/parse.py similarity index 100% rename from multimodal/parse.py rename to vllm/multimodal/parse.py diff --git a/multimodal/processing.py b/vllm/multimodal/processing.py similarity index 100% rename from multimodal/processing.py rename to vllm/multimodal/processing.py diff --git a/multimodal/profiling.py b/vllm/multimodal/profiling.py similarity index 100% rename from multimodal/profiling.py rename to vllm/multimodal/profiling.py diff --git a/multimodal/registry.py b/vllm/multimodal/registry.py similarity index 100% rename from multimodal/registry.py rename to vllm/multimodal/registry.py diff --git a/multimodal/utils.py b/vllm/multimodal/utils.py similarity index 100% rename from multimodal/utils.py rename to vllm/multimodal/utils.py diff --git a/multimodal/video.py b/vllm/multimodal/video.py similarity index 100% rename from multimodal/video.py rename to vllm/multimodal/video.py diff --git a/outputs.py b/vllm/outputs.py similarity index 100% rename from outputs.py rename to vllm/outputs.py diff --git a/platforms/__init__.py b/vllm/platforms/__init__.py similarity index 100% rename from platforms/__init__.py rename to vllm/platforms/__init__.py diff --git a/platforms/cpu.py b/vllm/platforms/cpu.py similarity index 100% rename from platforms/cpu.py rename to vllm/platforms/cpu.py diff --git a/platforms/cuda.py b/vllm/platforms/cuda.py similarity index 100% rename from platforms/cuda.py rename to vllm/platforms/cuda.py diff --git a/platforms/hpu.py b/vllm/platforms/hpu.py similarity index 100% rename from platforms/hpu.py rename to vllm/platforms/hpu.py diff --git a/platforms/interface.py b/vllm/platforms/interface.py similarity index 100% rename from platforms/interface.py rename to vllm/platforms/interface.py diff --git a/platforms/neuron.py b/vllm/platforms/neuron.py similarity index 100% rename from platforms/neuron.py rename to vllm/platforms/neuron.py diff --git a/platforms/rocm.py b/vllm/platforms/rocm.py similarity index 98% rename from platforms/rocm.py rename to vllm/platforms/rocm.py index a929366..da94f00 100644 --- a/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -126,7 +126,8 @@ def use_rocm_custom_paged_attention( max_seq_len: int, sliding_window: int, kv_cache_dtype: str, - alibi_slopes: Optional[torch.Tensor] = None) -> bool: + alibi_slopes: Optional[torch.Tensor] = None, + sinks: Optional[torch.Tensor] = None) -> bool: GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName ON_GFX9 = any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942", "gfx950"]) @@ -143,7 +144,7 @@ def use_rocm_custom_paged_attention( and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768 and (envs.VLLM_ROCM_CUSTOM_PAGED_ATTN) and not (envs.VLLM_ROCM_USE_AITER_PAGED_ATTN - and envs.VLLM_ROCM_USE_AITER)) + and envs.VLLM_ROCM_USE_AITER) and sinks is None) else: return (ON_GFX11_GFX12 and (not envs.VLLM_USE_V1 or sliding_window == 0 @@ -153,7 +154,7 @@ def use_rocm_custom_paged_attention( and (gqa_ratio >= 3 and gqa_ratio <= 16) and max_seq_len <= 32768 and alibi_slopes is None and kv_cache_dtype == "auto" - and envs.VLLM_ROCM_CUSTOM_PAGED_ATTN) + and envs.VLLM_ROCM_CUSTOM_PAGED_ATTN and sinks is None) class RocmPlatform(Platform): diff --git a/platforms/tpu.py b/vllm/platforms/tpu.py similarity index 100% rename from platforms/tpu.py rename to vllm/platforms/tpu.py diff --git a/platforms/xpu.py b/vllm/platforms/xpu.py similarity index 100% rename from platforms/xpu.py rename to vllm/platforms/xpu.py diff --git a/plugins/__init__.py b/vllm/plugins/__init__.py similarity index 100% rename from plugins/__init__.py rename to vllm/plugins/__init__.py diff --git a/plugins/lora_resolvers/README.md b/vllm/plugins/lora_resolvers/README.md similarity index 100% rename from plugins/lora_resolvers/README.md rename to vllm/plugins/lora_resolvers/README.md diff --git a/plugins/lora_resolvers/__init__.py b/vllm/plugins/lora_resolvers/__init__.py similarity index 100% rename from plugins/lora_resolvers/__init__.py rename to vllm/plugins/lora_resolvers/__init__.py diff --git a/plugins/lora_resolvers/filesystem_resolver.py b/vllm/plugins/lora_resolvers/filesystem_resolver.py similarity index 100% rename from plugins/lora_resolvers/filesystem_resolver.py rename to vllm/plugins/lora_resolvers/filesystem_resolver.py diff --git a/pooling_params.py b/vllm/pooling_params.py similarity index 100% rename from pooling_params.py rename to vllm/pooling_params.py diff --git a/profiler/__init__.py b/vllm/profiler/__init__.py similarity index 100% rename from profiler/__init__.py rename to vllm/profiler/__init__.py diff --git a/profiler/layerwise_profile.py b/vllm/profiler/layerwise_profile.py similarity index 100% rename from profiler/layerwise_profile.py rename to vllm/profiler/layerwise_profile.py diff --git a/profiler/utils.py b/vllm/profiler/utils.py similarity index 100% rename from profiler/utils.py rename to vllm/profiler/utils.py diff --git a/prompt_adapter/__init__.py b/vllm/prompt_adapter/__init__.py similarity index 100% rename from prompt_adapter/__init__.py rename to vllm/prompt_adapter/__init__.py diff --git a/prompt_adapter/layers.py b/vllm/prompt_adapter/layers.py similarity index 100% rename from prompt_adapter/layers.py rename to vllm/prompt_adapter/layers.py diff --git a/prompt_adapter/models.py b/vllm/prompt_adapter/models.py similarity index 100% rename from prompt_adapter/models.py rename to vllm/prompt_adapter/models.py diff --git a/prompt_adapter/request.py b/vllm/prompt_adapter/request.py similarity index 100% rename from prompt_adapter/request.py rename to vllm/prompt_adapter/request.py diff --git a/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py similarity index 100% rename from prompt_adapter/utils.py rename to vllm/prompt_adapter/utils.py diff --git a/prompt_adapter/worker_manager.py b/vllm/prompt_adapter/worker_manager.py similarity index 100% rename from prompt_adapter/worker_manager.py rename to vllm/prompt_adapter/worker_manager.py diff --git a/py.typed b/vllm/py.typed similarity index 100% rename from py.typed rename to vllm/py.typed diff --git a/reasoning/__init__.py b/vllm/reasoning/__init__.py similarity index 100% rename from reasoning/__init__.py rename to vllm/reasoning/__init__.py diff --git a/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py similarity index 100% rename from reasoning/abs_reasoning_parsers.py rename to vllm/reasoning/abs_reasoning_parsers.py diff --git a/reasoning/deepseek_r1_reasoning_parser.py b/vllm/reasoning/deepseek_r1_reasoning_parser.py similarity index 100% rename from reasoning/deepseek_r1_reasoning_parser.py rename to vllm/reasoning/deepseek_r1_reasoning_parser.py diff --git a/reasoning/granite_reasoning_parser.py b/vllm/reasoning/granite_reasoning_parser.py similarity index 100% rename from reasoning/granite_reasoning_parser.py rename to vllm/reasoning/granite_reasoning_parser.py diff --git a/reasoning/qwen3_reasoning_parser.py b/vllm/reasoning/qwen3_reasoning_parser.py similarity index 100% rename from reasoning/qwen3_reasoning_parser.py rename to vllm/reasoning/qwen3_reasoning_parser.py diff --git a/sampling_params.py b/vllm/sampling_params.py similarity index 100% rename from sampling_params.py rename to vllm/sampling_params.py diff --git a/scalar_type.py b/vllm/scalar_type.py similarity index 100% rename from scalar_type.py rename to vllm/scalar_type.py diff --git a/scripts.py b/vllm/scripts.py similarity index 100% rename from scripts.py rename to vllm/scripts.py diff --git a/sequence.py b/vllm/sequence.py similarity index 100% rename from sequence.py rename to vllm/sequence.py diff --git a/spec_decode/__init__.py b/vllm/spec_decode/__init__.py similarity index 100% rename from spec_decode/__init__.py rename to vllm/spec_decode/__init__.py diff --git a/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py similarity index 100% rename from spec_decode/batch_expansion.py rename to vllm/spec_decode/batch_expansion.py diff --git a/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py similarity index 100% rename from spec_decode/draft_model_runner.py rename to vllm/spec_decode/draft_model_runner.py diff --git a/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py similarity index 100% rename from spec_decode/interfaces.py rename to vllm/spec_decode/interfaces.py diff --git a/spec_decode/medusa_worker.py b/vllm/spec_decode/medusa_worker.py similarity index 100% rename from spec_decode/medusa_worker.py rename to vllm/spec_decode/medusa_worker.py diff --git a/spec_decode/metrics.py b/vllm/spec_decode/metrics.py similarity index 100% rename from spec_decode/metrics.py rename to vllm/spec_decode/metrics.py diff --git a/spec_decode/mlp_speculator_worker.py b/vllm/spec_decode/mlp_speculator_worker.py similarity index 100% rename from spec_decode/mlp_speculator_worker.py rename to vllm/spec_decode/mlp_speculator_worker.py diff --git a/spec_decode/mqa_scorer.py b/vllm/spec_decode/mqa_scorer.py similarity index 100% rename from spec_decode/mqa_scorer.py rename to vllm/spec_decode/mqa_scorer.py diff --git a/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py similarity index 100% rename from spec_decode/multi_step_worker.py rename to vllm/spec_decode/multi_step_worker.py diff --git a/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py similarity index 100% rename from spec_decode/ngram_worker.py rename to vllm/spec_decode/ngram_worker.py diff --git a/spec_decode/proposer_worker_base.py b/vllm/spec_decode/proposer_worker_base.py similarity index 100% rename from spec_decode/proposer_worker_base.py rename to vllm/spec_decode/proposer_worker_base.py diff --git a/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py similarity index 100% rename from spec_decode/smaller_tp_proposer_worker.py rename to vllm/spec_decode/smaller_tp_proposer_worker.py diff --git a/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py similarity index 100% rename from spec_decode/spec_decode_worker.py rename to vllm/spec_decode/spec_decode_worker.py diff --git a/spec_decode/target_model_runner.py b/vllm/spec_decode/target_model_runner.py similarity index 100% rename from spec_decode/target_model_runner.py rename to vllm/spec_decode/target_model_runner.py diff --git a/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py similarity index 100% rename from spec_decode/top1_proposer.py rename to vllm/spec_decode/top1_proposer.py diff --git a/spec_decode/util.py b/vllm/spec_decode/util.py similarity index 100% rename from spec_decode/util.py rename to vllm/spec_decode/util.py diff --git a/test_utils.py b/vllm/test_utils.py similarity index 100% rename from test_utils.py rename to vllm/test_utils.py diff --git a/third_party/__init__.py b/vllm/third_party/__init__.py similarity index 100% rename from third_party/__init__.py rename to vllm/third_party/__init__.py diff --git a/third_party/pymcml.py b/vllm/third_party/pymcml.py similarity index 100% rename from third_party/pymcml.py rename to vllm/third_party/pymcml.py diff --git a/third_party/pynvml.py b/vllm/third_party/pynvml.py similarity index 100% rename from third_party/pynvml.py rename to vllm/third_party/pynvml.py diff --git a/tracing.py b/vllm/tracing.py similarity index 100% rename from tracing.py rename to vllm/tracing.py diff --git a/transformers_utils/__init__.py b/vllm/transformers_utils/__init__.py similarity index 100% rename from transformers_utils/__init__.py rename to vllm/transformers_utils/__init__.py diff --git a/transformers_utils/chat_templates/__init__.py b/vllm/transformers_utils/chat_templates/__init__.py similarity index 100% rename from transformers_utils/chat_templates/__init__.py rename to vllm/transformers_utils/chat_templates/__init__.py diff --git a/transformers_utils/chat_templates/registry.py b/vllm/transformers_utils/chat_templates/registry.py similarity index 100% rename from transformers_utils/chat_templates/registry.py rename to vllm/transformers_utils/chat_templates/registry.py diff --git a/transformers_utils/chat_templates/template_basic.jinja b/vllm/transformers_utils/chat_templates/template_basic.jinja similarity index 100% rename from transformers_utils/chat_templates/template_basic.jinja rename to vllm/transformers_utils/chat_templates/template_basic.jinja diff --git a/transformers_utils/chat_templates/template_blip2.jinja b/vllm/transformers_utils/chat_templates/template_blip2.jinja similarity index 100% rename from transformers_utils/chat_templates/template_blip2.jinja rename to vllm/transformers_utils/chat_templates/template_blip2.jinja diff --git a/transformers_utils/chat_templates/template_chatml.jinja b/vllm/transformers_utils/chat_templates/template_chatml.jinja similarity index 100% rename from transformers_utils/chat_templates/template_chatml.jinja rename to vllm/transformers_utils/chat_templates/template_chatml.jinja diff --git a/transformers_utils/chat_templates/template_deepseek_vl2.jinja b/vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja similarity index 100% rename from transformers_utils/chat_templates/template_deepseek_vl2.jinja rename to vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja diff --git a/transformers_utils/chat_templates/template_fuyu.jinja b/vllm/transformers_utils/chat_templates/template_fuyu.jinja similarity index 100% rename from transformers_utils/chat_templates/template_fuyu.jinja rename to vllm/transformers_utils/chat_templates/template_fuyu.jinja diff --git a/transformers_utils/config.py b/vllm/transformers_utils/config.py similarity index 100% rename from transformers_utils/config.py rename to vllm/transformers_utils/config.py diff --git a/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py similarity index 100% rename from transformers_utils/configs/__init__.py rename to vllm/transformers_utils/configs/__init__.py diff --git a/transformers_utils/configs/arctic.py b/vllm/transformers_utils/configs/arctic.py similarity index 100% rename from transformers_utils/configs/arctic.py rename to vllm/transformers_utils/configs/arctic.py diff --git a/transformers_utils/configs/chatglm.py b/vllm/transformers_utils/configs/chatglm.py similarity index 100% rename from transformers_utils/configs/chatglm.py rename to vllm/transformers_utils/configs/chatglm.py diff --git a/transformers_utils/configs/cohere2.py b/vllm/transformers_utils/configs/cohere2.py similarity index 100% rename from transformers_utils/configs/cohere2.py rename to vllm/transformers_utils/configs/cohere2.py diff --git a/transformers_utils/configs/dbrx.py b/vllm/transformers_utils/configs/dbrx.py similarity index 100% rename from transformers_utils/configs/dbrx.py rename to vllm/transformers_utils/configs/dbrx.py diff --git a/transformers_utils/configs/deepseek_vl2.py b/vllm/transformers_utils/configs/deepseek_vl2.py similarity index 100% rename from transformers_utils/configs/deepseek_vl2.py rename to vllm/transformers_utils/configs/deepseek_vl2.py diff --git a/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py similarity index 100% rename from transformers_utils/configs/eagle.py rename to vllm/transformers_utils/configs/eagle.py diff --git a/transformers_utils/configs/exaone.py b/vllm/transformers_utils/configs/exaone.py similarity index 100% rename from transformers_utils/configs/exaone.py rename to vllm/transformers_utils/configs/exaone.py diff --git a/transformers_utils/configs/falcon.py b/vllm/transformers_utils/configs/falcon.py similarity index 100% rename from transformers_utils/configs/falcon.py rename to vllm/transformers_utils/configs/falcon.py diff --git a/transformers_utils/configs/h2ovl.py b/vllm/transformers_utils/configs/h2ovl.py similarity index 100% rename from transformers_utils/configs/h2ovl.py rename to vllm/transformers_utils/configs/h2ovl.py diff --git a/transformers_utils/configs/internvl.py b/vllm/transformers_utils/configs/internvl.py similarity index 100% rename from transformers_utils/configs/internvl.py rename to vllm/transformers_utils/configs/internvl.py diff --git a/transformers_utils/configs/jais.py b/vllm/transformers_utils/configs/jais.py similarity index 100% rename from transformers_utils/configs/jais.py rename to vllm/transformers_utils/configs/jais.py diff --git a/transformers_utils/configs/kimi_vl.py b/vllm/transformers_utils/configs/kimi_vl.py similarity index 100% rename from transformers_utils/configs/kimi_vl.py rename to vllm/transformers_utils/configs/kimi_vl.py diff --git a/transformers_utils/configs/medusa.py b/vllm/transformers_utils/configs/medusa.py similarity index 100% rename from transformers_utils/configs/medusa.py rename to vllm/transformers_utils/configs/medusa.py diff --git a/transformers_utils/configs/minimax_text_01.py b/vllm/transformers_utils/configs/minimax_text_01.py similarity index 100% rename from transformers_utils/configs/minimax_text_01.py rename to vllm/transformers_utils/configs/minimax_text_01.py diff --git a/transformers_utils/configs/minimax_vl_01.py b/vllm/transformers_utils/configs/minimax_vl_01.py similarity index 100% rename from transformers_utils/configs/minimax_vl_01.py rename to vllm/transformers_utils/configs/minimax_vl_01.py diff --git a/transformers_utils/configs/mllama.py b/vllm/transformers_utils/configs/mllama.py similarity index 100% rename from transformers_utils/configs/mllama.py rename to vllm/transformers_utils/configs/mllama.py diff --git a/transformers_utils/configs/mlp_speculator.py b/vllm/transformers_utils/configs/mlp_speculator.py similarity index 100% rename from transformers_utils/configs/mlp_speculator.py rename to vllm/transformers_utils/configs/mlp_speculator.py diff --git a/transformers_utils/configs/moonvit.py b/vllm/transformers_utils/configs/moonvit.py similarity index 100% rename from transformers_utils/configs/moonvit.py rename to vllm/transformers_utils/configs/moonvit.py diff --git a/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py similarity index 100% rename from transformers_utils/configs/mpt.py rename to vllm/transformers_utils/configs/mpt.py diff --git a/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py similarity index 100% rename from transformers_utils/configs/nemotron.py rename to vllm/transformers_utils/configs/nemotron.py diff --git a/transformers_utils/configs/nemotron_h.py b/vllm/transformers_utils/configs/nemotron_h.py similarity index 100% rename from transformers_utils/configs/nemotron_h.py rename to vllm/transformers_utils/configs/nemotron_h.py diff --git a/transformers_utils/configs/nvlm_d.py b/vllm/transformers_utils/configs/nvlm_d.py similarity index 100% rename from transformers_utils/configs/nvlm_d.py rename to vllm/transformers_utils/configs/nvlm_d.py diff --git a/transformers_utils/configs/ovis.py b/vllm/transformers_utils/configs/ovis.py similarity index 99% rename from transformers_utils/configs/ovis.py rename to vllm/transformers_utils/configs/ovis.py index c2728f0..874aa1c 100644 --- a/transformers_utils/configs/ovis.py +++ b/vllm/transformers_utils/configs/ovis.py @@ -73,7 +73,7 @@ IMAGE_TOKEN = "" IMAGE_ATOM_ID = -300 IMAGE_INDICATOR_IDS = [-301, -302, -303, -304, -305] -AutoConfig.register("aimv2", AIMv2Config) +AutoConfig.register("aimv2", AIMv2Config, exist_ok=True) # ---------------------------------------------------------------------- diff --git a/transformers_utils/configs/skyworkr1v.py b/vllm/transformers_utils/configs/skyworkr1v.py similarity index 100% rename from transformers_utils/configs/skyworkr1v.py rename to vllm/transformers_utils/configs/skyworkr1v.py diff --git a/transformers_utils/configs/solar.py b/vllm/transformers_utils/configs/solar.py similarity index 100% rename from transformers_utils/configs/solar.py rename to vllm/transformers_utils/configs/solar.py diff --git a/transformers_utils/configs/telechat2.py b/vllm/transformers_utils/configs/telechat2.py similarity index 100% rename from transformers_utils/configs/telechat2.py rename to vllm/transformers_utils/configs/telechat2.py diff --git a/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py similarity index 100% rename from transformers_utils/configs/ultravox.py rename to vllm/transformers_utils/configs/ultravox.py diff --git a/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py similarity index 100% rename from transformers_utils/detokenizer.py rename to vllm/transformers_utils/detokenizer.py diff --git a/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py similarity index 100% rename from transformers_utils/detokenizer_utils.py rename to vllm/transformers_utils/detokenizer_utils.py diff --git a/transformers_utils/processor.py b/vllm/transformers_utils/processor.py similarity index 100% rename from transformers_utils/processor.py rename to vllm/transformers_utils/processor.py diff --git a/transformers_utils/processors/__init__.py b/vllm/transformers_utils/processors/__init__.py similarity index 100% rename from transformers_utils/processors/__init__.py rename to vllm/transformers_utils/processors/__init__.py diff --git a/transformers_utils/processors/deepseek_vl2.py b/vllm/transformers_utils/processors/deepseek_vl2.py similarity index 100% rename from transformers_utils/processors/deepseek_vl2.py rename to vllm/transformers_utils/processors/deepseek_vl2.py diff --git a/transformers_utils/processors/ovis.py b/vllm/transformers_utils/processors/ovis.py similarity index 100% rename from transformers_utils/processors/ovis.py rename to vllm/transformers_utils/processors/ovis.py diff --git a/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py similarity index 100% rename from transformers_utils/s3_utils.py rename to vllm/transformers_utils/s3_utils.py diff --git a/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py similarity index 100% rename from transformers_utils/tokenizer.py rename to vllm/transformers_utils/tokenizer.py diff --git a/transformers_utils/tokenizer_base.py b/vllm/transformers_utils/tokenizer_base.py similarity index 100% rename from transformers_utils/tokenizer_base.py rename to vllm/transformers_utils/tokenizer_base.py diff --git a/transformers_utils/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group.py similarity index 100% rename from transformers_utils/tokenizer_group.py rename to vllm/transformers_utils/tokenizer_group.py diff --git a/transformers_utils/tokenizers/__init__.py b/vllm/transformers_utils/tokenizers/__init__.py similarity index 100% rename from transformers_utils/tokenizers/__init__.py rename to vllm/transformers_utils/tokenizers/__init__.py diff --git a/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py similarity index 100% rename from transformers_utils/tokenizers/mistral.py rename to vllm/transformers_utils/tokenizers/mistral.py diff --git a/transformers_utils/utils.py b/vllm/transformers_utils/utils.py similarity index 100% rename from transformers_utils/utils.py rename to vllm/transformers_utils/utils.py diff --git a/triton_utils/__init__.py b/vllm/triton_utils/__init__.py similarity index 100% rename from triton_utils/__init__.py rename to vllm/triton_utils/__init__.py diff --git a/triton_utils/importing.py b/vllm/triton_utils/importing.py similarity index 100% rename from triton_utils/importing.py rename to vllm/triton_utils/importing.py diff --git a/usage/__init__.py b/vllm/usage/__init__.py similarity index 100% rename from usage/__init__.py rename to vllm/usage/__init__.py diff --git a/usage/usage_lib.py b/vllm/usage/usage_lib.py similarity index 100% rename from usage/usage_lib.py rename to vllm/usage/usage_lib.py diff --git a/utils.py b/vllm/utils.py similarity index 100% rename from utils.py rename to vllm/utils.py diff --git a/v1/__init__.py b/vllm/v1/__init__.py similarity index 100% rename from v1/__init__.py rename to vllm/v1/__init__.py diff --git a/v1/attention/__init__.py b/vllm/v1/attention/__init__.py similarity index 100% rename from v1/attention/__init__.py rename to vllm/v1/attention/__init__.py diff --git a/v1/attention/backends/__init__.py b/vllm/v1/attention/backends/__init__.py similarity index 100% rename from v1/attention/backends/__init__.py rename to vllm/v1/attention/backends/__init__.py diff --git a/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py similarity index 100% rename from v1/attention/backends/cpu_attn.py rename to vllm/v1/attention/backends/cpu_attn.py diff --git a/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py similarity index 100% rename from v1/attention/backends/flash_attn.py rename to vllm/v1/attention/backends/flash_attn.py diff --git a/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py similarity index 100% rename from v1/attention/backends/flashinfer.py rename to vllm/v1/attention/backends/flashinfer.py diff --git a/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py similarity index 100% rename from v1/attention/backends/flex_attention.py rename to vllm/v1/attention/backends/flex_attention.py diff --git a/v1/attention/backends/mla/__init__.py b/vllm/v1/attention/backends/mla/__init__.py similarity index 100% rename from v1/attention/backends/mla/__init__.py rename to vllm/v1/attention/backends/mla/__init__.py diff --git a/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py similarity index 100% rename from v1/attention/backends/mla/common.py rename to vllm/v1/attention/backends/mla/common.py diff --git a/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py similarity index 100% rename from v1/attention/backends/mla/cutlass_mla.py rename to vllm/v1/attention/backends/mla/cutlass_mla.py diff --git a/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py similarity index 100% rename from v1/attention/backends/mla/flashmla.py rename to vllm/v1/attention/backends/mla/flashmla.py diff --git a/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py similarity index 100% rename from v1/attention/backends/mla/rocm_aiter_mla.py rename to vllm/v1/attention/backends/mla/rocm_aiter_mla.py diff --git a/v1/attention/backends/mla/triton_mla.py b/vllm/v1/attention/backends/mla/triton_mla.py similarity index 100% rename from v1/attention/backends/mla/triton_mla.py rename to vllm/v1/attention/backends/mla/triton_mla.py diff --git a/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py similarity index 100% rename from v1/attention/backends/pallas.py rename to vllm/v1/attention/backends/pallas.py diff --git a/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py similarity index 95% rename from v1/attention/backends/triton_attn.py rename to vllm/v1/attention/backends/triton_attn.py index 5db592b..6a7c704 100644 --- a/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -90,6 +90,7 @@ class TritonAttentionImpl(AttentionImpl): attn_type: AttentionType = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[int] = None, use_irope: bool = False, + sinks: Optional[torch.Tensor] = None, ) -> None: if blocksparse_params is not None: raise ValueError( @@ -132,6 +133,13 @@ class TritonAttentionImpl(AttentionImpl): self.fp8_dtype = current_platform.fp8_dtype() self.force_prefill_decode_attn = \ envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION + + self.sinks = sinks + if sinks is not None: + assert sinks.shape[0] == num_heads, ( + "Sinks must have the same number of heads as the number of " + f"heads in the layer. Sinks shape: {sinks.shape}, " + f"num_heads: {num_heads}.") def forward( self, @@ -257,7 +265,8 @@ class TritonAttentionImpl(AttentionImpl): v_scale=layer._v_scale, alibi_slopes=self.alibi_slopes, sliding_window=self.sliding_window[0], - sm_scale=self.scale) + sm_scale=self.scale, + sinks=self.sinks) else: descale_shape = (cu_seqlens_q.shape[0] - 1, key.shape[1]) @@ -280,6 +289,7 @@ class TritonAttentionImpl(AttentionImpl): q_descale=None, # Not supported k_descale=layer._k_scale.expand(descale_shape), v_descale=layer._v_scale.expand(descale_shape), + sinks=self.sinks, ) return output diff --git a/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py similarity index 100% rename from v1/attention/backends/utils.py rename to vllm/v1/attention/backends/utils.py diff --git a/v1/core/__init__.py b/vllm/v1/core/__init__.py similarity index 100% rename from v1/core/__init__.py rename to vllm/v1/core/__init__.py diff --git a/v1/core/block_pool.py b/vllm/v1/core/block_pool.py similarity index 100% rename from v1/core/block_pool.py rename to vllm/v1/core/block_pool.py diff --git a/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py similarity index 100% rename from v1/core/encoder_cache_manager.py rename to vllm/v1/core/encoder_cache_manager.py diff --git a/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py similarity index 100% rename from v1/core/kv_cache_coordinator.py rename to vllm/v1/core/kv_cache_coordinator.py diff --git a/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py similarity index 100% rename from v1/core/kv_cache_manager.py rename to vllm/v1/core/kv_cache_manager.py diff --git a/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py similarity index 100% rename from v1/core/kv_cache_utils.py rename to vllm/v1/core/kv_cache_utils.py diff --git a/v1/core/sched/__init__.py b/vllm/v1/core/sched/__init__.py similarity index 100% rename from v1/core/sched/__init__.py rename to vllm/v1/core/sched/__init__.py diff --git a/v1/core/sched/interface.py b/vllm/v1/core/sched/interface.py similarity index 100% rename from v1/core/sched/interface.py rename to vllm/v1/core/sched/interface.py diff --git a/v1/core/sched/output.py b/vllm/v1/core/sched/output.py similarity index 100% rename from v1/core/sched/output.py rename to vllm/v1/core/sched/output.py diff --git a/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py similarity index 100% rename from v1/core/sched/scheduler.py rename to vllm/v1/core/sched/scheduler.py diff --git a/v1/core/sched/utils.py b/vllm/v1/core/sched/utils.py similarity index 100% rename from v1/core/sched/utils.py rename to vllm/v1/core/sched/utils.py diff --git a/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py similarity index 100% rename from v1/core/single_type_kv_cache_manager.py rename to vllm/v1/core/single_type_kv_cache_manager.py diff --git a/v1/engine/__init__.py b/vllm/v1/engine/__init__.py similarity index 100% rename from v1/engine/__init__.py rename to vllm/v1/engine/__init__.py diff --git a/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py similarity index 100% rename from v1/engine/async_llm.py rename to vllm/v1/engine/async_llm.py diff --git a/v1/engine/coordinator.py b/vllm/v1/engine/coordinator.py similarity index 100% rename from v1/engine/coordinator.py rename to vllm/v1/engine/coordinator.py diff --git a/v1/engine/core.py b/vllm/v1/engine/core.py similarity index 100% rename from v1/engine/core.py rename to vllm/v1/engine/core.py diff --git a/v1/engine/core_client.py b/vllm/v1/engine/core_client.py similarity index 100% rename from v1/engine/core_client.py rename to vllm/v1/engine/core_client.py diff --git a/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py similarity index 100% rename from v1/engine/detokenizer.py rename to vllm/v1/engine/detokenizer.py diff --git a/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py similarity index 100% rename from v1/engine/exceptions.py rename to vllm/v1/engine/exceptions.py diff --git a/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py similarity index 100% rename from v1/engine/llm_engine.py rename to vllm/v1/engine/llm_engine.py diff --git a/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py similarity index 100% rename from v1/engine/logprobs.py rename to vllm/v1/engine/logprobs.py diff --git a/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py similarity index 100% rename from v1/engine/mm_input_cache.py rename to vllm/v1/engine/mm_input_cache.py diff --git a/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py similarity index 100% rename from v1/engine/output_processor.py rename to vllm/v1/engine/output_processor.py diff --git a/v1/engine/parallel_sampling.py b/vllm/v1/engine/parallel_sampling.py similarity index 100% rename from v1/engine/parallel_sampling.py rename to vllm/v1/engine/parallel_sampling.py diff --git a/v1/engine/processor.py b/vllm/v1/engine/processor.py similarity index 100% rename from v1/engine/processor.py rename to vllm/v1/engine/processor.py diff --git a/v1/executor/__init__.py b/vllm/v1/executor/__init__.py similarity index 100% rename from v1/executor/__init__.py rename to vllm/v1/executor/__init__.py diff --git a/v1/executor/abstract.py b/vllm/v1/executor/abstract.py similarity index 100% rename from v1/executor/abstract.py rename to vllm/v1/executor/abstract.py diff --git a/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py similarity index 100% rename from v1/executor/multiproc_executor.py rename to vllm/v1/executor/multiproc_executor.py diff --git a/v1/executor/ray_distributed_executor.py b/vllm/v1/executor/ray_distributed_executor.py similarity index 100% rename from v1/executor/ray_distributed_executor.py rename to vllm/v1/executor/ray_distributed_executor.py diff --git a/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py similarity index 100% rename from v1/kv_cache_interface.py rename to vllm/v1/kv_cache_interface.py diff --git a/v1/metrics/__init__.py b/vllm/v1/metrics/__init__.py similarity index 100% rename from v1/metrics/__init__.py rename to vllm/v1/metrics/__init__.py diff --git a/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py similarity index 100% rename from v1/metrics/loggers.py rename to vllm/v1/metrics/loggers.py diff --git a/v1/metrics/prometheus.py b/vllm/v1/metrics/prometheus.py similarity index 100% rename from v1/metrics/prometheus.py rename to vllm/v1/metrics/prometheus.py diff --git a/v1/metrics/ray_wrappers.py b/vllm/v1/metrics/ray_wrappers.py similarity index 100% rename from v1/metrics/ray_wrappers.py rename to vllm/v1/metrics/ray_wrappers.py diff --git a/v1/metrics/reader.py b/vllm/v1/metrics/reader.py similarity index 100% rename from v1/metrics/reader.py rename to vllm/v1/metrics/reader.py diff --git a/v1/metrics/stats.py b/vllm/v1/metrics/stats.py similarity index 100% rename from v1/metrics/stats.py rename to vllm/v1/metrics/stats.py diff --git a/v1/outputs.py b/vllm/v1/outputs.py similarity index 100% rename from v1/outputs.py rename to vllm/v1/outputs.py diff --git a/v1/request.py b/vllm/v1/request.py similarity index 100% rename from v1/request.py rename to vllm/v1/request.py diff --git a/v1/sample/__init__.py b/vllm/v1/sample/__init__.py similarity index 100% rename from v1/sample/__init__.py rename to vllm/v1/sample/__init__.py diff --git a/v1/sample/metadata.py b/vllm/v1/sample/metadata.py similarity index 100% rename from v1/sample/metadata.py rename to vllm/v1/sample/metadata.py diff --git a/v1/sample/ops/__init__.py b/vllm/v1/sample/ops/__init__.py similarity index 100% rename from v1/sample/ops/__init__.py rename to vllm/v1/sample/ops/__init__.py diff --git a/v1/sample/ops/bad_words.py b/vllm/v1/sample/ops/bad_words.py similarity index 100% rename from v1/sample/ops/bad_words.py rename to vllm/v1/sample/ops/bad_words.py diff --git a/v1/sample/ops/penalties.py b/vllm/v1/sample/ops/penalties.py similarity index 100% rename from v1/sample/ops/penalties.py rename to vllm/v1/sample/ops/penalties.py diff --git a/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py similarity index 100% rename from v1/sample/ops/topk_topp_sampler.py rename to vllm/v1/sample/ops/topk_topp_sampler.py diff --git a/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py similarity index 100% rename from v1/sample/rejection_sampler.py rename to vllm/v1/sample/rejection_sampler.py diff --git a/v1/sample/sampler.py b/vllm/v1/sample/sampler.py similarity index 100% rename from v1/sample/sampler.py rename to vllm/v1/sample/sampler.py diff --git a/v1/sample/tpu/__init__.py b/vllm/v1/sample/tpu/__init__.py similarity index 100% rename from v1/sample/tpu/__init__.py rename to vllm/v1/sample/tpu/__init__.py diff --git a/v1/sample/tpu/metadata.py b/vllm/v1/sample/tpu/metadata.py similarity index 100% rename from v1/sample/tpu/metadata.py rename to vllm/v1/sample/tpu/metadata.py diff --git a/v1/sample/tpu/sampler.py b/vllm/v1/sample/tpu/sampler.py similarity index 100% rename from v1/sample/tpu/sampler.py rename to vllm/v1/sample/tpu/sampler.py diff --git a/v1/serial_utils.py b/vllm/v1/serial_utils.py similarity index 100% rename from v1/serial_utils.py rename to vllm/v1/serial_utils.py diff --git a/v1/spec_decode/__init__.py b/vllm/v1/spec_decode/__init__.py similarity index 100% rename from v1/spec_decode/__init__.py rename to vllm/v1/spec_decode/__init__.py diff --git a/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py similarity index 100% rename from v1/spec_decode/eagle.py rename to vllm/v1/spec_decode/eagle.py diff --git a/v1/spec_decode/medusa.py b/vllm/v1/spec_decode/medusa.py similarity index 100% rename from v1/spec_decode/medusa.py rename to vllm/v1/spec_decode/medusa.py diff --git a/v1/spec_decode/metadata.py b/vllm/v1/spec_decode/metadata.py similarity index 100% rename from v1/spec_decode/metadata.py rename to vllm/v1/spec_decode/metadata.py diff --git a/v1/spec_decode/metrics.py b/vllm/v1/spec_decode/metrics.py similarity index 100% rename from v1/spec_decode/metrics.py rename to vllm/v1/spec_decode/metrics.py diff --git a/v1/spec_decode/ngram_proposer.py b/vllm/v1/spec_decode/ngram_proposer.py similarity index 100% rename from v1/spec_decode/ngram_proposer.py rename to vllm/v1/spec_decode/ngram_proposer.py diff --git a/v1/spec_decode/utils.py b/vllm/v1/spec_decode/utils.py similarity index 100% rename from v1/spec_decode/utils.py rename to vllm/v1/spec_decode/utils.py diff --git a/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py similarity index 100% rename from v1/structured_output/__init__.py rename to vllm/v1/structured_output/__init__.py diff --git a/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py similarity index 100% rename from v1/structured_output/backend_guidance.py rename to vllm/v1/structured_output/backend_guidance.py diff --git a/v1/structured_output/backend_types.py b/vllm/v1/structured_output/backend_types.py similarity index 100% rename from v1/structured_output/backend_types.py rename to vllm/v1/structured_output/backend_types.py diff --git a/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py similarity index 100% rename from v1/structured_output/backend_xgrammar.py rename to vllm/v1/structured_output/backend_xgrammar.py diff --git a/v1/structured_output/request.py b/vllm/v1/structured_output/request.py similarity index 100% rename from v1/structured_output/request.py rename to vllm/v1/structured_output/request.py diff --git a/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py similarity index 100% rename from v1/structured_output/utils.py rename to vllm/v1/structured_output/utils.py diff --git a/v1/utils.py b/vllm/v1/utils.py similarity index 100% rename from v1/utils.py rename to vllm/v1/utils.py diff --git a/v1/worker/__init__.py b/vllm/v1/worker/__init__.py similarity index 100% rename from v1/worker/__init__.py rename to vllm/v1/worker/__init__.py diff --git a/v1/worker/block_table.py b/vllm/v1/worker/block_table.py similarity index 100% rename from v1/worker/block_table.py rename to vllm/v1/worker/block_table.py diff --git a/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py similarity index 100% rename from v1/worker/cpu_model_runner.py rename to vllm/v1/worker/cpu_model_runner.py diff --git a/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py similarity index 100% rename from v1/worker/cpu_worker.py rename to vllm/v1/worker/cpu_worker.py diff --git a/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py similarity index 100% rename from v1/worker/gpu_input_batch.py rename to vllm/v1/worker/gpu_input_batch.py diff --git a/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py similarity index 100% rename from v1/worker/gpu_model_runner.py rename to vllm/v1/worker/gpu_model_runner.py diff --git a/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py similarity index 100% rename from v1/worker/gpu_worker.py rename to vllm/v1/worker/gpu_worker.py diff --git a/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py similarity index 100% rename from v1/worker/lora_model_runner_mixin.py rename to vllm/v1/worker/lora_model_runner_mixin.py diff --git a/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py similarity index 100% rename from v1/worker/tpu_model_runner.py rename to vllm/v1/worker/tpu_model_runner.py diff --git a/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py similarity index 100% rename from v1/worker/tpu_worker.py rename to vllm/v1/worker/tpu_worker.py diff --git a/v1/worker/utils.py b/vllm/v1/worker/utils.py similarity index 100% rename from v1/worker/utils.py rename to vllm/v1/worker/utils.py diff --git a/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py similarity index 100% rename from v1/worker/worker_base.py rename to vllm/v1/worker/worker_base.py diff --git a/version.py b/vllm/version.py similarity index 100% rename from version.py rename to vllm/version.py diff --git a/vllm_flash_attn/.gitkeep b/vllm/vllm_flash_attn/.gitkeep similarity index 100% rename from vllm_flash_attn/.gitkeep rename to vllm/vllm_flash_attn/.gitkeep diff --git a/worker/__init__.py b/vllm/worker/__init__.py similarity index 100% rename from worker/__init__.py rename to vllm/worker/__init__.py diff --git a/worker/cache_engine.py b/vllm/worker/cache_engine.py similarity index 100% rename from worker/cache_engine.py rename to vllm/worker/cache_engine.py diff --git a/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py similarity index 100% rename from worker/cpu_enc_dec_model_runner.py rename to vllm/worker/cpu_enc_dec_model_runner.py diff --git a/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py similarity index 100% rename from worker/cpu_model_runner.py rename to vllm/worker/cpu_model_runner.py diff --git a/worker/cpu_pooling_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py similarity index 100% rename from worker/cpu_pooling_model_runner.py rename to vllm/worker/cpu_pooling_model_runner.py diff --git a/worker/cpu_worker.py b/vllm/worker/cpu_worker.py similarity index 100% rename from worker/cpu_worker.py rename to vllm/worker/cpu_worker.py diff --git a/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py similarity index 100% rename from worker/enc_dec_model_runner.py rename to vllm/worker/enc_dec_model_runner.py diff --git a/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py similarity index 100% rename from worker/hpu_model_runner.py rename to vllm/worker/hpu_model_runner.py diff --git a/worker/hpu_worker.py b/vllm/worker/hpu_worker.py similarity index 100% rename from worker/hpu_worker.py rename to vllm/worker/hpu_worker.py diff --git a/worker/model_runner.py b/vllm/worker/model_runner.py similarity index 100% rename from worker/model_runner.py rename to vllm/worker/model_runner.py diff --git a/worker/model_runner_base.py b/vllm/worker/model_runner_base.py similarity index 100% rename from worker/model_runner_base.py rename to vllm/worker/model_runner_base.py diff --git a/worker/multi_step_hpu_worker.py b/vllm/worker/multi_step_hpu_worker.py similarity index 100% rename from worker/multi_step_hpu_worker.py rename to vllm/worker/multi_step_hpu_worker.py diff --git a/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py similarity index 100% rename from worker/multi_step_model_runner.py rename to vllm/worker/multi_step_model_runner.py diff --git a/worker/multi_step_neuron_model_runner.py b/vllm/worker/multi_step_neuron_model_runner.py similarity index 100% rename from worker/multi_step_neuron_model_runner.py rename to vllm/worker/multi_step_neuron_model_runner.py diff --git a/worker/multi_step_neuronx_distributed_model_runner.py b/vllm/worker/multi_step_neuronx_distributed_model_runner.py similarity index 100% rename from worker/multi_step_neuronx_distributed_model_runner.py rename to vllm/worker/multi_step_neuronx_distributed_model_runner.py diff --git a/worker/multi_step_tpu_worker.py b/vllm/worker/multi_step_tpu_worker.py similarity index 100% rename from worker/multi_step_tpu_worker.py rename to vllm/worker/multi_step_tpu_worker.py diff --git a/worker/multi_step_worker.py b/vllm/worker/multi_step_worker.py similarity index 100% rename from worker/multi_step_worker.py rename to vllm/worker/multi_step_worker.py diff --git a/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py similarity index 100% rename from worker/neuron_model_runner.py rename to vllm/worker/neuron_model_runner.py diff --git a/worker/neuron_worker.py b/vllm/worker/neuron_worker.py similarity index 100% rename from worker/neuron_worker.py rename to vllm/worker/neuron_worker.py diff --git a/worker/neuronx_distributed_model_runner.py b/vllm/worker/neuronx_distributed_model_runner.py similarity index 100% rename from worker/neuronx_distributed_model_runner.py rename to vllm/worker/neuronx_distributed_model_runner.py diff --git a/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py similarity index 100% rename from worker/pooling_model_runner.py rename to vllm/worker/pooling_model_runner.py diff --git a/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py similarity index 100% rename from worker/tpu_model_runner.py rename to vllm/worker/tpu_model_runner.py diff --git a/worker/tpu_worker.py b/vllm/worker/tpu_worker.py similarity index 100% rename from worker/tpu_worker.py rename to vllm/worker/tpu_worker.py diff --git a/worker/utils.py b/vllm/worker/utils.py similarity index 100% rename from worker/utils.py rename to vllm/worker/utils.py diff --git a/worker/worker.py b/vllm/worker/worker.py similarity index 100% rename from worker/worker.py rename to vllm/worker/worker.py diff --git a/worker/worker_base.py b/vllm/worker/worker_base.py similarity index 100% rename from worker/worker_base.py rename to vllm/worker/worker_base.py diff --git a/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py similarity index 100% rename from worker/xpu_model_runner.py rename to vllm/worker/xpu_model_runner.py diff --git a/worker/xpu_worker.py b/vllm/worker/xpu_worker.py similarity index 100% rename from worker/xpu_worker.py rename to vllm/worker/xpu_worker.py