forked from EngineX-Cambricon/enginex-mlu370-vllm
add qwen3
This commit is contained in:
227
vllm-v0.6.2/vllm.egg-info/PKG-INFO
Normal file
227
vllm-v0.6.2/vllm.egg-info/PKG-INFO
Normal file
@@ -0,0 +1,227 @@
|
||||
Metadata-Version: 2.2
|
||||
Name: vllm
|
||||
Version: 0.6.4.post1+mlu0.6.2.pt2.5
|
||||
Summary: A high-throughput and memory-efficient inference and serving engine for LLMs on MLU backend
|
||||
Home-page:
|
||||
Author: Cambricon vLLM Team
|
||||
License: Apache 2.0
|
||||
Project-URL: Homepage, https://github.com/vllm-project/vllm
|
||||
Project-URL: Documentation, https://vllm.readthedocs.io/en/latest/
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: 3.10
|
||||
Classifier: Programming Language :: Python :: 3.11
|
||||
Classifier: Programming Language :: Python :: 3.12
|
||||
Classifier: License :: OSI Approved :: Apache Software License
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: Intended Audience :: Information Technology
|
||||
Classifier: Intended Audience :: Science/Research
|
||||
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
||||
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
||||
Requires-Python: >=3.8
|
||||
Description-Content-Type: text/markdown
|
||||
License-File: LICENSE
|
||||
Requires-Dist: psutil
|
||||
Requires-Dist: sentencepiece
|
||||
Requires-Dist: numpy<2.0.0
|
||||
Requires-Dist: requests>=2.26.0
|
||||
Requires-Dist: tqdm
|
||||
Requires-Dist: py-cpuinfo
|
||||
Requires-Dist: transformers>=4.45.2
|
||||
Requires-Dist: tokenizers>=0.19.1
|
||||
Requires-Dist: protobuf
|
||||
Requires-Dist: fastapi<0.113.0,>=0.107.0; python_version < "3.9"
|
||||
Requires-Dist: fastapi!=0.113.*,!=0.114.0,>=0.107.0; python_version >= "3.9"
|
||||
Requires-Dist: aiohttp
|
||||
Requires-Dist: openai>=1.45.0
|
||||
Requires-Dist: uvicorn[standard]
|
||||
Requires-Dist: pydantic>=2.9
|
||||
Requires-Dist: pillow
|
||||
Requires-Dist: prometheus_client>=0.18.0
|
||||
Requires-Dist: prometheus-fastapi-instrumentator>=7.0.0
|
||||
Requires-Dist: tiktoken>=0.6.0
|
||||
Requires-Dist: lm-format-enforcer<0.11,>=0.10.9
|
||||
Requires-Dist: outlines<0.1,>=0.0.43
|
||||
Requires-Dist: typing_extensions>=4.10
|
||||
Requires-Dist: filelock>=3.10.4
|
||||
Requires-Dist: partial-json-parser
|
||||
Requires-Dist: pyzmq
|
||||
Requires-Dist: msgspec
|
||||
Requires-Dist: gguf==0.10.0
|
||||
Requires-Dist: importlib_metadata
|
||||
Requires-Dist: mistral_common[opencv]>=1.5.0
|
||||
Requires-Dist: pyyaml
|
||||
Requires-Dist: six>=1.16.0; python_version > "3.11"
|
||||
Requires-Dist: setuptools>=74.1.1; python_version > "3.11"
|
||||
Requires-Dist: einops
|
||||
Requires-Dist: compressed-tensors==0.8.0
|
||||
Requires-Dist: tensorizer
|
||||
Requires-Dist: matplotlib>=3.7.4
|
||||
Requires-Dist: accelerate
|
||||
Requires-Dist: loguru
|
||||
Requires-Dist: ray==2.40.0
|
||||
Requires-Dist: triton==3.0.0
|
||||
Requires-Dist: torch==2.5.0
|
||||
Requires-Dist: torch-mlu>=1.23.1
|
||||
Requires-Dist: torch_mlu_ops>=1.2.2
|
||||
Requires-Dist: xformers==0.0.24
|
||||
Requires-Dist: datasets
|
||||
Requires-Dist: transformers_stream_generator
|
||||
Requires-Dist: huggingface-hub==0.25.2
|
||||
Provides-Extra: tensorizer
|
||||
Requires-Dist: tensorizer>=2.9.0; extra == "tensorizer"
|
||||
Provides-Extra: audio
|
||||
Requires-Dist: librosa; extra == "audio"
|
||||
Requires-Dist: soundfile; extra == "audio"
|
||||
Provides-Extra: video
|
||||
Requires-Dist: decord; extra == "video"
|
||||
Dynamic: author
|
||||
Dynamic: classifier
|
||||
Dynamic: description
|
||||
Dynamic: description-content-type
|
||||
Dynamic: license
|
||||
Dynamic: project-url
|
||||
Dynamic: provides-extra
|
||||
Dynamic: requires-dist
|
||||
Dynamic: requires-python
|
||||
Dynamic: summary
|
||||
|
||||
<p align="center">
|
||||
<picture>
|
||||
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-dark.png">
|
||||
<img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png" width=55%>
|
||||
</picture>
|
||||
</p>
|
||||
|
||||
<h3 align="center">
|
||||
Easy, fast, and cheap LLM serving for everyone
|
||||
</h3>
|
||||
|
||||
<p align="center">
|
||||
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> |
|
||||
|
||||
</p>
|
||||
|
||||
|
||||
---
|
||||
|
||||
**vLLM, AMD, Anyscale Meet & Greet at [Ray Summit 2024](http://raysummit.anyscale.com) (Monday, Sept 30th, 5-7pm PT) at Marriott Marquis San Francisco**
|
||||
|
||||
We are excited to announce our special vLLM event in collaboration with AMD and Anyscale.
|
||||
Join us to learn more about recent advancements of vLLM on MI300X.
|
||||
Register [here](https://lu.ma/db5ld9n5) and be a part of the event!
|
||||
|
||||
---
|
||||
|
||||
*Latest News* 🔥
|
||||
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
|
||||
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
|
||||
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
|
||||
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
|
||||
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
|
||||
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
|
||||
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
|
||||
- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
|
||||
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
|
||||
|
||||
---
|
||||
## About
|
||||
vLLM is a fast and easy-to-use library for LLM inference and serving.
|
||||
|
||||
vLLM is fast with:
|
||||
|
||||
- State-of-the-art serving throughput
|
||||
- Efficient management of attention key and value memory with **PagedAttention**
|
||||
- Continuous batching of incoming requests
|
||||
- Fast model execution with CUDA/HIP graph
|
||||
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
|
||||
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
|
||||
- Speculative decoding
|
||||
- Chunked prefill
|
||||
|
||||
**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/4068) that compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)).
|
||||
|
||||
vLLM is flexible and easy to use with:
|
||||
|
||||
- Seamless integration with popular Hugging Face models
|
||||
- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
|
||||
- Tensor parallelism and pipeline parallelism support for distributed inference
|
||||
- Streaming outputs
|
||||
- OpenAI-compatible API server
|
||||
- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
|
||||
- Prefix caching support
|
||||
- Multi-LoRA support
|
||||
|
||||
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
|
||||
- Transformer-like LLMs (e.g., Llama)
|
||||
- Mixture-of-Expert LLMs (e.g., Mixtral)
|
||||
- Embedding Models (e.g., E5-Mistral)
|
||||
- Multi-modal LLMs (e.g., LLaVA)
|
||||
|
||||
Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
|
||||
|
||||
## Getting Started
|
||||
|
||||
Install vLLM with `pip` or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
|
||||
|
||||
```bash
|
||||
pip install vllm
|
||||
```
|
||||
|
||||
Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
|
||||
- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
|
||||
- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
|
||||
- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
|
||||
|
||||
## Contributing
|
||||
|
||||
We welcome and value any contributions and collaborations.
|
||||
Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.
|
||||
|
||||
## Sponsors
|
||||
|
||||
vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
|
||||
|
||||
<!-- Note: Please sort them in alphabetical order. -->
|
||||
<!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->
|
||||
|
||||
- a16z
|
||||
- AMD
|
||||
- Anyscale
|
||||
- AWS
|
||||
- Crusoe Cloud
|
||||
- Databricks
|
||||
- DeepInfra
|
||||
- Dropbox
|
||||
- Google Cloud
|
||||
- Lambda Lab
|
||||
- NVIDIA
|
||||
- Replicate
|
||||
- Roblox
|
||||
- RunPod
|
||||
- Sequoia Capital
|
||||
- Skywork AI
|
||||
- Trainy
|
||||
- UC Berkeley
|
||||
- UC San Diego
|
||||
- ZhenFund
|
||||
|
||||
We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
|
||||
|
||||
## Citation
|
||||
|
||||
If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
|
||||
```bibtex
|
||||
@inproceedings{kwon2023efficient,
|
||||
title={Efficient Memory Management for Large Language Model Serving with PagedAttention},
|
||||
author={Woosuk Kwon and Zhuohan Li and Siyuan Zhuang and Ying Sheng and Lianmin Zheng and Cody Hao Yu and Joseph E. Gonzalez and Hao Zhang and Ion Stoica},
|
||||
booktitle={Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles},
|
||||
year={2023}
|
||||
}
|
||||
```
|
||||
|
||||
## Contact Us
|
||||
|
||||
* For technical questions and feature requests, please use GitHub issues or discussions.
|
||||
* For discussions with fellow users, please use Discord.
|
||||
* For security disclosures, please use GitHub's security advisory feature.
|
||||
* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
|
||||
621
vllm-v0.6.2/vllm.egg-info/SOURCES.txt
Normal file
621
vllm-v0.6.2/vllm.egg-info/SOURCES.txt
Normal file
@@ -0,0 +1,621 @@
|
||||
CMakeLists.txt
|
||||
LICENSE
|
||||
MANIFEST.in
|
||||
README.md
|
||||
pyproject.toml
|
||||
requirements-common.txt
|
||||
requirements-cpu.txt
|
||||
requirements-cuda.txt
|
||||
requirements-neuron.txt
|
||||
requirements-rocm.txt
|
||||
setup.py
|
||||
cmake/cpu_extension.cmake
|
||||
cmake/hipify.py
|
||||
cmake/utils.cmake
|
||||
ray_mlu/__init__.py
|
||||
ray_mlu/mlu.py
|
||||
ray_mlu/node.py
|
||||
ray_mlu/nsight.py
|
||||
ray_mlu/test_mlu.py
|
||||
tests/test_cache_block_hashing.py
|
||||
tests/test_config.py
|
||||
tests/test_embedded_commit.py
|
||||
tests/test_inputs.py
|
||||
tests/test_logger.py
|
||||
tests/test_logits_processor.py
|
||||
tests/test_regression.py
|
||||
tests/test_sampling_params.py
|
||||
tests/test_scalartype.py
|
||||
tests/test_sequence.py
|
||||
tests/test_sharded_state_loader.py
|
||||
tests/test_utils.py
|
||||
vllm/__init__.py
|
||||
vllm/_custom_ops.py
|
||||
vllm/_ipex_ops.py
|
||||
vllm/_mlu_ops.py
|
||||
vllm/beam_search.py
|
||||
vllm/block.py
|
||||
vllm/config.py
|
||||
vllm/connections.py
|
||||
vllm/envs.py
|
||||
vllm/forward_context.py
|
||||
vllm/logger.py
|
||||
vllm/logits_process.py
|
||||
vllm/outputs.py
|
||||
vllm/pooling_params.py
|
||||
vllm/py.typed
|
||||
vllm/sampling_params.py
|
||||
vllm/scalar_type.py
|
||||
vllm/scripts.py
|
||||
vllm/sequence.py
|
||||
vllm/tracing.py
|
||||
vllm/utils.py
|
||||
vllm/version.py
|
||||
vllm/version_config
|
||||
vllm.egg-info/PKG-INFO
|
||||
vllm.egg-info/SOURCES.txt
|
||||
vllm.egg-info/dependency_links.txt
|
||||
vllm.egg-info/entry_points.txt
|
||||
vllm.egg-info/requires.txt
|
||||
vllm.egg-info/top_level.txt
|
||||
vllm/adapter_commons/__init__.py
|
||||
vllm/adapter_commons/layers.py
|
||||
vllm/adapter_commons/models.py
|
||||
vllm/adapter_commons/request.py
|
||||
vllm/adapter_commons/utils.py
|
||||
vllm/adapter_commons/worker_manager.py
|
||||
vllm/assets/__init__.py
|
||||
vllm/assets/audio.py
|
||||
vllm/assets/base.py
|
||||
vllm/assets/image.py
|
||||
vllm/assets/video.py
|
||||
vllm/attention/__init__.py
|
||||
vllm/attention/layer.py
|
||||
vllm/attention/selector.py
|
||||
vllm/attention/backends/__init__.py
|
||||
vllm/attention/backends/abstract.py
|
||||
vllm/attention/backends/blocksparse_attn.py
|
||||
vllm/attention/backends/flash_attn.py
|
||||
vllm/attention/backends/flashinfer.py
|
||||
vllm/attention/backends/hpu_attn.py
|
||||
vllm/attention/backends/ipex_attn.py
|
||||
vllm/attention/backends/mlu_attn.py
|
||||
vllm/attention/backends/openvino.py
|
||||
vllm/attention/backends/pallas.py
|
||||
vllm/attention/backends/placeholder_attn.py
|
||||
vllm/attention/backends/rocm_flash_attn.py
|
||||
vllm/attention/backends/torch_sdpa.py
|
||||
vllm/attention/backends/utils.py
|
||||
vllm/attention/backends/xformers.py
|
||||
vllm/attention/ops/__init__.py
|
||||
vllm/attention/ops/hpu_paged_attn.py
|
||||
vllm/attention/ops/ipex_attn.py
|
||||
vllm/attention/ops/paged_attn.py
|
||||
vllm/attention/ops/prefix_prefill.py
|
||||
vllm/attention/ops/triton_flash_attention.py
|
||||
vllm/attention/ops/blocksparse_attention/__init__.py
|
||||
vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py
|
||||
vllm/attention/ops/blocksparse_attention/interface.py
|
||||
vllm/attention/ops/blocksparse_attention/utils.py
|
||||
vllm/compilation/__init__.py
|
||||
vllm/compilation/backends.py
|
||||
vllm/compilation/compile_context.py
|
||||
vllm/compilation/config.py
|
||||
vllm/compilation/counter.py
|
||||
vllm/compilation/decorators.py
|
||||
vllm/compilation/fusion.py
|
||||
vllm/compilation/inductor_pass.py
|
||||
vllm/compilation/levels.py
|
||||
vllm/compilation/reshapes.py
|
||||
vllm/compilation/wrapper.py
|
||||
vllm/core/__init__.py
|
||||
vllm/core/block_manager.py
|
||||
vllm/core/evictor.py
|
||||
vllm/core/interfaces.py
|
||||
vllm/core/placeholder_block_space_manager.py
|
||||
vllm/core/scheduler.py
|
||||
vllm/core/block/__init__.py
|
||||
vllm/core/block/block_table.py
|
||||
vllm/core/block/common.py
|
||||
vllm/core/block/cpu_gpu_block_allocator.py
|
||||
vllm/core/block/interfaces.py
|
||||
vllm/core/block/naive_block.py
|
||||
vllm/core/block/prefix_caching_block.py
|
||||
vllm/core/block/utils.py
|
||||
vllm/distributed/__init__.py
|
||||
vllm/distributed/communication_op.py
|
||||
vllm/distributed/parallel_state.py
|
||||
vllm/distributed/utils.py
|
||||
vllm/distributed/device_communicators/__init__.py
|
||||
vllm/distributed/device_communicators/cuda_wrapper.py
|
||||
vllm/distributed/device_communicators/custom_all_reduce.py
|
||||
vllm/distributed/device_communicators/custom_all_reduce_utils.py
|
||||
vllm/distributed/device_communicators/hpu_communicator.py
|
||||
vllm/distributed/device_communicators/pynccl.py
|
||||
vllm/distributed/device_communicators/pynccl_wrapper.py
|
||||
vllm/distributed/device_communicators/shm_broadcast.py
|
||||
vllm/distributed/device_communicators/tpu_communicator.py
|
||||
vllm/distributed/device_communicators/xpu_communicator.py
|
||||
vllm/engine/__init__.py
|
||||
vllm/engine/arg_utils.py
|
||||
vllm/engine/async_llm_engine.py
|
||||
vllm/engine/async_timeout.py
|
||||
vllm/engine/llm_engine.py
|
||||
vllm/engine/metrics.py
|
||||
vllm/engine/metrics_types.py
|
||||
vllm/engine/protocol.py
|
||||
vllm/engine/multiprocessing/__init__.py
|
||||
vllm/engine/multiprocessing/client.py
|
||||
vllm/engine/multiprocessing/engine.py
|
||||
vllm/engine/output_processor/__init__.py
|
||||
vllm/engine/output_processor/interfaces.py
|
||||
vllm/engine/output_processor/multi_step.py
|
||||
vllm/engine/output_processor/single_step.py
|
||||
vllm/engine/output_processor/stop_checker.py
|
||||
vllm/engine/output_processor/util.py
|
||||
vllm/entrypoints/__init__.py
|
||||
vllm/entrypoints/api_server.py
|
||||
vllm/entrypoints/chat_utils.py
|
||||
vllm/entrypoints/launcher.py
|
||||
vllm/entrypoints/llm.py
|
||||
vllm/entrypoints/logger.py
|
||||
vllm/entrypoints/openai/__init__.py
|
||||
vllm/entrypoints/openai/api_server.py
|
||||
vllm/entrypoints/openai/cli_args.py
|
||||
vllm/entrypoints/openai/logits_processors.py
|
||||
vllm/entrypoints/openai/protocol.py
|
||||
vllm/entrypoints/openai/run_batch.py
|
||||
vllm/entrypoints/openai/serving_chat.py
|
||||
vllm/entrypoints/openai/serving_completion.py
|
||||
vllm/entrypoints/openai/serving_embedding.py
|
||||
vllm/entrypoints/openai/serving_engine.py
|
||||
vllm/entrypoints/openai/serving_tokenization.py
|
||||
vllm/entrypoints/openai/tool_parsers/__init__.py
|
||||
vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
|
||||
vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
|
||||
vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
|
||||
vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
|
||||
vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
|
||||
vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
|
||||
vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
|
||||
vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
|
||||
vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
|
||||
vllm/entrypoints/openai/tool_parsers/utils.py
|
||||
vllm/executor/__init__.py
|
||||
vllm/executor/cpu_executor.py
|
||||
vllm/executor/distributed_gpu_executor.py
|
||||
vllm/executor/distributed_mlu_executor.py
|
||||
vllm/executor/executor_base.py
|
||||
vllm/executor/gpu_executor.py
|
||||
vllm/executor/hpu_executor.py
|
||||
vllm/executor/mlu_executor.py
|
||||
vllm/executor/msgspec_utils.py
|
||||
vllm/executor/multiproc_gpu_executor.py
|
||||
vllm/executor/multiproc_mlu_executor.py
|
||||
vllm/executor/multiproc_worker_utils.py
|
||||
vllm/executor/multiproc_xpu_executor.py
|
||||
vllm/executor/neuron_executor.py
|
||||
vllm/executor/openvino_executor.py
|
||||
vllm/executor/ray_gpu_executor.py
|
||||
vllm/executor/ray_hpu_executor.py
|
||||
vllm/executor/ray_mlu_executor.py
|
||||
vllm/executor/ray_tpu_executor.py
|
||||
vllm/executor/ray_utils.py
|
||||
vllm/executor/ray_xpu_executor.py
|
||||
vllm/executor/tpu_executor.py
|
||||
vllm/executor/xpu_executor.py
|
||||
vllm/inputs/__init__.py
|
||||
vllm/inputs/data.py
|
||||
vllm/inputs/parse.py
|
||||
vllm/inputs/preprocess.py
|
||||
vllm/inputs/registry.py
|
||||
vllm/logging_utils/__init__.py
|
||||
vllm/logging_utils/formatter.py
|
||||
vllm/lora/__init__.py
|
||||
vllm/lora/fully_sharded_layers.py
|
||||
vllm/lora/layers.py
|
||||
vllm/lora/lora.py
|
||||
vllm/lora/models.py
|
||||
vllm/lora/punica.py
|
||||
vllm/lora/request.py
|
||||
vllm/lora/utils.py
|
||||
vllm/lora/worker_manager.py
|
||||
vllm/lora/ops/__init__.py
|
||||
vllm/lora/ops/bgmv_expand.py
|
||||
vllm/lora/ops/bgmv_expand_slice.py
|
||||
vllm/lora/ops/bgmv_shrink.py
|
||||
vllm/lora/ops/sgmv_expand.py
|
||||
vllm/lora/ops/sgmv_expand_slice.py
|
||||
vllm/lora/ops/sgmv_shrink.py
|
||||
vllm/lora/ops/utils.py
|
||||
vllm/model_executor/__init__.py
|
||||
vllm/model_executor/custom_op.py
|
||||
vllm/model_executor/parameter.py
|
||||
vllm/model_executor/pooling_metadata.py
|
||||
vllm/model_executor/sampling_metadata.py
|
||||
vllm/model_executor/utils.py
|
||||
vllm/model_executor/guided_decoding/__init__.py
|
||||
vllm/model_executor/guided_decoding/guided_fields.py
|
||||
vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py
|
||||
vllm/model_executor/guided_decoding/outlines_decoding.py
|
||||
vllm/model_executor/guided_decoding/outlines_logits_processors.py
|
||||
vllm/model_executor/layers/__init__.py
|
||||
vllm/model_executor/layers/activation.py
|
||||
vllm/model_executor/layers/layernorm.py
|
||||
vllm/model_executor/layers/linear.py
|
||||
vllm/model_executor/layers/logits_processor.py
|
||||
vllm/model_executor/layers/pooler.py
|
||||
vllm/model_executor/layers/rejection_sampler.py
|
||||
vllm/model_executor/layers/resampler.py
|
||||
vllm/model_executor/layers/rotary_embedding.py
|
||||
vllm/model_executor/layers/sampler.py
|
||||
vllm/model_executor/layers/spec_decode_base_sampler.py
|
||||
vllm/model_executor/layers/typical_acceptance_sampler.py
|
||||
vllm/model_executor/layers/vocab_parallel_embedding.py
|
||||
vllm/model_executor/layers/fused_moe/__init__.py
|
||||
vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
|
||||
vllm/model_executor/layers/fused_moe/fused_moe.py
|
||||
vllm/model_executor/layers/fused_moe/layer.py
|
||||
vllm/model_executor/layers/fused_moe/moe_pallas.py
|
||||
vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
|
||||
vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
|
||||
vllm/model_executor/layers/mamba/__init__.py
|
||||
vllm/model_executor/layers/mamba/mamba_mixer.py
|
||||
vllm/model_executor/layers/mamba/ops/__init__.py
|
||||
vllm/model_executor/layers/mamba/ops/causal_conv1d.py
|
||||
vllm/model_executor/layers/mamba/ops/mamba_ssm.py
|
||||
vllm/model_executor/layers/quantization/__init__.py
|
||||
vllm/model_executor/layers/quantization/aqlm.py
|
||||
vllm/model_executor/layers/quantization/awq.py
|
||||
vllm/model_executor/layers/quantization/awq_marlin.py
|
||||
vllm/model_executor/layers/quantization/awq_triton.py
|
||||
vllm/model_executor/layers/quantization/base_config.py
|
||||
vllm/model_executor/layers/quantization/bitsandbytes.py
|
||||
vllm/model_executor/layers/quantization/deepspeedfp.py
|
||||
vllm/model_executor/layers/quantization/experts_int8.py
|
||||
vllm/model_executor/layers/quantization/fbgemm_fp8.py
|
||||
vllm/model_executor/layers/quantization/fp8.py
|
||||
vllm/model_executor/layers/quantization/gguf.py
|
||||
vllm/model_executor/layers/quantization/gptq.py
|
||||
vllm/model_executor/layers/quantization/gptq_marlin.py
|
||||
vllm/model_executor/layers/quantization/gptq_marlin_24.py
|
||||
vllm/model_executor/layers/quantization/ipex_quant.py
|
||||
vllm/model_executor/layers/quantization/kv_cache.py
|
||||
vllm/model_executor/layers/quantization/marlin.py
|
||||
vllm/model_executor/layers/quantization/modelopt.py
|
||||
vllm/model_executor/layers/quantization/neuron_quant.py
|
||||
vllm/model_executor/layers/quantization/qqq.py
|
||||
vllm/model_executor/layers/quantization/schema.py
|
||||
vllm/model_executor/layers/quantization/tpu_int8.py
|
||||
vllm/model_executor/layers/quantization/compressed_tensors/__init__.py
|
||||
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
|
||||
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
|
||||
vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
|
||||
vllm/model_executor/layers/quantization/compressed_tensors/utils.py
|
||||
vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
|
||||
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
|
||||
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
|
||||
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
|
||||
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
|
||||
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
|
||||
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
|
||||
vllm/model_executor/layers/quantization/kernels/MPLinearKernel.py
|
||||
vllm/model_executor/layers/quantization/kernels/__init__.py
|
||||
vllm/model_executor/layers/quantization/kernels/exllama.py
|
||||
vllm/model_executor/layers/quantization/kernels/machete.py
|
||||
vllm/model_executor/layers/quantization/kernels/marlin.py
|
||||
vllm/model_executor/layers/quantization/utils/__init__.py
|
||||
vllm/model_executor/layers/quantization/utils/layer_utils.py
|
||||
vllm/model_executor/layers/quantization/utils/machete_utils.py
|
||||
vllm/model_executor/layers/quantization/utils/marlin_utils.py
|
||||
vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
|
||||
vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
|
||||
vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py
|
||||
vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py
|
||||
vllm/model_executor/layers/quantization/utils/quant_utils.py
|
||||
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
|
||||
vllm/model_executor/model_loader/__init__.py
|
||||
vllm/model_executor/model_loader/loader.py
|
||||
vllm/model_executor/model_loader/neuron.py
|
||||
vllm/model_executor/model_loader/openvino.py
|
||||
vllm/model_executor/model_loader/tensorizer.py
|
||||
vllm/model_executor/model_loader/utils.py
|
||||
vllm/model_executor/model_loader/weight_utils.py
|
||||
vllm/model_executor/models/__init__.py
|
||||
vllm/model_executor/models/arctic.py
|
||||
vllm/model_executor/models/baichuan.py
|
||||
vllm/model_executor/models/bart.py
|
||||
vllm/model_executor/models/bert.py
|
||||
vllm/model_executor/models/blip.py
|
||||
vllm/model_executor/models/blip2.py
|
||||
vllm/model_executor/models/bloom.py
|
||||
vllm/model_executor/models/chameleon.py
|
||||
vllm/model_executor/models/chatglm.py
|
||||
vllm/model_executor/models/clip.py
|
||||
vllm/model_executor/models/commandr.py
|
||||
vllm/model_executor/models/dbrx.py
|
||||
vllm/model_executor/models/decilm.py
|
||||
vllm/model_executor/models/deepseek.py
|
||||
vllm/model_executor/models/deepseek_v2.py
|
||||
vllm/model_executor/models/eagle.py
|
||||
vllm/model_executor/models/exaone.py
|
||||
vllm/model_executor/models/falcon.py
|
||||
vllm/model_executor/models/florence2.py
|
||||
vllm/model_executor/models/fuyu.py
|
||||
vllm/model_executor/models/gemma.py
|
||||
vllm/model_executor/models/gemma2.py
|
||||
vllm/model_executor/models/glm4_vision_encoder.py
|
||||
vllm/model_executor/models/gpt2.py
|
||||
vllm/model_executor/models/gpt_bigcode.py
|
||||
vllm/model_executor/models/gpt_j.py
|
||||
vllm/model_executor/models/gpt_neox.py
|
||||
vllm/model_executor/models/granite.py
|
||||
vllm/model_executor/models/granitemoe.py
|
||||
vllm/model_executor/models/h2ovl.py
|
||||
vllm/model_executor/models/hunyuan.py
|
||||
vllm/model_executor/models/idefics2_vision_model.py
|
||||
vllm/model_executor/models/idefics3.py
|
||||
vllm/model_executor/models/interfaces.py
|
||||
vllm/model_executor/models/interfaces_base.py
|
||||
vllm/model_executor/models/intern_vit.py
|
||||
vllm/model_executor/models/internlm2.py
|
||||
vllm/model_executor/models/internlm2_ve.py
|
||||
vllm/model_executor/models/internvl.py
|
||||
vllm/model_executor/models/jais.py
|
||||
vllm/model_executor/models/jamba.py
|
||||
vllm/model_executor/models/llama.py
|
||||
vllm/model_executor/models/llava.py
|
||||
vllm/model_executor/models/llava_next.py
|
||||
vllm/model_executor/models/llava_next_video.py
|
||||
vllm/model_executor/models/llava_onevision.py
|
||||
vllm/model_executor/models/mamba.py
|
||||
vllm/model_executor/models/mamba_cache.py
|
||||
vllm/model_executor/models/medusa.py
|
||||
vllm/model_executor/models/minicpm.py
|
||||
vllm/model_executor/models/minicpm3.py
|
||||
vllm/model_executor/models/minicpmv.py
|
||||
vllm/model_executor/models/mixtral.py
|
||||
vllm/model_executor/models/mixtral_quant.py
|
||||
vllm/model_executor/models/mllama.py
|
||||
vllm/model_executor/models/mlp_speculator.py
|
||||
vllm/model_executor/models/module_mapping.py
|
||||
vllm/model_executor/models/molmo.py
|
||||
vllm/model_executor/models/mpt.py
|
||||
vllm/model_executor/models/nemotron.py
|
||||
vllm/model_executor/models/nvlm_d.py
|
||||
vllm/model_executor/models/olmo.py
|
||||
vllm/model_executor/models/olmoe.py
|
||||
vllm/model_executor/models/opt.py
|
||||
vllm/model_executor/models/orion.py
|
||||
vllm/model_executor/models/paligemma.py
|
||||
vllm/model_executor/models/persimmon.py
|
||||
vllm/model_executor/models/phi.py
|
||||
vllm/model_executor/models/phi3.py
|
||||
vllm/model_executor/models/phi3_small.py
|
||||
vllm/model_executor/models/phi3v.py
|
||||
vllm/model_executor/models/phimoe.py
|
||||
vllm/model_executor/models/pixtral.py
|
||||
vllm/model_executor/models/qwen.py
|
||||
vllm/model_executor/models/qwen2.py
|
||||
vllm/model_executor/models/qwen2_audio.py
|
||||
vllm/model_executor/models/qwen2_cls.py
|
||||
vllm/model_executor/models/qwen2_moe.py
|
||||
vllm/model_executor/models/qwen2_rm.py
|
||||
vllm/model_executor/models/qwen2_vl.py
|
||||
vllm/model_executor/models/registry.py
|
||||
vllm/model_executor/models/roberta.py
|
||||
vllm/model_executor/models/siglip.py
|
||||
vllm/model_executor/models/solar.py
|
||||
vllm/model_executor/models/stablelm.py
|
||||
vllm/model_executor/models/starcoder2.py
|
||||
vllm/model_executor/models/ultravox.py
|
||||
vllm/model_executor/models/utils.py
|
||||
vllm/model_executor/models/xverse.py
|
||||
vllm/multimodal/__init__.py
|
||||
vllm/multimodal/audio.py
|
||||
vllm/multimodal/base.py
|
||||
vllm/multimodal/image.py
|
||||
vllm/multimodal/inputs.py
|
||||
vllm/multimodal/processing.py
|
||||
vllm/multimodal/registry.py
|
||||
vllm/multimodal/utils.py
|
||||
vllm/multimodal/video.py
|
||||
vllm/platforms/__init__.py
|
||||
vllm/platforms/cpu.py
|
||||
vllm/platforms/cuda.py
|
||||
vllm/platforms/hpu.py
|
||||
vllm/platforms/interface.py
|
||||
vllm/platforms/mlu.py
|
||||
vllm/platforms/neuron.py
|
||||
vllm/platforms/openvino.py
|
||||
vllm/platforms/rocm.py
|
||||
vllm/platforms/tpu.py
|
||||
vllm/platforms/xpu.py
|
||||
vllm/plugins/__init__.py
|
||||
vllm/profiler/__init__.py
|
||||
vllm/profiler/layerwise_profile.py
|
||||
vllm/profiler/utils.py
|
||||
vllm/prompt_adapter/__init__.py
|
||||
vllm/prompt_adapter/layers.py
|
||||
vllm/prompt_adapter/models.py
|
||||
vllm/prompt_adapter/request.py
|
||||
vllm/prompt_adapter/utils.py
|
||||
vllm/prompt_adapter/worker_manager.py
|
||||
vllm/spec_decode/__init__.py
|
||||
vllm/spec_decode/batch_expansion.py
|
||||
vllm/spec_decode/draft_model_runner.py
|
||||
vllm/spec_decode/interfaces.py
|
||||
vllm/spec_decode/medusa_worker.py
|
||||
vllm/spec_decode/metrics.py
|
||||
vllm/spec_decode/mlp_speculator_worker.py
|
||||
vllm/spec_decode/mlu_batch_expansion.py
|
||||
vllm/spec_decode/mlu_draft_model_runner.py
|
||||
vllm/spec_decode/mlu_medusa_worker.py
|
||||
vllm/spec_decode/mlu_metrics.py
|
||||
vllm/spec_decode/mlu_mlp_speculator_worker.py
|
||||
vllm/spec_decode/mlu_multi_step_worker.py
|
||||
vllm/spec_decode/mlu_ngram_worker.py
|
||||
vllm/spec_decode/mlu_smaller_tp_proposer_worker.py
|
||||
vllm/spec_decode/mlu_spec_decode_worker.py
|
||||
vllm/spec_decode/mlu_target_model_runner.py
|
||||
vllm/spec_decode/mqa_scorer.py
|
||||
vllm/spec_decode/multi_step_worker.py
|
||||
vllm/spec_decode/ngram_worker.py
|
||||
vllm/spec_decode/proposer_worker_base.py
|
||||
vllm/spec_decode/smaller_tp_proposer_worker.py
|
||||
vllm/spec_decode/spec_decode_worker.py
|
||||
vllm/spec_decode/target_model_runner.py
|
||||
vllm/spec_decode/top1_proposer.py
|
||||
vllm/spec_decode/util.py
|
||||
vllm/transformers_utils/__init__.py
|
||||
vllm/transformers_utils/config.py
|
||||
vllm/transformers_utils/detokenizer.py
|
||||
vllm/transformers_utils/detokenizer_utils.py
|
||||
vllm/transformers_utils/processor.py
|
||||
vllm/transformers_utils/tokenizer.py
|
||||
vllm/transformers_utils/utils.py
|
||||
vllm/transformers_utils/configs/__init__.py
|
||||
vllm/transformers_utils/configs/arctic.py
|
||||
vllm/transformers_utils/configs/chatglm.py
|
||||
vllm/transformers_utils/configs/dbrx.py
|
||||
vllm/transformers_utils/configs/eagle.py
|
||||
vllm/transformers_utils/configs/exaone.py
|
||||
vllm/transformers_utils/configs/falcon.py
|
||||
vllm/transformers_utils/configs/h2ovl.py
|
||||
vllm/transformers_utils/configs/internvl.py
|
||||
vllm/transformers_utils/configs/jais.py
|
||||
vllm/transformers_utils/configs/medusa.py
|
||||
vllm/transformers_utils/configs/mllama.py
|
||||
vllm/transformers_utils/configs/mlp_speculator.py
|
||||
vllm/transformers_utils/configs/mpt.py
|
||||
vllm/transformers_utils/configs/nemotron.py
|
||||
vllm/transformers_utils/configs/nvlm_d.py
|
||||
vllm/transformers_utils/configs/solar.py
|
||||
vllm/transformers_utils/configs/ultravox.py
|
||||
vllm/transformers_utils/tokenizer_group/__init__.py
|
||||
vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py
|
||||
vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
|
||||
vllm/transformers_utils/tokenizer_group/tokenizer_group.py
|
||||
vllm/transformers_utils/tokenizers/__init__.py
|
||||
vllm/transformers_utils/tokenizers/mistral.py
|
||||
vllm/triton_utils/__init__.py
|
||||
vllm/triton_utils/custom_cache_manager.py
|
||||
vllm/triton_utils/importing.py
|
||||
vllm/usage/__init__.py
|
||||
vllm/usage/usage_lib.py
|
||||
vllm/v1/__init__.py
|
||||
vllm/v1/outputs.py
|
||||
vllm/v1/request.py
|
||||
vllm/v1/serial_utils.py
|
||||
vllm/v1/utils.py
|
||||
vllm/v1/attention/__init__.py
|
||||
vllm/v1/attention/backends/__init__.py
|
||||
vllm/v1/attention/backends/flash_attn.py
|
||||
vllm/v1/core/__init__.py
|
||||
vllm/v1/core/encoder_cache_manager.py
|
||||
vllm/v1/core/kv_cache_manager.py
|
||||
vllm/v1/core/kv_cache_utils.py
|
||||
vllm/v1/core/scheduler.py
|
||||
vllm/v1/engine/__init__.py
|
||||
vllm/v1/engine/async_llm.py
|
||||
vllm/v1/engine/async_stream.py
|
||||
vllm/v1/engine/core.py
|
||||
vllm/v1/engine/core_client.py
|
||||
vllm/v1/engine/detokenizer.py
|
||||
vllm/v1/engine/llm_engine.py
|
||||
vllm/v1/engine/mm_input_mapper.py
|
||||
vllm/v1/engine/processor.py
|
||||
vllm/v1/executor/__init__.py
|
||||
vllm/v1/executor/gpu_executor.py
|
||||
vllm/v1/sample/__init__.py
|
||||
vllm/v1/sample/metadata.py
|
||||
vllm/v1/sample/sampler.py
|
||||
vllm/v1/worker/__init__.py
|
||||
vllm/v1/worker/gpu_model_runner.py
|
||||
vllm/v1/worker/gpu_worker.py
|
||||
vllm/worker/__init__.py
|
||||
vllm/worker/cache_engine.py
|
||||
vllm/worker/cpu_embedding_model_runner.py
|
||||
vllm/worker/cpu_enc_dec_model_runner.py
|
||||
vllm/worker/cpu_model_runner.py
|
||||
vllm/worker/cpu_worker.py
|
||||
vllm/worker/embedding_model_runner.py
|
||||
vllm/worker/enc_dec_model_runner.py
|
||||
vllm/worker/hpu_model_runner.py
|
||||
vllm/worker/hpu_worker.py
|
||||
vllm/worker/mlu_enc_dec_model_runner.py
|
||||
vllm/worker/mlu_model_runner.py
|
||||
vllm/worker/mlu_multi_step_model_runner.py
|
||||
vllm/worker/mlu_multi_step_worker.py
|
||||
vllm/worker/mlu_worker.py
|
||||
vllm/worker/model_runner.py
|
||||
vllm/worker/model_runner_base.py
|
||||
vllm/worker/multi_step_model_runner.py
|
||||
vllm/worker/multi_step_tpu_worker.py
|
||||
vllm/worker/multi_step_worker.py
|
||||
vllm/worker/neuron_model_runner.py
|
||||
vllm/worker/neuron_worker.py
|
||||
vllm/worker/openvino_model_runner.py
|
||||
vllm/worker/openvino_worker.py
|
||||
vllm/worker/tpu_model_runner.py
|
||||
vllm/worker/tpu_worker.py
|
||||
vllm/worker/utils.py
|
||||
vllm/worker/worker.py
|
||||
vllm/worker/worker_base.py
|
||||
vllm/worker/xpu_model_runner.py
|
||||
vllm/worker/xpu_worker.py
|
||||
1
vllm-v0.6.2/vllm.egg-info/dependency_links.txt
Normal file
1
vllm-v0.6.2/vllm.egg-info/dependency_links.txt
Normal file
@@ -0,0 +1 @@
|
||||
|
||||
2
vllm-v0.6.2/vllm.egg-info/entry_points.txt
Normal file
2
vllm-v0.6.2/vllm.egg-info/entry_points.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
[console_scripts]
|
||||
vllm = vllm.scripts:main
|
||||
63
vllm-v0.6.2/vllm.egg-info/requires.txt
Normal file
63
vllm-v0.6.2/vllm.egg-info/requires.txt
Normal file
@@ -0,0 +1,63 @@
|
||||
psutil
|
||||
sentencepiece
|
||||
numpy<2.0.0
|
||||
requests>=2.26.0
|
||||
tqdm
|
||||
py-cpuinfo
|
||||
transformers>=4.45.2
|
||||
tokenizers>=0.19.1
|
||||
protobuf
|
||||
aiohttp
|
||||
openai>=1.45.0
|
||||
uvicorn[standard]
|
||||
pydantic>=2.9
|
||||
pillow
|
||||
prometheus_client>=0.18.0
|
||||
prometheus-fastapi-instrumentator>=7.0.0
|
||||
tiktoken>=0.6.0
|
||||
lm-format-enforcer<0.11,>=0.10.9
|
||||
outlines<0.1,>=0.0.43
|
||||
typing_extensions>=4.10
|
||||
filelock>=3.10.4
|
||||
partial-json-parser
|
||||
pyzmq
|
||||
msgspec
|
||||
gguf==0.10.0
|
||||
importlib_metadata
|
||||
mistral_common[opencv]>=1.5.0
|
||||
pyyaml
|
||||
einops
|
||||
compressed-tensors==0.8.0
|
||||
tensorizer
|
||||
matplotlib>=3.7.4
|
||||
accelerate
|
||||
loguru
|
||||
ray==2.40.0
|
||||
triton==3.0.0
|
||||
torch==2.5.0
|
||||
torch-mlu>=1.23.1
|
||||
torch_mlu_ops>=1.2.2
|
||||
xformers==0.0.24
|
||||
datasets
|
||||
transformers_stream_generator
|
||||
huggingface-hub==0.25.2
|
||||
|
||||
[:python_version < "3.9"]
|
||||
fastapi<0.113.0,>=0.107.0
|
||||
|
||||
[:python_version > "3.11"]
|
||||
six>=1.16.0
|
||||
setuptools>=74.1.1
|
||||
|
||||
[:python_version >= "3.9"]
|
||||
fastapi!=0.113.*,!=0.114.0,>=0.107.0
|
||||
|
||||
[audio]
|
||||
librosa
|
||||
soundfile
|
||||
|
||||
[tensorizer]
|
||||
tensorizer>=2.9.0
|
||||
|
||||
[video]
|
||||
decord
|
||||
2
vllm-v0.6.2/vllm.egg-info/top_level.txt
Normal file
2
vllm-v0.6.2/vllm.egg-info/top_level.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
ray_mlu
|
||||
vllm
|
||||
Reference in New Issue
Block a user