Add typo checker in pre-commit (#6179)
Co-authored-by: Brayden Zhong <b8zhong@uwaterloo.ca>
This commit is contained in:
@@ -33,6 +33,12 @@ repos:
|
||||
rev: 24.10.0
|
||||
hooks:
|
||||
- id: black-jupyter
|
||||
- repo: https://github.com/codespell-project/codespell
|
||||
rev: v2.4.1
|
||||
hooks:
|
||||
- id: codespell
|
||||
additional_dependencies: ['tomli']
|
||||
args: ['--toml', 'python/pyproject.toml']
|
||||
- repo: https://github.com/pre-commit/mirrors-clang-format
|
||||
rev: v18.1.8
|
||||
hooks:
|
||||
|
||||
2
3rdparty/amd/tuning/TUNING.md
vendored
2
3rdparty/amd/tuning/TUNING.md
vendored
@@ -104,7 +104,7 @@ To maximize moe kernel efficiency, need to use below scripts to find out the bes
|
||||
|
||||
```bash
|
||||
#Tuning
|
||||
#for example, we have one case like this "python3 -m sglang.bench_latency --model dummy_grok1/ --load-format dummy --tokenizer-path Xenova/grok-1-tokenizer --tp 8 --batch-size 32 --input 1024 --output 8 --attention-backend triton --sampling-backend pytorch --quantization fp8" to run, it defined batch-size 32 input lenth 1024 and output length 8, from "--batch" in moe view point, the prefill batch is 32*1024 = 32768, the decode batch is 32*1(only one output token generated in each run).
|
||||
#for example, we have one case like this "python3 -m sglang.bench_latency --model dummy_grok1/ --load-format dummy --tokenizer-path Xenova/grok-1-tokenizer --tp 8 --batch-size 32 --input 1024 --output 8 --attention-backend triton --sampling-backend pytorch --quantization fp8" to run, it defined batch-size 32 input length 1024 and output length 8, from "--batch" in moe view point, the prefill batch is 32*1024 = 32768, the decode batch is 32*1(only one output token generated in each run).
|
||||
#so we can tune decode moe use below command
|
||||
python benchmark_moe_rocm.py --model grok1 --tp-size 8 --dtype float8 --batch "32"
|
||||
# and use this command to tune prefill moe
|
||||
|
||||
@@ -267,7 +267,7 @@ async def get_requests(
|
||||
try:
|
||||
request = await asyncio.wait_for(
|
||||
input_requests_queue.get(), timeout=300
|
||||
) # Wait for 5 minites then abort
|
||||
) # Wait for 5 minutes then abort
|
||||
except Exception as e:
|
||||
print(f"exception: {e}")
|
||||
break
|
||||
@@ -514,7 +514,7 @@ async def benchmark(
|
||||
print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
|
||||
print(
|
||||
"{:<40} {:<10}".format(
|
||||
"Max reqeuest concurrency:",
|
||||
"Max request concurrency:",
|
||||
max_concurrency if max_concurrency else "not set",
|
||||
)
|
||||
)
|
||||
|
||||
@@ -95,7 +95,7 @@ def bench_schema(args):
|
||||
latency = time.time() - tic
|
||||
|
||||
# Check if the outputs are valid
|
||||
indexs = []
|
||||
indexes = []
|
||||
for i, state in enumerate(states):
|
||||
try:
|
||||
schema = json.loads(arguments[i]["json_schema"])
|
||||
@@ -103,7 +103,7 @@ def bench_schema(args):
|
||||
assert jsonschema.validate(obj, schema) is None
|
||||
except Exception as e:
|
||||
print(e)
|
||||
indexs.append(i)
|
||||
indexes.append(i)
|
||||
|
||||
return states, latency
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ from tqdm import tqdm
|
||||
|
||||
def generate_lines(random_words, num_lines, redirect_ratio):
|
||||
prefix = "Here is a list of lines, each with its corresponding REGISTER_CONTENT value. Please memorize them. Be prepared to provide the REGISTER_CONTENT value for a specific line index when I ask."
|
||||
suffix = "The list has ended. Please give the final REGISTER_CONTENT value for a specific line after resovling the redirections and references. For example, the REGISTER_CONTENT of Line __idx0__ is __val0__. The REGISTER_CONTENT of Line __idx1__ is __val1__. The REGISTER_CONTENT of Line __idx2__ is __val2__. The REGISTER_CONTENT of Line ??? is"
|
||||
suffix = "The list has ended. Please give the final REGISTER_CONTENT value for a specific line after resolving the redirections and references. For example, the REGISTER_CONTENT of Line __idx0__ is __val0__. The REGISTER_CONTENT of Line __idx1__ is __val1__. The REGISTER_CONTENT of Line __idx2__ is __val2__. The REGISTER_CONTENT of Line ??? is"
|
||||
|
||||
# Raw lines
|
||||
visited_indices = set([None])
|
||||
|
||||
@@ -17,7 +17,7 @@ ASSISTANT_SUFFIX = " </s><s>"
|
||||
|
||||
def multi_document_qa(docs, question, generate):
|
||||
s = USER_PREFIX
|
||||
s += "Pleaes answer a question according to given documents.\n"
|
||||
s += "Please answer a question according to given documents.\n"
|
||||
s += "Question:" + question + "Documents begin.\n"
|
||||
|
||||
s += "".join(docs)
|
||||
|
||||
@@ -13,7 +13,7 @@ from sglang.utils import dump_state_text, read_jsonl
|
||||
@sgl.function
|
||||
def multi_document_qa(s, docs, question):
|
||||
s += sgl.user_begin()
|
||||
s += "Pleaes answer a question according to given documents.\n"
|
||||
s += "Please answer a question according to given documents.\n"
|
||||
s += "Question:" + question + "Documents begin.\n"
|
||||
|
||||
forks = s.fork(len(docs))
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
"source": [
|
||||
"# Tool and Function Calling\n",
|
||||
"\n",
|
||||
"This guide demonstrates how to use SGLang’s [Funcion calling](https://platform.openai.com/docs/guides/function-calling) functionality."
|
||||
"This guide demonstrates how to use SGLang’s [Function calling](https://platform.openai.com/docs/guides/function-calling) functionality."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -399,7 +399,7 @@
|
||||
" },\n",
|
||||
"}\n",
|
||||
"gen_response = requests.post(gen_url, json=gen_data).json()[\"text\"]\n",
|
||||
"print_highlight(\"==== Reponse ====\")\n",
|
||||
"print_highlight(\"==== Response ====\")\n",
|
||||
"print(gen_response)\n",
|
||||
"\n",
|
||||
"# parse the response\n",
|
||||
|
||||
@@ -275,7 +275,7 @@
|
||||
"source": [
|
||||
"## Structured Outputs (JSON, Regex, EBNF)\n",
|
||||
"\n",
|
||||
"For OpenAI compatible structed outputs API, refer to [Structured Outputs](https://docs.sglang.ai/backend/structured_outputs.html#OpenAI-Compatible-API) for more details.\n"
|
||||
"For OpenAI compatible structured outputs API, refer to [Structured Outputs](https://docs.sglang.ai/backend/structured_outputs.html#OpenAI-Compatible-API) for more details.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -40,7 +40,7 @@ The `/generate` endpoint accepts the following parameters in JSON format. For de
|
||||
| Argument | Type/Default | Description |
|
||||
|--------------------|------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| frequency_penalty | `float = 0.0` | Penalizes tokens based on their frequency in generation so far. Must be between `-2` and `2` where negative numbers encourage repetition of tokens and positive numbers encourage sampling of new tokens. The scaling of penalization grows linearly with each appearance of a token. |
|
||||
| presence_penalty | `float = 0.0` | Penalizes tokens if they appeared in the generation so far. Must be between `-2` and `2` where negative numbers encourage repeatment of tokens and positive number encourages sampling of new tokens. The scaling of the penalization is constant if a token occured. |
|
||||
| presence_penalty | `float = 0.0` | Penalizes tokens if they appeared in the generation so far. Must be between `-2` and `2` where negative numbers encourage repetition of tokens and positive numbers encourage sampling of new tokens. The scaling of the penalization is constant if a token occurred. |
|
||||
| min_new_tokens | `int = 0` | Forces the model to generate at least `min_new_tokens` until a stop word or EOS token is sampled. Note that this might lead to unintended behavior, for example, if the distribution is highly skewed towards these tokens. |
|
||||
|
||||
### Constrained decoding
|
||||
|
||||
@@ -166,7 +166,7 @@
|
||||
"source": [
|
||||
"## Using Native Generation APIs\n",
|
||||
"\n",
|
||||
"You can also use the native `/generate` endpoint with requests, which provides more flexiblity. An API reference is available at [Sampling Parameters](sampling_params.md)."
|
||||
"You can also use the native `/generate` endpoint with requests, which provides more flexibility. An API reference is available at [Sampling Parameters](sampling_params.md)."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -378,7 +378,7 @@
|
||||
"\n",
|
||||
" Args:\n",
|
||||
" model_type (str): Type of model to parse reasoning from\n",
|
||||
" stream_reasoning (bool): If Flase, accumulates reasoning content until complete.\n",
|
||||
" stream_reasoning (bool): If False, accumulates reasoning content until complete.\n",
|
||||
" If True, streams reasoning content as it arrives.\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
"\n",
|
||||
"### Performance Highlights\n",
|
||||
"\n",
|
||||
"Please see below for the huge improvements on throughput for LLaMA-Instruct 3.1 8B tested on MT bench that can be archieved via EAGLE3 decoding.\n",
|
||||
"Please see below for the huge improvements on throughput for LLaMA-Instruct 3.1 8B tested on MT bench that can be achieved via EAGLE3 decoding.\n",
|
||||
"For further details please see the [EAGLE3 paper](https://arxiv.org/pdf/2503.01840).\n",
|
||||
"\n",
|
||||
"| Method | Throughput (tokens/s) |\n",
|
||||
@@ -296,7 +296,7 @@
|
||||
"- EAGLE-2 additionally uses the draft model to evaluate how probable certain branches in the draft tree are, dynamically stopping the expansion of unlikely branches. After the expansion phase, reranking is employed to select only the top `speculative_num_draft_tokens` final nodes as draft tokens.\n",
|
||||
"- EAGLE-3 removes the feature prediction objective, incorporates low and mid-layer features, and is trained in an on-policy manner.\n",
|
||||
"\n",
|
||||
"This enhances drafting accuracy by operating on the features instead of tokens for more regular inputs and passing the tokens from the next timestep additionaly to minimize randomness effects from sampling. Furthermore the dynamic adjustment of the draft tree and selection of reranked final nodes increases acceptance rate of draft tokens further. For more details see [EAGLE-2](https://arxiv.org/abs/2406.16858) and [EAGLE-3](https://arxiv.org/abs/2503.01840) paper.\n",
|
||||
"This enhances drafting accuracy by operating on the features instead of tokens for more regular inputs and passing the tokens from the next timestep additionally to minimize randomness effects from sampling. Furthermore the dynamic adjustment of the draft tree and selection of reranked final nodes increases acceptance rate of draft tokens further. For more details see [EAGLE-2](https://arxiv.org/abs/2406.16858) and [EAGLE-3](https://arxiv.org/abs/2503.01840) paper.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"For guidance how to train your own EAGLE model please see the [EAGLE repo](https://github.com/SafeAILab/EAGLE/tree/main?tab=readme-ov-file#train)."
|
||||
|
||||
@@ -52,7 +52,7 @@ docker run -itd --shm-size 32g --gpus all -v <volumes-to-mount> --ipc=host --net
|
||||
docker exec -it sglang_dev /bin/zsh
|
||||
```
|
||||
Some useful volumes to mount are:
|
||||
1. **Huggingface model cache**: mounting model cache can avoid re-download everytime docker restarts. Default location on Linux is `~/.cache/huggingface/`.
|
||||
1. **Huggingface model cache**: mounting model cache can avoid re-download every time docker restarts. Default location on Linux is `~/.cache/huggingface/`.
|
||||
2. **SGLang repository**: code changes in the SGLang local repository will be automatically synced to the .devcontainer.
|
||||
|
||||
Example 1: Mounting local cache folder `/opt/dlami/nvme/.cache` but not the SGLang repo. Use this when you prefer to manually transfer local code changes to the devcontainer.
|
||||
|
||||
@@ -29,7 +29,7 @@ Then follow https://github.com/sgl-project/sglang/settings/actions/runners/new?a
|
||||
|
||||
**Notes**
|
||||
- Do not need to specify the runner group
|
||||
- Give it a name (e.g., `test-sgl-gpu-0`) and some labels (e.g., `1-gpu-runner`). The labels can be editted later in Github Settings.
|
||||
- Give it a name (e.g., `test-sgl-gpu-0`) and some labels (e.g., `1-gpu-runner`). The labels can be edited later in Github Settings.
|
||||
- Do not need to change the work folder.
|
||||
|
||||
### Step 3: Run the runner by `run.sh`
|
||||
|
||||
@@ -32,7 +32,7 @@ python -m sglang_router.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-
|
||||
|
||||
After the server is ready, you can directly send requests to the router in the same way as sending requests to each single worker.
|
||||
|
||||
Please adjust the batchsize accordingly to archieve maximum throughput.
|
||||
Please adjust the batchsize accordingly to achieve maximum throughput.
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
@@ -375,7 +375,7 @@
|
||||
"\n",
|
||||
"When opening above experiment, we will see an overview of the experiment as shown below. The upper half shows a summary of the statistics on the left and charts to investigate the distribution and relationships of scores on the right. The lower half is a table with the individual traces which we can use to debug individual samples.\n",
|
||||
"\n",
|
||||
"When looking at the statistics, we can see that the accuracy of our RAG pipeline is 22% as measured by `answer_matches_target_llm_grader`. Though when checking the quality of our retrieval step (`context_query_relevancy`), we can see that our retrival step is fetching relevant information in only 27% of all samples. As shown in the GIF, we investigate the relationship between the two and see the two scores have 95% agreement. This confirms that the retrieval step is a major bottleneck for our RAG pipeline. So, now it's your turn to improve the retrieval step!\n",
|
||||
"When looking at the statistics, we can see that the accuracy of our RAG pipeline is 22% as measured by `answer_matches_target_llm_grader`. Though when checking the quality of our retrieval step (`context_query_relevancy`), we can see that our retrieval step is fetching relevant information in only 27% of all samples. As shown in the GIF, we investigate the relationship between the two and see the two scores have 95% agreement. This confirms that the retrieval step is a major bottleneck for our RAG pipeline. So, now it's your turn to improve the retrieval step!\n",
|
||||
"\n",
|
||||
"Note, above link isn't publicly accessible but the experiment can be accessed through [here](https://app.parea.ai/public-experiments/parea/rag_sglang/30f0244a-d56c-44ff-bdfb-8f47626304b6).\n",
|
||||
"\n",
|
||||
|
||||
@@ -147,3 +147,7 @@ exclude = [
|
||||
"scripts*",
|
||||
"tests*",
|
||||
]
|
||||
|
||||
[tool.codespell]
|
||||
ignore-words-list = "ans, als, hel, boostrap, childs, te, vas, hsa, ment"
|
||||
skip = "*.json,*.jsonl,*.patch,*.txt"
|
||||
|
||||
@@ -315,7 +315,7 @@ def throughput_test(
|
||||
tokenizer_id = server_args.tokenizer_path or server_args.model_path
|
||||
tokenizer = get_tokenizer(tokenizer_id)
|
||||
|
||||
# Set global environmnets
|
||||
# Set global environments
|
||||
set_ulimit()
|
||||
random.seed(bench_args.seed)
|
||||
np.random.seed(bench_args.seed)
|
||||
|
||||
@@ -1263,7 +1263,7 @@ async def benchmark(
|
||||
print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
|
||||
print(
|
||||
"{:<40} {:<10}".format(
|
||||
"Max reqeuest concurrency:",
|
||||
"Max request concurrency:",
|
||||
max_concurrency if max_concurrency else "not set",
|
||||
)
|
||||
)
|
||||
|
||||
@@ -129,7 +129,7 @@ def launch_server_process_and_send_one_request(
|
||||
|
||||
|
||||
def refine_server_args(server_args: ServerArgs, compile_args: CompileArgs):
|
||||
# Disbale cuda graph and torch compile to save time
|
||||
# Disable cuda graph and torch compile to save time
|
||||
server_args.disable_cuda_graph = True
|
||||
server_args.enable_torch_compile = False
|
||||
print(f"Disable CUDA Graph and Torch Compile to save time...")
|
||||
|
||||
@@ -38,7 +38,7 @@ def extract_prefix_by_tracing(program, backend):
|
||||
with TracingScope(tracer):
|
||||
tracer.ret_value = program.func(tracer, **arguments)
|
||||
except (StopTracing, TypeError, AttributeError):
|
||||
# Some exceptions may not be catched
|
||||
# Some exceptions may not be caught
|
||||
pass
|
||||
|
||||
# Run and cache prefix
|
||||
|
||||
@@ -27,7 +27,7 @@ completion_template_name = None
|
||||
|
||||
|
||||
class FimPosition:
|
||||
"""Postion of fim middle token."""
|
||||
"""Position of fim middle token."""
|
||||
|
||||
MIDDLE = auto()
|
||||
END = auto()
|
||||
|
||||
@@ -416,9 +416,9 @@ class DeepseekVLV2Processor(ProcessorMixin):
|
||||
h = w = math.ceil(
|
||||
(self.image_size // self.patch_size) / self.downsample_ratio
|
||||
)
|
||||
# global views tokens h * (w + 1), 1 is for line seperator
|
||||
# global views tokens h * (w + 1), 1 is for line separator
|
||||
tokenized_image = [self.image_token_id] * h * (w + 1)
|
||||
# add a seperator between global and local views
|
||||
# add a separator between global and local views
|
||||
tokenized_image += [self.image_token_id]
|
||||
# local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
|
||||
tokenized_image += (
|
||||
|
||||
@@ -509,7 +509,7 @@ class SchedulerDisaggregationDecodeMixin:
|
||||
def event_loop_overlap_disagg_decode(self: Scheduler):
|
||||
result_queue = deque()
|
||||
self.last_batch: Optional[ScheduleBatch] = None
|
||||
self.last_batch_in_queue = False # last batch is modifed in-place, so we need another variable to track if it's extend
|
||||
self.last_batch_in_queue = False # last batch is modified in-place, so we need another variable to track if it's extend
|
||||
|
||||
while True:
|
||||
recv_reqs = self.recv_requests()
|
||||
|
||||
@@ -54,7 +54,7 @@ class FakeKVSender(BaseKVSender):
|
||||
logger.info(f"FakeKVSender send success")
|
||||
else:
|
||||
self.has_sent = False
|
||||
logger.info(f"FakeKVSender send fake transfering")
|
||||
logger.info(f"FakeKVSender send fake transferring")
|
||||
|
||||
def failure_exception(self):
|
||||
raise Exception("Fake KVSender Exception")
|
||||
|
||||
@@ -363,7 +363,7 @@ class MooncakeKVManager(BaseKVManager):
|
||||
self.request_status[bootstrap_room] = KVPoll.WaitingForInput
|
||||
|
||||
def check_status(self, bootstrap_room: int):
|
||||
# TOOD: do we really need the poll()?
|
||||
# TODO: do we really need the poll()?
|
||||
|
||||
return self.request_status[bootstrap_room]
|
||||
|
||||
|
||||
@@ -112,7 +112,7 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
|
||||
|
||||
|
||||
def kv_to_page_indices(kv_indices: np.ndarray, page_size: int):
|
||||
# 1. The page is guaruanteed to be full except the last page.
|
||||
# 1. The page is guaranteed to be full except the last page.
|
||||
# 2. page index = kv_index // page_size
|
||||
# The return vector is kv_indices[::page_size] // page_size
|
||||
if page_size == 1: # shortcut
|
||||
|
||||
@@ -86,8 +86,8 @@ class StructureInfo:
|
||||
|
||||
_GetInfoFunc = Callable[[str], StructureInfo]
|
||||
"""
|
||||
helper alias of function
|
||||
ususally it is a function that takes a name string and returns a StructureInfo object,
|
||||
Helper alias of function
|
||||
Usually it is a function that takes a name string and returns a StructureInfo object,
|
||||
which can be used to construct a structural_tag object
|
||||
"""
|
||||
|
||||
|
||||
@@ -308,7 +308,7 @@ class FlashAttentionBackend(AttentionBackend):
|
||||
), "Sliding window and cross attention are not supported together"
|
||||
|
||||
self.forward_metadata: FlashAttentionMetadata = None
|
||||
# extra metdata for handling speculative decoding topk > 1, extended draft decode and verify
|
||||
# extra metadata for handling speculative decoding topk > 1, extended draft decode and verify
|
||||
self.forward_metadata_spec_decode_expand: FlashAttentionMetadata = None
|
||||
self.max_context_len = model_runner.model_config.context_len
|
||||
self.device = model_runner.device
|
||||
|
||||
@@ -919,7 +919,7 @@ def _fwd_kernel(
|
||||
|
||||
e_max = n_e_max
|
||||
|
||||
# stage 2: compute the trianlge part
|
||||
# stage 2: compute the triangle part
|
||||
|
||||
cur_block_m_end = tl.minimum(cur_seq_len_extend, (cur_block_m + 1) * BLOCK_M)
|
||||
for start_n in range(0, cur_block_m_end, BLOCK_N):
|
||||
|
||||
@@ -201,7 +201,7 @@ def _dp_gather(
|
||||
global_tokens, local_tokens, 0, local_start_pos, local_num_tokens, False
|
||||
)
|
||||
|
||||
# Input IDs are in int 32. We should use inplace_all_reduce for local case becaues of custom all reduce.
|
||||
# Input IDs are in int 32. We should use inplace_all_reduce for local case because of custom all reduce.
|
||||
NUM_GPUS_PER_NODE = 8
|
||||
if (
|
||||
not local_tokens.dtype.is_floating_point
|
||||
|
||||
@@ -76,7 +76,7 @@ class RMSNorm(CustomOp):
|
||||
residual: Optional[torch.Tensor] = None,
|
||||
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
|
||||
if not x.is_contiguous():
|
||||
# NOTE: Romove this if aiter kernel supports discontinuous input
|
||||
# NOTE: Remove this if aiter kernel supports discontinuous input
|
||||
x = x.contiguous()
|
||||
if residual is not None:
|
||||
fused_add_rms_norm(x, residual, self.weight.data, self.variance_epsilon)
|
||||
|
||||
@@ -116,7 +116,7 @@ def deepep_run_moe_deep_preprocess(topk_ids: torch.Tensor, num_experts: int):
|
||||
seg_indptr = torch.empty(num_experts + 1, device=topk_ids.device, dtype=torch.int64)
|
||||
src2dst = torch.empty(topk_ids.numel(), device=topk_ids.device, dtype=torch.int64)
|
||||
|
||||
# Find offet
|
||||
# Find offset
|
||||
expert_ids = torch.arange(
|
||||
num_experts + 1, device=topk_ids.device, dtype=reorder_topk_ids.dtype
|
||||
)
|
||||
|
||||
@@ -611,7 +611,7 @@ class Fp8EPMoEMethod(Fp8MoEMethod):
|
||||
self.quant_config.weight_block_size[1],
|
||||
)
|
||||
# NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
|
||||
# Required by collum parallel or enabling merged weights
|
||||
# Required by column parallel or enabling merged weights
|
||||
if intermediate_size % block_n != 0:
|
||||
raise ValueError(
|
||||
f"The output_size of gate's and up's weight = "
|
||||
|
||||
@@ -994,7 +994,7 @@ def get_default_config(
|
||||
"num_stages": 2 if _is_hip else 4,
|
||||
}
|
||||
else:
|
||||
# Block-wise quant: BLOCK_SIZE_K must be divisable by block_shape[1]
|
||||
# Block-wise quant: BLOCK_SIZE_K must be divisible by block_shape[1]
|
||||
config = {
|
||||
"BLOCK_SIZE_M": 64,
|
||||
"BLOCK_SIZE_N": block_shape[0],
|
||||
|
||||
@@ -270,7 +270,7 @@ def select_experts(
|
||||
routed_scaling_factor: Optional[float] = None,
|
||||
):
|
||||
n_share_experts_fusion = global_server_args_dict["n_share_experts_fusion"]
|
||||
# DeekSeek V2/V3/R1 serices models uses grouped_top_k
|
||||
# DeepSeek V2/V3/R1 series models use grouped_top_k
|
||||
if use_grouped_topk:
|
||||
assert topk_group is not None
|
||||
assert num_expert_group is not None
|
||||
|
||||
@@ -109,7 +109,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
|
||||
if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
|
||||
raise ValueError(
|
||||
f"{quantization} quantization requires some operators from vllm. "
|
||||
"Pleaes install vllm by `pip install vllm==0.8.4`"
|
||||
"Please install vllm by `pip install vllm==0.8.4`"
|
||||
)
|
||||
|
||||
return QUANTIZATION_METHODS[quantization]
|
||||
|
||||
@@ -152,7 +152,7 @@ class BlockInt8LinearMethod(LinearMethodBase):
|
||||
f"{input_size_per_partition} is not divisible by "
|
||||
f"weight quantization block_k = {block_k}."
|
||||
)
|
||||
# Required by collum parallel or enabling merged weights
|
||||
# Required by column parallel or enabling merged weights
|
||||
if (tp_size > 1 and output_size // output_size_per_partition == tp_size) or len(
|
||||
output_partition_sizes
|
||||
) > 1:
|
||||
@@ -285,7 +285,7 @@ class BlockInt8MoEMethod:
|
||||
self.quant_config.weight_block_size[1],
|
||||
)
|
||||
# NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
|
||||
# Required by collum parallel or enabling merged weights
|
||||
# Required by column parallel or enabling merged weights
|
||||
if intermediate_size % block_n != 0:
|
||||
raise ValueError(
|
||||
f"The output_size of gate's and up's weight = "
|
||||
|
||||
@@ -103,10 +103,10 @@ _INITIALIZATION_DICT: Dict[Tuple[DeepGemmKernelType, int, int, int], bool] = dic
|
||||
def _compile_warning_1():
|
||||
if not _IN_PRECOMPILE_STAGE and _IS_FIRST_RANK_ON_NODE:
|
||||
logger.warning(
|
||||
"Entering DeepGEMM JIT Pre-Complie session. "
|
||||
"Entering DeepGEMM JIT Pre-Compile session. "
|
||||
"And it may takes a long time(Typically 10-20 mins) "
|
||||
"if you have not run `sglang.compile_deep_gemm`. "
|
||||
"Recommand to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
|
||||
"It is recommended to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
|
||||
" for pre-compilation to reduce the overhead if you have not run it before. "
|
||||
"For example: "
|
||||
"`python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code`"
|
||||
@@ -115,7 +115,7 @@ def _compile_warning_1():
|
||||
|
||||
def _compile_warning_2():
|
||||
logger.warning(
|
||||
"Entering DeepGEMM JIT Single Kernel Complie session. "
|
||||
"Entering DeepGEMM JIT Single Kernel Compile session. "
|
||||
"And it will makes inference throughput becomes flaky. "
|
||||
"Please run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
|
||||
" for pre-compilation to solve this issue. "
|
||||
@@ -298,7 +298,7 @@ def _maybe_compile_deep_gemm_one_type_all(
|
||||
logger.info(
|
||||
f"Try DeepGEMM JIT Compiling for "
|
||||
f"<{kernel_helper.name}> N={n}, K={k}, num_groups={num_groups} with all Ms."
|
||||
f"{' It only takes a litte time (typically 1 sec) if you have run `python3 -m sglang.compile_deep_gemm`. ' if not _IN_PRECOMPILE_STAGE else ''}"
|
||||
f"{' It only takes a little time (typically 1 sec) if you have run `python3 -m sglang.compile_deep_gemm`. ' if not _IN_PRECOMPILE_STAGE else ''}"
|
||||
)
|
||||
|
||||
# NOTE(alcanderian): get_num_sms should be changed when 2-batch-overlap is introduced
|
||||
|
||||
@@ -235,7 +235,7 @@ class Fp8LinearMethod(LinearMethodBase):
|
||||
f"{input_size_per_partition} is not divisible by "
|
||||
f"weight quantization block_k = {block_k}."
|
||||
)
|
||||
# Required by collum parallel or enabling merged weights
|
||||
# Required by column parallel or enabling merged weights
|
||||
if (
|
||||
tp_size > 1 and output_size // output_size_per_partition == tp_size
|
||||
) or len(output_partition_sizes) > 1:
|
||||
@@ -491,7 +491,7 @@ class Fp8MoEMethod:
|
||||
self.quant_config.weight_block_size[1],
|
||||
)
|
||||
# NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
|
||||
# Required by collum parallel or enabling merged weights
|
||||
# Required by column parallel or enabling merged weights
|
||||
if intermediate_size % block_n != 0:
|
||||
raise ValueError(
|
||||
f"The output_size of gate's and up's weight = "
|
||||
|
||||
@@ -104,7 +104,7 @@ def _per_token_group_quant_fp8(
|
||||
y_s_ptr,
|
||||
# Stride of input
|
||||
y_stride,
|
||||
# Collums of input
|
||||
# Columns of input
|
||||
N,
|
||||
# Avoid division by zero
|
||||
eps,
|
||||
@@ -342,7 +342,7 @@ def _static_quant_fp8(
|
||||
y_s_repeat_ptr,
|
||||
# Stride of input
|
||||
y_stride,
|
||||
# Collums of input
|
||||
# Columns of input
|
||||
N,
|
||||
# Information for float8
|
||||
fp8_min,
|
||||
@@ -794,7 +794,7 @@ def w8a8_block_fp8_matmul(
|
||||
config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
|
||||
else:
|
||||
# Default config
|
||||
# Block-wise quant: BLOCK_SIZE_K must be divisable by block_size[1]
|
||||
# Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
|
||||
config = {
|
||||
"BLOCK_SIZE_M": 64,
|
||||
"BLOCK_SIZE_N": block_size[0],
|
||||
|
||||
@@ -76,7 +76,7 @@ def _per_token_group_quant_int8(
|
||||
y_s_ptr,
|
||||
# Stride of input
|
||||
y_stride,
|
||||
# Collums of input
|
||||
# Columns of input
|
||||
N,
|
||||
# Avoid division by zero
|
||||
eps,
|
||||
@@ -370,7 +370,7 @@ def w8a8_block_int8_matmul(
|
||||
config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
|
||||
else:
|
||||
# Default config
|
||||
# Block-wise quant: BLOCK_SIZE_K must be divisable by block_size[1]
|
||||
# Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
|
||||
config = {
|
||||
"BLOCK_SIZE_M": 64,
|
||||
"BLOCK_SIZE_N": block_size[0],
|
||||
|
||||
@@ -100,7 +100,7 @@ class LoRAManager:
|
||||
self.configs[name] = LoRAConfig(path)
|
||||
self.hf_target_names.update(self.configs[name].target_modules)
|
||||
|
||||
# Target lora weight names for lora_a and lora_b modules repectively.
|
||||
# Target lora weight names for lora_a and lora_b modules respectively.
|
||||
# e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj")}
|
||||
self.lora_weight_names: Set[Tuple[str]] = set(
|
||||
[get_stacked_name(module) for module in self.hf_target_names]
|
||||
|
||||
@@ -50,15 +50,15 @@ class LoRAMemoryPool:
|
||||
self.uid_to_buffer_id: Dict[Optional[str], int] = {}
|
||||
|
||||
# Buffer idx -> lora uid in memory pool
|
||||
# All uids are initalized as empty strings for empty buffer slots
|
||||
# Here we don't initalize to None since None is a valid uid
|
||||
# All uids are initialized as empty strings for empty buffer slots
|
||||
# Here we don't initialize to None since None is a valid uid
|
||||
self.buffer_id_to_uid: List[Optional[str]] = [""] * self.max_loras_per_batch
|
||||
|
||||
def get_lora_A_shape(
|
||||
self, module_name: str, base_model: torch.nn.Module
|
||||
) -> Tuple[int]:
|
||||
"""
|
||||
Given a module_name (might be a stacked name), return the hidden dims of modules's input and output.
|
||||
Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
|
||||
"""
|
||||
input_dim, _ = get_hidden_dim(module_name, self.base_hf_config, base_model)
|
||||
c = get_stacked_multiply(module_name)
|
||||
@@ -75,7 +75,7 @@ class LoRAMemoryPool:
|
||||
self, module_name: str, base_model: torch.nn.Module
|
||||
) -> Tuple[int]:
|
||||
"""
|
||||
Given a module_name (might be a stacked name), return the hidden dims of modules's input and output.
|
||||
Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
|
||||
"""
|
||||
_, output_dim = get_hidden_dim(module_name, self.base_hf_config, base_model)
|
||||
c = get_stacked_multiply(module_name)
|
||||
|
||||
@@ -77,7 +77,7 @@ def _gate_up_lora_b_kernel(
|
||||
k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
|
||||
)
|
||||
|
||||
# Iteate to compute the block in output matrix
|
||||
# Iterate to compute the block in output matrix
|
||||
partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
|
||||
for k in range(0, tl.cdiv(K, BLOCK_K)):
|
||||
x_tile = tl.load(
|
||||
|
||||
@@ -79,7 +79,7 @@ def _qkv_lora_b_kernel(
|
||||
k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
|
||||
)
|
||||
|
||||
# Iteate to compute the block in output matrix
|
||||
# Iterate to compute the block in output matrix
|
||||
partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
|
||||
for k in range(0, tl.cdiv(K, BLOCK_K)):
|
||||
x_tile = tl.load(
|
||||
|
||||
@@ -67,7 +67,7 @@ def _sgemm_lora_a_kernel(
|
||||
k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
|
||||
)
|
||||
|
||||
# Iteate to compute the block in output matrix
|
||||
# Iterate to compute the block in output matrix
|
||||
partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
|
||||
for k in range(0, tl.cdiv(K, BLOCK_K)):
|
||||
x_tile = tl.load(
|
||||
|
||||
@@ -69,7 +69,7 @@ def _sgemm_lora_b_kernel(
|
||||
k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
|
||||
)
|
||||
|
||||
# Iteate to compute the block in output matrix
|
||||
# Iterate to compute the block in output matrix
|
||||
partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
|
||||
for k in range(0, tl.cdiv(K, BLOCK_K)):
|
||||
x_tile = tl.load(
|
||||
|
||||
@@ -79,7 +79,7 @@ def get_hidden_dim(
|
||||
module_name: str, config: AutoConfig, base_model: torch.nn.Module
|
||||
) -> Tuple[int]:
|
||||
"""
|
||||
Given a module_name (might be a stacked name), return the hidden dims of modules's input and output.
|
||||
Given a module_name (might be a stacked name), return the hidden dims of the module's input and output.
|
||||
"""
|
||||
|
||||
if hasattr(base_model, "get_hidden_dim"):
|
||||
|
||||
@@ -210,7 +210,7 @@ class DataParallelController:
|
||||
)
|
||||
# compute zmq ports for this dp rank
|
||||
rank_port_args = PortArgs.init_new(server_args, dp_rank)
|
||||
# Data parallelism resues the tensor parallelism group,
|
||||
# Data parallelism reuses the tensor parallelism group,
|
||||
# so all dp ranks should use the same nccl port.
|
||||
rank_port_args.nccl_port = port_args.nccl_port
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""
|
||||
The definition of objects transfered between different
|
||||
The definition of objects transferred between different
|
||||
processes (TokenizerManager, DetokenizerManager, Controller).
|
||||
"""
|
||||
|
||||
|
||||
@@ -51,7 +51,7 @@ class MultiModalityDataPaddingPatternTokenPairs(MultiModalityDataPaddingPattern)
|
||||
self, input_ids: List[int], mm_inputs: MultimodalInputs
|
||||
) -> List[int]:
|
||||
"""
|
||||
This function will replace the data-tokens inbetween with pad_values accordingly
|
||||
This function will replace the data-tokens in between with pad_values accordingly
|
||||
"""
|
||||
pad_values = [item.pad_value for item in mm_inputs.mm_items]
|
||||
data_token_pairs = self.data_token_id_pairs
|
||||
|
||||
@@ -879,7 +879,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
|
||||
error_msg = (
|
||||
f"{phase_str} out of memory. Try to lower your batch size.\n"
|
||||
f"Try to allocate {num_tokens} tokens.\n"
|
||||
f"Avaliable tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
|
||||
f"Available tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
|
||||
)
|
||||
logger.error(error_msg)
|
||||
if self.tree_cache is not None:
|
||||
@@ -920,7 +920,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
|
||||
error_msg = (
|
||||
f"Prefill out of memory. Try to lower your batch size.\n"
|
||||
f"Try to allocate {extend_num_tokens} tokens.\n"
|
||||
f"Avaliable tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
|
||||
f"Available tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
|
||||
f"{self.token_to_kv_pool_allocator.available_size()=}\n"
|
||||
f"{self.tree_cache.evictable_size()=}\n"
|
||||
)
|
||||
@@ -955,7 +955,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
|
||||
error_msg = (
|
||||
f"Decode out of memory. Try to lower your batch size.\n"
|
||||
f"Try to allocate {len(seq_lens)} tokens.\n"
|
||||
f"Avaliable tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
|
||||
f"Available tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
|
||||
f"{self.token_to_kv_pool_allocator.available_size()=}\n"
|
||||
f"{self.tree_cache.evictable_size()=}\n"
|
||||
)
|
||||
|
||||
@@ -1325,7 +1325,7 @@ class Scheduler(
|
||||
return None
|
||||
|
||||
running_bs = len(self.running_batch.reqs)
|
||||
# Igore the check if self.chunked_req is not None.
|
||||
# Ignore the check if self.chunked_req is not None.
|
||||
# In the non-PP case, when self.chunked_req is not None, num_allocatable_reqs should always be greater than 0,
|
||||
# as the space for the chunked request has just been released.
|
||||
# In PP case, a chunked req can start in one microbatch and end in another microbatch, so the max_running_requests per microbatch should not be strict.
|
||||
|
||||
@@ -1273,7 +1273,7 @@ class TokenizerManager:
|
||||
self.model_update_result.set_result(recv_obj)
|
||||
else: # self.server_args.dp_size > 1
|
||||
self.model_update_tmp.append(recv_obj)
|
||||
# set future if the all results are recevied
|
||||
# set future if all the results are received
|
||||
if len(self.model_update_tmp) == self.server_args.dp_size:
|
||||
self.model_update_result.set_result(self.model_update_tmp)
|
||||
|
||||
|
||||
@@ -296,12 +296,12 @@ class CudaGraphRunner:
|
||||
self.capture()
|
||||
except RuntimeError as e:
|
||||
raise Exception(
|
||||
f"Capture cuda graph failed: {e}\n"
|
||||
f"Capture CUDA graph failed: {e}\n"
|
||||
"Possible solutions:\n"
|
||||
"1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
|
||||
"2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
|
||||
"3. disable torch compile by not using --enable-torch-compile\n"
|
||||
"4. disable cuda graph by --disable-cuda-graph. (Not recommonded. Huge perf loss)\n"
|
||||
"4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
|
||||
"Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
|
||||
)
|
||||
|
||||
|
||||
@@ -58,7 +58,7 @@ class ForwardMode(IntEnum):
|
||||
DECODE = auto()
|
||||
# Contains both EXTEND and DECODE when doing chunked prefill.
|
||||
MIXED = auto()
|
||||
# No sequence to forward. For data parallel attention, some workers wil be IDLE if no sequence are allocated.
|
||||
# No sequence to forward. For data parallel attention, some workers will be IDLE if no sequences are allocated.
|
||||
IDLE = auto()
|
||||
|
||||
# Used in speculative decoding: verify a batch in the target model.
|
||||
|
||||
@@ -188,7 +188,7 @@ def trunc_normal_tf_(
|
||||
best when :math:`a \\leq \text{mean} \\leq b`.
|
||||
NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
|
||||
bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
|
||||
and the result is subsquently scaled and shifted by the mean and std args.
|
||||
and the result is subsequently scaled and shifted by the mean and std args.
|
||||
Args:
|
||||
tensor: an n-dimensional `torch.Tensor`
|
||||
mean: the mean of the normal distribution
|
||||
@@ -735,7 +735,7 @@ class VisionTransformer(nn.Module):
|
||||
img_size: Input image size.
|
||||
patch_size: Patch size.
|
||||
in_chans: Number of image input channels.
|
||||
num_classes: Mumber of classes for classification head.
|
||||
num_classes: Number of classes for classification head.
|
||||
global_pool: Type of global pooling for final sequence (default: 'token').
|
||||
embed_dim: Transformer embedding dimension.
|
||||
depth: Depth of transformer.
|
||||
|
||||
@@ -1287,7 +1287,7 @@ class DeepseekV2DecoderLayer(nn.Module):
|
||||
# Fully Connected
|
||||
hidden_states = self.mlp(hidden_states)
|
||||
|
||||
# TODO(ch-wan): ues reduce-scatter in MLP to avoid this scatter
|
||||
# TODO(ch-wan): use reduce-scatter in MLP to avoid this scatter
|
||||
# Scatter
|
||||
if self.dp_size != 1:
|
||||
# important: forward batch.gathered_buffer is used both after scatter and after gather.
|
||||
@@ -1499,7 +1499,7 @@ class DeepseekV2ForCausalLM(nn.Module):
|
||||
else:
|
||||
assert (
|
||||
self.n_share_experts_fusion == self.tp_size
|
||||
), f"Shared experts fusion optimization is enabled in DeepSeek V3/R1, set it to {self.tp_size} can get best optimized performace."
|
||||
), f"Shared experts fusion optimization is enabled in DeepSeek V3/R1, set it to {self.tp_size} can get best optimized performance."
|
||||
elif self.n_share_experts_fusion == 0:
|
||||
if (
|
||||
_is_cuda
|
||||
@@ -1665,7 +1665,7 @@ class DeepseekV2ForCausalLM(nn.Module):
|
||||
if is_nextn:
|
||||
if hasattr(self.config, "num_nextn_predict_layers"):
|
||||
num_nextn_layers = self.config.num_nextn_predict_layers
|
||||
assert num_nextn_layers == 1, "Only 1 nextn layer is supportted"
|
||||
assert num_nextn_layers == 1, "Only 1 nextn layer is supported"
|
||||
# compatible with old design
|
||||
nextn_layer_id = (
|
||||
0
|
||||
|
||||
@@ -428,7 +428,7 @@ class Llama4DecoderLayer(nn.Module):
|
||||
# Fully Connected
|
||||
hidden_states = self.feed_forward(hidden_states, forward_batch)
|
||||
|
||||
# TODO(ch-wan): ues reduce-scatter in MLP to avoid this scatter
|
||||
# TODO(ch-wan): use reduce-scatter in MLP to avoid this scatter
|
||||
# Scatter
|
||||
if self.dp_size != 1:
|
||||
# important: forward batch.gathered_buffer is used both after scatter and after gather.
|
||||
|
||||
@@ -57,7 +57,7 @@ class RobertaEmbedding(nn.Module):
|
||||
input_shape = input_ids.size()
|
||||
inputs_embeds = self.word_embeddings(input_ids)
|
||||
|
||||
# adpated from vllm: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py
|
||||
# Adapted from vllm: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py
|
||||
|
||||
pos_list = []
|
||||
token_list = []
|
||||
|
||||
@@ -37,7 +37,7 @@ $ python3 -m sglang.bench_one_batch --correct \
|
||||
--tensor-parallel-size 2 \
|
||||
--disable-cuda-graph
|
||||
```
|
||||
We will eanble CUDA Graph support soon.
|
||||
We will enable CUDA Graph support soon.
|
||||
"""
|
||||
|
||||
import types
|
||||
|
||||
@@ -590,7 +590,7 @@ def v1_generate_response(
|
||||
echo = False
|
||||
|
||||
if (not isinstance(request, list)) and request.echo:
|
||||
# TODO: handle the case propmt is token ids
|
||||
# TODO: handle the case prompt is token ids
|
||||
if isinstance(request.prompt, list) and isinstance(request.prompt[0], str):
|
||||
# for the case of multiple str prompts
|
||||
prompts = request.prompt
|
||||
@@ -646,7 +646,7 @@ def v1_generate_response(
|
||||
finish_reason = ret_item["meta_info"]["finish_reason"]
|
||||
|
||||
if to_file:
|
||||
# to make the choise data json serializable
|
||||
# to make the choice data json serializable
|
||||
choice_data = {
|
||||
"index": 0,
|
||||
"text": text,
|
||||
|
||||
@@ -147,7 +147,7 @@ class ReasoningParser:
|
||||
|
||||
Args:
|
||||
model_type (str): Type of model to parse reasoning from
|
||||
stream_reasoning (bool): If Flase, accumulates reasoning content until complete.
|
||||
stream_reasoning (bool): If False, accumulates reasoning content until complete.
|
||||
If True, streams reasoning content as it arrives.
|
||||
"""
|
||||
|
||||
|
||||
@@ -294,7 +294,7 @@ class SamplingBatchInfo:
|
||||
# Set the flag to True if any of the two has custom logit processor
|
||||
self.has_custom_logit_processor = True
|
||||
|
||||
# Note: becasue the __len()__ operator is defined on the temperatures tensor,
|
||||
# Note: because the __len()__ operator is defined on the temperatures tensor,
|
||||
# please make sure any merge operation with len(self) or len(other) is done before
|
||||
# the merge operation of the temperatures tensor below.
|
||||
for item in [
|
||||
|
||||
@@ -825,7 +825,7 @@ class ServerArgs:
|
||||
# Multi-node distributed serving
|
||||
parser.add_argument(
|
||||
"--dist-init-addr",
|
||||
"--nccl-init-addr", # For backward compatbility. This will be removed in the future.
|
||||
"--nccl-init-addr", # For backward compatibility. This will be removed in the future.
|
||||
type=str,
|
||||
help="The host address for initializing distributed backend (e.g., `192.168.0.2:25000`).",
|
||||
)
|
||||
@@ -1096,7 +1096,7 @@ class ServerArgs:
|
||||
parser.add_argument(
|
||||
"--triton-attention-reduce-in-fp32",
|
||||
action="store_true",
|
||||
help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
|
||||
help="Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16."
|
||||
"This only affects Triton attention kernels.",
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -1188,7 +1188,7 @@ class ServerArgs:
|
||||
type=int,
|
||||
default=0,
|
||||
help="The number of shared_experts need to be replicated to fuse with normal experts in deepseek v3/r1, "
|
||||
"set it to tp_size can get best optimized performace.",
|
||||
"set it to tp_size can get best optimized performance.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--disable-chunked-prefix-cache",
|
||||
|
||||
@@ -82,12 +82,12 @@ class EAGLEDraftCudaGraphRunner:
|
||||
self.capture()
|
||||
except RuntimeError as e:
|
||||
raise Exception(
|
||||
f"Capture cuda graph failed: {e}\n"
|
||||
f"Capture CUDA graph failed: {e}\n"
|
||||
"Possible solutions:\n"
|
||||
"1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
|
||||
"2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
|
||||
"3. disable torch compile by not using --enable-torch-compile\n"
|
||||
"4. disable cuda graph by --disable-cuda-graph. (Not recommonded. Huge perf loss)\n"
|
||||
"4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
|
||||
"Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
|
||||
)
|
||||
|
||||
@@ -149,7 +149,7 @@ class EAGLEDraftCudaGraphRunner:
|
||||
|
||||
# Run and capture
|
||||
def run_once():
|
||||
# Backup two fileds, which will be modified in-place in `draft_forward`.
|
||||
# Backup two fields, which will be modified in-place in `draft_forward`.
|
||||
output_cache_loc_backup = forward_batch.out_cache_loc
|
||||
hidden_states_backup = forward_batch.spec_info.hidden_states
|
||||
|
||||
|
||||
@@ -167,12 +167,12 @@ class EagleVerifyOutput:
|
||||
draft_input: EagleDraftInput
|
||||
# Logit outputs from target worker
|
||||
logits_output: LogitsProcessorOutput
|
||||
# Accepeted token ids including the bonus token
|
||||
# Accepted token ids including the bonus token
|
||||
verified_id: torch.Tensor
|
||||
# Accepeted token length per sequence in a batch in CPU.
|
||||
# Accepted token length per sequence in a batch in CPU.
|
||||
accept_length_per_req_cpu: List[int]
|
||||
# Accepeted indices from logits_output.next_token_logits
|
||||
accepeted_indices: torch.Tensor
|
||||
# Accepted indices from logits_output.next_token_logits
|
||||
accepted_indices: torch.Tensor
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -316,7 +316,7 @@ class EagleVerifyInput:
|
||||
|
||||
This API updates values inside logits_output based on the accepted
|
||||
tokens. I.e., logits_output.next_token_logits only contains
|
||||
accepeted token logits.
|
||||
accepted token logits.
|
||||
"""
|
||||
bs = self.retrive_index.shape[0]
|
||||
candidates = self.draft_token.reshape(bs, self.draft_token_num)
|
||||
@@ -493,7 +493,7 @@ class EagleVerifyInput:
|
||||
logits_output=logits_output,
|
||||
verified_id=verified_id,
|
||||
accept_length_per_req_cpu=accept_length_cpu,
|
||||
accepeted_indices=accept_index,
|
||||
accepted_indices=accept_index,
|
||||
)
|
||||
else:
|
||||
assign_req_to_token_pool[(bs,)](
|
||||
@@ -539,7 +539,7 @@ class EagleVerifyInput:
|
||||
logits_output=logits_output,
|
||||
verified_id=verified_id,
|
||||
accept_length_per_req_cpu=accept_length_cpu,
|
||||
accepeted_indices=accept_index,
|
||||
accepted_indices=accept_index,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -201,7 +201,7 @@ class EAGLEWorker(TpModelWorker):
|
||||
self.has_prefill_wrapper_verify = False
|
||||
else:
|
||||
raise ValueError(
|
||||
f"EAGLE is not supportted in attention backend {self.server_args.attention_backend}"
|
||||
f"EAGLE is not supported in attention backend {self.server_args.attention_backend}"
|
||||
)
|
||||
|
||||
self.draft_model_runner.draft_attn_backend = self.draft_attn_backend
|
||||
@@ -245,8 +245,8 @@ class EAGLEWorker(TpModelWorker):
|
||||
Args:
|
||||
batch: The batch to run forward. The state of the batch is modified as it runs.
|
||||
Returns:
|
||||
A tuple of the final logit output of the target model, next tokens accepeted,
|
||||
the batch id (used for overlap schedule), and number of accepeted tokens.
|
||||
A tuple of the final logit output of the target model, next tokens accepted,
|
||||
the batch id (used for overlap schedule), and number of accepted tokens.
|
||||
"""
|
||||
if batch.forward_mode.is_decode():
|
||||
with self.draft_tp_context(self.draft_model_runner.tp_group):
|
||||
@@ -491,11 +491,11 @@ class EAGLEWorker(TpModelWorker):
|
||||
)
|
||||
|
||||
# Post process based on verified outputs.
|
||||
# Pick indices that we care (accepeted)
|
||||
# Pick indices that we care (accepted)
|
||||
logits_output.next_token_logits = logits_output.next_token_logits[
|
||||
res.accepeted_indices
|
||||
res.accepted_indices
|
||||
]
|
||||
logits_output.hidden_states = logits_output.hidden_states[res.accepeted_indices]
|
||||
logits_output.hidden_states = logits_output.hidden_states[res.accepted_indices]
|
||||
|
||||
# Prepare the batch for the next draft forwards.
|
||||
batch.forward_mode = ForwardMode.DECODE
|
||||
@@ -597,7 +597,7 @@ class EAGLEWorker(TpModelWorker):
|
||||
self.capture_for_decode(logits_output, forward_batch.spec_info)
|
||||
|
||||
def forward_draft_extend_after_decode(self, batch: ScheduleBatch):
|
||||
# Backup fileds that will be modified in-place
|
||||
# Backup fields that will be modified in-place
|
||||
seq_lens_backup = batch.seq_lens.clone()
|
||||
req_pool_indices_backup = batch.req_pool_indices
|
||||
accept_length_backup = batch.spec_info.accept_length
|
||||
|
||||
@@ -140,7 +140,7 @@ class ChatCompletionSampler(SamplerBase):
|
||||
max_tokens=self.max_tokens,
|
||||
)
|
||||
return response.choices[0].message.content
|
||||
# NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are reruning MMMU
|
||||
# NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are rerunning MMMU
|
||||
except openai.BadRequestError as e:
|
||||
print("Bad Request Error", e)
|
||||
return ""
|
||||
|
||||
@@ -121,7 +121,7 @@ class HumanEval(Eval):
|
||||
convo=convo,
|
||||
metrics={
|
||||
f"pass@{k}": estimate_pass_at_k([total], [correct], k)
|
||||
# this will be aggrated so no need of .mean()
|
||||
# this will be aggregated so no need of .mean()
|
||||
for k in self._ks_passes
|
||||
if total >= k
|
||||
},
|
||||
|
||||
@@ -370,7 +370,7 @@ def test_dtype_gen():
|
||||
@sgl.function
|
||||
def dtype_gen(s):
|
||||
s += "Q: What is the full name of DNS?\n"
|
||||
s += "A: The full nams is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n"
|
||||
s += "A: The full names is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n"
|
||||
s += "Q: Which year was DNS invented?\n"
|
||||
s += "A: " + sgl.gen("int_res", dtype=int) + "\n"
|
||||
s += "Q: What is the value of pi?\n"
|
||||
|
||||
@@ -278,7 +278,7 @@ def graceful_registry(sub_module_name: str):
|
||||
f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown..."
|
||||
)
|
||||
if signum == signal.SIGTERM:
|
||||
logger.info(f"{sub_module_name} recive sigterm")
|
||||
logger.info(f"{sub_module_name} receive sigterm")
|
||||
|
||||
signal.signal(signal.SIGTERM, graceful_shutdown)
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ pip install -e "python[all]"
|
||||
pip install torch_memory_saver
|
||||
pip install transformers==4.51.0 sentence_transformers accelerate peft pandas datasets timm torchaudio==2.6.0
|
||||
|
||||
# For compling xgrammar kernels
|
||||
# For compiling xgrammar kernels
|
||||
pip install cuda-python nvidia-cuda-nvrtc-cu12
|
||||
|
||||
# For lmms_evals evaluating MMMU
|
||||
|
||||
@@ -43,7 +43,7 @@ pip install -e "python[all]"
|
||||
pip install torch_memory_saver
|
||||
pip install transformers==4.51.0 sentence_transformers accelerate peft pandas datasets timm torchaudio==2.6.0
|
||||
|
||||
# For compling xgrammar kernels
|
||||
# For compiling xgrammar kernels
|
||||
pip install cuda-python nvidia-cuda-nvrtc-cu12
|
||||
|
||||
# For lmms_evals evaluating MMMU
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
"""
|
||||
Convert Yi-VL config into a format useable with SGLang
|
||||
Convert Yi-VL config into a format usable with SGLang
|
||||
|
||||
Usage: python3 scripts/convert_yi_vl.py --model-path <path-to-model>
|
||||
"""
|
||||
|
||||
@@ -90,7 +90,7 @@ def export_nextn_layer_parameters(input_dir, output_dir, nextn_layer_id):
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Export NextN layer paramerters for DeepSeek-V3/R1"
|
||||
description="Export NextN layer parameters for DeepSeek-V3/R1"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input-dir",
|
||||
|
||||
@@ -114,7 +114,7 @@ set(SGL_KERNEL_CUDA_FLAGS
|
||||
"--expt-extended-lambda"
|
||||
"--threads=32"
|
||||
|
||||
# Supress warnings
|
||||
# Suppress warnings
|
||||
"-Xcompiler=-Wconversion"
|
||||
"-Xcompiler=-fno-strict-aliasing"
|
||||
|
||||
|
||||
@@ -87,7 +87,7 @@ Third-party libraries:
|
||||
|
||||
The main difference between sm80/sm87 and sm86/sm89 is the shared memory size. You can follow the link below for more information: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x.
|
||||
|
||||
And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a. Thats mean if you use **A100(tested)**/A*0/**L20(tested)**/L40/L40s/**3090(tested)** you can use fa3.
|
||||
And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a. That means if you use **A100(tested)**/A*0/**L20(tested)**/L40/L40s/**3090(tested)** you can use fa3.
|
||||
|
||||
### Kernel Development
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ def _per_token_group_quant_8bit(
|
||||
y_s_ptr,
|
||||
# Stride of input
|
||||
y_stride,
|
||||
# Collums of input
|
||||
# Columns of input
|
||||
N,
|
||||
# Avoid division by zero
|
||||
eps,
|
||||
|
||||
@@ -49,7 +49,7 @@ namespace {
|
||||
|
||||
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
|
||||
#define CHECK_LAST_DIM_CONTIGUOUS(x) \
|
||||
TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimention")
|
||||
TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimension")
|
||||
|
||||
#define CHECK_INPUT(x) \
|
||||
CHECK_CPU(x); \
|
||||
|
||||
@@ -718,7 +718,7 @@ void decode_attention_kernel_impl(
|
||||
|
||||
m_prime = m_i;
|
||||
|
||||
// caculate V' <- s_delta @ V + V' * m_delta
|
||||
// calculate V' <- s_delta @ V + V' * m_delta
|
||||
index_gemm_kernel_nn<scalar_t, index_t>(
|
||||
/* A */ s_delta,
|
||||
/* B */ v_buffer + head_id * v_strideH,
|
||||
@@ -925,7 +925,7 @@ void decode_attention_grouped_kernel_impl(
|
||||
m_prime[h] = m_i;
|
||||
}
|
||||
|
||||
// caculate V' <- s_delta @ V + V' * m_delta
|
||||
// calculate V' <- s_delta @ V + V' * m_delta
|
||||
index_gemm_kernel_nn<scalar_t, index_t>(
|
||||
/* A */ s_delta,
|
||||
/* B */ v_buffer + head_kv_id * v_strideH,
|
||||
|
||||
@@ -323,7 +323,7 @@ void extend_attention_kernel_impl(
|
||||
/* ld_src */ v_strideN,
|
||||
/* ld_dst */ head_size_v);
|
||||
|
||||
// caculate V' <- s_delta @ V + V'
|
||||
// calculate V' <- s_delta @ V + V'
|
||||
at::native::cpublas::brgemm(
|
||||
/* M */ m_size,
|
||||
/* N */ head_size_v,
|
||||
@@ -434,7 +434,7 @@ void extend_attention_kernel_impl(
|
||||
/* ld_src */ ve_strideN,
|
||||
/* ld_dst */ head_size_v);
|
||||
|
||||
// caculate V' <- s_delta @ V + V'
|
||||
// calculate V' <- s_delta @ V + V'
|
||||
at::native::cpublas::brgemm(
|
||||
/* M */ m_size,
|
||||
/* N */ head_size_v,
|
||||
|
||||
@@ -79,7 +79,7 @@ void fused_experts_int8_kernel_impl(
|
||||
int64_t topk,
|
||||
int64_t num_tokens_post_pad);
|
||||
|
||||
// shared expert implememntation for int8 w8a8
|
||||
// shared expert implementation for int8 w8a8
|
||||
template <typename scalar_t>
|
||||
void shared_expert_int8_kernel_impl(
|
||||
scalar_t* __restrict__ output,
|
||||
|
||||
@@ -51,7 +51,7 @@ struct tinygemm_kernel_nn<at::BFloat16, has_bias, BLOCK_M, BLOCK_N> {
|
||||
__m512 vd0;
|
||||
__m512 vd1[COLS];
|
||||
|
||||
// oops! 4x4 spills but luckly we use 4x2
|
||||
// oops! 4x4 spills but we use 4x2
|
||||
__m512 vbias[COLS];
|
||||
|
||||
// [NOTE]: s8s8 igemm compensation in avx512-vnni
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
// This is only a pluggin used for flashinfer 0.1.6. The new version does not need it.
|
||||
// This is only a plugin used for flashinfer 0.1.6. The new version does not need it.
|
||||
/*
|
||||
* Copyright (c) 2025 by SGLang team.
|
||||
* Copyright (c) 2025 by FlashInfer team.
|
||||
|
||||
@@ -20,16 +20,16 @@ limitations under the License.
|
||||
#include <torch/library.h>
|
||||
|
||||
/**
|
||||
* Unforunately, the type signatures of the flash_attn ops are not compatible
|
||||
* Unfortunately, the type signatures of the flash_attn ops are not compatible
|
||||
* with the PyTorch library bindings. To get around that we use
|
||||
* `make_pytorch_shim` which creates a lambda that exponses the API using
|
||||
* `make_pytorch_shim` which creates a lambda that exposes the API using
|
||||
* PyTorch compatible types to the types, then converts them to the types
|
||||
* expected by the flash_attn ops. This shim allows us to make minimal changes
|
||||
* to `flash_api.cpp` making it easier to synchronize with upstream changes.
|
||||
*
|
||||
* The `pytorch_library_compatible_type` struct is used to map from the
|
||||
* flash_attn ops types to a PyTorch library compatible one. The main issue is
|
||||
* that the following types are not support by PyTorch libary bindings:
|
||||
* that the following types are not supported by PyTorch library bindings:
|
||||
* - `int`
|
||||
* - `float`
|
||||
* - `std::optional<T> &`
|
||||
|
||||
@@ -229,7 +229,7 @@ def apply_rope_with_cos_sin_cache_inplace(
|
||||
Whether to use Neox style RoPE, default: ``True``.
|
||||
|
||||
* If ``True``, the last dimension of the query/key tensor is not interleaved, i.e.,
|
||||
we rorate the first half dimensions ``([..., :head_dim//2])`` and the second half
|
||||
we rotate the first half dimensions ``([..., :head_dim//2])`` and the second half
|
||||
dimensions ``([..., head_dim//2:])``.
|
||||
|
||||
* If ``False``, the last dimension of the query/key tensor is interleaved, i.e.,
|
||||
|
||||
@@ -17,7 +17,7 @@ def is_fa3_supported(device=None) -> bool:
|
||||
# Between sm80/sm87 and sm86/sm89 is the shared memory size. you can follow the link below for more information
|
||||
# https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x
|
||||
# And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a.
|
||||
# Thats mean if you use A100/A*0/L20/L40/L40s/4090 you can use fa3.
|
||||
# That means if you use A100/A*0/L20/L40/L40s/4090 you can use fa3.
|
||||
return (
|
||||
torch.cuda.get_device_capability(device)[0] == 9
|
||||
or torch.cuda.get_device_capability(device)[0] == 8
|
||||
|
||||
@@ -45,10 +45,10 @@ def moe_fused_gate(
|
||||
):
|
||||
# This fused kernel function is used to select topk expert in a hierarchical 2-layer fashion
|
||||
# it split group of expert into num_expert_group, and use top2 expert weight sum in each group
|
||||
# as the group weight to select exerpt groups and then select topk experts within the selected groups
|
||||
# as the group weight to select expert groups and then select topk experts within the selected groups
|
||||
# the #experts is decided by the input tensor shape and we currently only support power of 2 #experts
|
||||
# and #experts should be divisible by num_expert_group. #expert/num_expert_group <= 32 is limitted for now.
|
||||
# for non-supported case, we suggestion to use the biased_grouped_topk func in sglang.srt.layers.moe.topk
|
||||
# and #experts should be divisible by num_expert_group. #expert/num_expert_group <= 32 is limited for now.
|
||||
# for non-supported cases, we suggest using the biased_grouped_topk func in sglang.srt.layers.moe.topk
|
||||
# n_share_experts_fusion: if > 0, the last expert will be replaced with a round-robin shared expert
|
||||
# routed_scaling_factor: if > 0, the last expert will be scaled by this factor
|
||||
return torch.ops.sgl_kernel.moe_fused_gate.default(
|
||||
|
||||
@@ -24,7 +24,7 @@ def is_fa3_supported(device=None) -> bool:
|
||||
# Between sm80/sm87 and sm86/sm89 is the shared memory size. you can follow the link below for more information
|
||||
# https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x
|
||||
# And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a.
|
||||
# Thats mean if you use A100/A*0/L20/L40/L40s/4090 you can use fa3.
|
||||
# That means if you use A100/A*0/L20/L40/L40s/4090 you can use fa3.
|
||||
return (
|
||||
torch.cuda.get_device_capability(device)[0] == 9
|
||||
or torch.cuda.get_device_capability(device)[0] == 8
|
||||
|
||||
@@ -21,7 +21,7 @@ def _per_token_group_quant_fp8(
|
||||
y_s_ptr,
|
||||
# Stride of input
|
||||
y_stride,
|
||||
# Collums of input
|
||||
# Columns of input
|
||||
N,
|
||||
# Avoid division by zero
|
||||
eps,
|
||||
|
||||
@@ -1070,7 +1070,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_utf8_split_seq() {
|
||||
// The string should be indexed and splitted by a utf-8 value basis instead of byte basis
|
||||
// The string should be indexed and split by a utf-8 value basis instead of byte basis
|
||||
// use .chars() to get the iterator of the utf-8 value
|
||||
let tree = Arc::new(Tree::new());
|
||||
|
||||
|
||||
@@ -433,7 +433,7 @@ class TestOpenAIServer(CustomTestCase):
|
||||
)
|
||||
|
||||
def test_completion_stream(self):
|
||||
# parallel sampling adn list input are not supported in streaming mode
|
||||
# parallel sampling and list input are not supported in streaming mode
|
||||
for echo in [False, True]:
|
||||
for logprobs in [None, 5]:
|
||||
for use_list_input in [True, False]:
|
||||
|
||||
@@ -161,7 +161,7 @@ class TestSessionControl(CustomTestCase):
|
||||
]
|
||||
)
|
||||
|
||||
# query with a non-existing rid (the last one should be disappeared becuase of backtrack), should see abort
|
||||
# query with a non-existing rid (the last one should have disappeared because of backtrack), should see abort
|
||||
response = requests.post(
|
||||
self.base_url + "/generate",
|
||||
json={
|
||||
@@ -668,7 +668,7 @@ class TestSessionControlVision(CustomTestCase):
|
||||
).json()
|
||||
outputs_from_session.append(response["text"])
|
||||
|
||||
# query with a non-existing rid (the last one should be disappeared becuase of backtrack), should see abort
|
||||
# query with a non-existing rid (the last one should have disappeared because of backtrack), should see abort
|
||||
response = requests.post(
|
||||
self.base_url + "/generate",
|
||||
json={
|
||||
|
||||
@@ -295,7 +295,7 @@ class TestSRTEndpoint(CustomTestCase):
|
||||
print(f"{output_top_logprobs=}")
|
||||
|
||||
# Parse results
|
||||
# This is becaues the grammar constraint allows all prefix tokens
|
||||
# This is because the grammar constraint allows all prefix tokens
|
||||
logprobs = [None] * 2
|
||||
for i in range(len(output_top_logprobs)):
|
||||
try:
|
||||
|
||||
@@ -8,7 +8,7 @@ class TestSRTEngineWithQuantArgs(CustomTestCase):
|
||||
|
||||
def test_1_quantization_args(self):
|
||||
|
||||
# we only test fp8 because other methods are currenly depend on vllm. We can add other methods back to test after vllm depency is resolved.
|
||||
# we only test fp8 because other methods are currently dependent on vllm. We can add other methods back to test after vllm dependency is resolved.
|
||||
quantization_args_list = [
|
||||
# "awq",
|
||||
"fp8",
|
||||
|
||||
@@ -116,7 +116,7 @@ class VisionLLMLogitsBase(unittest.IsolatedAsyncioTestCase):
|
||||
}},
|
||||
{{
|
||||
"type": "text",
|
||||
"text": "Whats in this picture?"
|
||||
"text": "What's in this picture?"
|
||||
}}
|
||||
]
|
||||
}}
|
||||
|
||||
Reference in New Issue
Block a user