minor: Add basic editorconfig and pre-commit hooks to enforce style for whitespaces (#1926)
This commit is contained in:
@@ -134,4 +134,4 @@ def method_has_implemented_embedding(
|
||||
class_embedding = inspect.getattr_static(method_class, "embedding", None)
|
||||
|
||||
return (class_embedding is not None
|
||||
and class_embedding is not base_embedding)
|
||||
and class_embedding is not base_embedding)
|
||||
|
||||
@@ -311,7 +311,7 @@ class VocabParallelEmbedding(torch.nn.Module):
|
||||
def get_sharded_to_full_mapping(self) -> Optional[List[int]]:
|
||||
"""Get a mapping that can be used to reindex the gathered
|
||||
logits for sampling.
|
||||
|
||||
|
||||
During sampling, we gather logits from all ranks. The relationship
|
||||
of index->token_id will follow the same format as outlined in the class
|
||||
docstring. However, after the gather, we want to reindex the final
|
||||
@@ -483,4 +483,4 @@ class ParallelLMHead(VocabParallelEmbedding):
|
||||
|
||||
def forward(self, input_):
|
||||
del input_
|
||||
raise RuntimeError("LMHead's weights should be used in the sampler.")
|
||||
raise RuntimeError("LMHead's weights should be used in the sampler.")
|
||||
|
||||
@@ -838,7 +838,7 @@ class Scheduler:
|
||||
time_per_output_tokens_iter: List[float] = []
|
||||
|
||||
# Request stats
|
||||
# Decode
|
||||
# Decode
|
||||
gen_throughput: float = 0.0
|
||||
# Latency
|
||||
time_e2e_requests: List[float] = []
|
||||
@@ -866,11 +866,11 @@ class Scheduler:
|
||||
time_waiting_requests.append(req.queued_time - req.created_time)
|
||||
num_prompt_tokens_requests.append(len(req.origin_input_ids))
|
||||
num_generation_tokens_requests.append(len(req.output_ids))
|
||||
finished_reason_requests.append(
|
||||
finished_reason_requests.append(
|
||||
req.finished_reason.to_json()
|
||||
if req.finished_reason is not None
|
||||
else None)
|
||||
|
||||
|
||||
return Stats(
|
||||
new_seq=new_seq,
|
||||
num_running_req=num_running_req,
|
||||
|
||||
@@ -384,7 +384,7 @@ class TokenizerManager:
|
||||
obj.load_format = self.server_args.load_format
|
||||
|
||||
if not self.model_update_lock.locked():
|
||||
|
||||
|
||||
async with self.model_update_lock:
|
||||
# wait for the previous generation requests to finish
|
||||
while len(self.rid_to_state) > 0:
|
||||
|
||||
@@ -151,7 +151,7 @@ class Metrics:
|
||||
0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.04, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
|
||||
1.0, 2.5
|
||||
])
|
||||
|
||||
|
||||
# Request Stats
|
||||
# Metadata
|
||||
self.num_prompt_tokens_requests = Histogram(
|
||||
@@ -253,7 +253,7 @@ class PrometheusMetricsCollector(MetricsCollector):
|
||||
stats.time_to_first_tokens_iter)
|
||||
self._log_histogram(self.metrics.histogram_time_per_output_token,
|
||||
stats.time_per_output_tokens_iter)
|
||||
|
||||
|
||||
# self._log_gauge(self.metrics.gpu_cache_usage_sys, stats.gpu_cache_usage_sys)
|
||||
self._log_gauge(self.metrics.num_running_sys, stats.num_running_req)
|
||||
self._log_gauge(self.metrics.num_waiting_sys, stats.num_waiting_req)
|
||||
@@ -294,4 +294,4 @@ def build_1_2_5_buckets(max_value: int) -> List[int]:
|
||||
buckets.append(value)
|
||||
else:
|
||||
return buckets
|
||||
exponent += 1
|
||||
exponent += 1
|
||||
|
||||
@@ -54,4 +54,4 @@ class Stats:
|
||||
num_prompt_tokens_iter: int = 0
|
||||
num_generation_tokens_iter: int = 0
|
||||
time_to_first_tokens_iter: List[float] = field(default_factory=list)
|
||||
time_per_output_tokens_iter: List[float] = field(default_factory=list)
|
||||
time_per_output_tokens_iter: List[float] = field(default_factory=list)
|
||||
|
||||
@@ -17,7 +17,7 @@ limitations under the License.
|
||||
"""
|
||||
Utilities for multi-modal models.
|
||||
|
||||
This python file mainly contains utilities that were used in the
|
||||
This python file mainly contains utilities that were used in the
|
||||
image processing logic of llava-next including operations such as
|
||||
anyres and anyres_max
|
||||
|
||||
|
||||
@@ -136,7 +136,7 @@ class GPT2Block(nn.Module):
|
||||
layer_id: int,
|
||||
config: GPT2Config,
|
||||
cache_config = None,
|
||||
|
||||
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
@@ -284,4 +284,4 @@ class GPT2LMHeadModel(nn.Module):
|
||||
default_weight_loader)
|
||||
weight_loader(param, loaded_weight)
|
||||
|
||||
EntryClass = GPT2LMHeadModel
|
||||
EntryClass = GPT2LMHeadModel
|
||||
|
||||
0
python/sglang/srt/models/olmo.py
Executable file → Normal file
0
python/sglang/srt/models/olmo.py
Executable file → Normal file
@@ -57,27 +57,27 @@ logger = init_logger(__name__)
|
||||
|
||||
class Qwen2VLImageInputs(TypedDict):
|
||||
pixel_values: torch.Tensor
|
||||
"""Shape:
|
||||
"""Shape:
|
||||
`(num_patches, num_channels * patch_size * patch_size)`
|
||||
"""
|
||||
|
||||
image_grid_thw: torch.Tensor
|
||||
"""Shape: `(num_images, 3)`
|
||||
|
||||
|
||||
This should be in `(grid_t, grid_h, grid_w)` format.
|
||||
"""
|
||||
|
||||
|
||||
class Qwen2VLVideoInputs(TypedDict):
|
||||
pixel_values_videos: torch.Tensor
|
||||
"""Shape:
|
||||
`(num_patches,
|
||||
"""Shape:
|
||||
`(num_patches,
|
||||
num_channels * temporal_patch_size * patch_size * patch_size)`
|
||||
"""
|
||||
|
||||
video_grid_thw: torch.Tensor
|
||||
"""Shape: `(num_videos, 3)`
|
||||
|
||||
|
||||
This should be in `(grid_t, grid_h, grid_w)` format.
|
||||
"""
|
||||
|
||||
|
||||
@@ -759,7 +759,7 @@ class Engine:
|
||||
|
||||
# before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
|
||||
atexit.register(self.shutdown)
|
||||
|
||||
|
||||
# runtime server default log level is log
|
||||
# offline engine works in scripts, so we set it to error
|
||||
|
||||
|
||||
Reference in New Issue
Block a user