Add minimal vLLM 0.16.1 build repo for BI-V150
This commit is contained in:
281
vllm/kernels/helion/config_manager.py
Normal file
281
vllm/kernels/helion/config_manager.py
Normal file
@@ -0,0 +1,281 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Configuration management for Helion kernels.
|
||||
|
||||
This module provides centralized configuration file management for Helion custom
|
||||
operations, including naming conventions, directory resolution, and file I/O.
|
||||
|
||||
Config File Structure
|
||||
---------------------
|
||||
Each kernel has a single JSON config file: {kernel_name}.json
|
||||
|
||||
The file uses a simplified 2-layer hierarchical structure:
|
||||
{
|
||||
"h100": { # GPU platform
|
||||
"default": { ... }, # Fallback configuration
|
||||
"batch_32_hidden_4096": { ... },
|
||||
"batch_64_hidden_8192": { ... }
|
||||
},
|
||||
"a100": {
|
||||
"default": { ... },
|
||||
"batch_16_hidden_2048": { ... }
|
||||
}
|
||||
}
|
||||
|
||||
Example file: silu_mul_fp8.json
|
||||
|
||||
Config keys should be structured strings that encode the relevant
|
||||
parameters (e.g., "batch_32_hidden_4096", "seq_512_heads_16", "fp8_batch_64", etc.).
|
||||
|
||||
Classes
|
||||
-------
|
||||
- ConfigSet: In-memory collection of configs for a kernel with lookup/query APIs.
|
||||
- ConfigManager: File-level operations for config persistence.
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.import_utils import has_helion
|
||||
|
||||
# Fail fast at import time: this module imports ``helion`` unconditionally
# right below, so surface a clear installation hint instead of a bare
# ModuleNotFoundError when the optional dependency is missing.
if not has_helion():
    raise ImportError(
        "ConfigManager requires helion to be installed. "
        "Install it with: pip install helion"
    )
|
||||
|
||||
import helion
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class ConfigSet:
    """In-memory collection of Helion configs with lookup/query capabilities.

    Configs live in a two-level mapping ``platform -> config_key -> config``.
    Platform names are handled case-insensitively: every method lower-cases
    the platform before touching the mapping, so ``"H100"`` and ``"h100"``
    always address the same entry. Config keys are used verbatim.
    """

    # Type alias for nested config structure:
    # platform -> config_key -> helion.Config
    _ConfigDict = dict[str, dict[str, "helion.Config"]]

    def __init__(self, kernel_name: str):
        """Create an empty config set for ``kernel_name``."""
        self._kernel_name = kernel_name
        self._configs: ConfigSet._ConfigDict = {}

    @staticmethod
    def _normalize_platform(platform: str) -> str:
        """Return the canonical (lower-case) form used as the storage key."""
        return platform.lower()

    @property
    def kernel_name(self) -> str:
        """Name of the kernel this config set belongs to."""
        return self._kernel_name

    def get_config(self, platform: str, config_key: str) -> "helion.Config":
        """Return the config stored for ``platform`` / ``config_key``.

        Raises:
            KeyError: if the platform is unknown, or if the platform is known
                but has no entry for ``config_key``. The message lists the
                available platforms / config keys.
        """
        # Normalize like set_config/has_config/get_config_keys do, so a config
        # stored via set_config("H100", ...) is retrievable as "H100" too.
        platform_dict = self._configs.get(self._normalize_platform(platform))
        if platform_dict is None:
            avail_platforms = self.get_platforms()
            # TODO(@gmagogsfm): add a CLI/env override flag so users can
            # directly specify a platform name instead of relying on
            # auto-detection, and suggest it in this error message.
            raise KeyError(
                f"Config not found for kernel '{self._kernel_name}': "
                f"platform '{platform}' not found. "
                f"Available platforms: {avail_platforms or '(none)'}. "
                f"If your GPU is a variant of a supported platform, "
                f"consider adding a mapping in _GPU_NAME_ALIASES in "
                f"vllm/kernels/helion/utils.py, or run "
                f"scripts/autotune_helion_kernels.py to generate configs "
                f"for your platform."
            )

        config = platform_dict.get(config_key)
        if config is None:
            avail_keys = self.get_config_keys(platform)
            raise KeyError(
                f"Config not found for kernel '{self._kernel_name}': "
                f"config_key '{config_key}' not found for platform '{platform}'. "
                f"Available config_keys: {avail_keys or '(none)'}"
            )

        return config

    def get_platforms(self) -> list[str]:
        """Return the sorted list of platform names that have configs."""
        return sorted(self._configs.keys())

    def get_config_keys(self, platform: str) -> list[str]:
        """Return the sorted config keys for ``platform`` (empty if unknown)."""
        platform_dict = self._configs.get(self._normalize_platform(platform))
        if platform_dict is None:
            return []
        return sorted(platform_dict.keys())

    def to_dict(self) -> dict[str, Any]:
        """Convert the whole set to plain JSON-compatible data.

        Each config is round-tripped through ``config.to_json()`` and
        ``json.loads`` so the result holds only plain data, not config objects.
        """
        return {
            platform: {
                config_key: json.loads(config.to_json())
                for config_key, config in config_keys_dict.items()
            }
            for platform, config_keys_dict in self._configs.items()
        }

    @classmethod
    def from_dict(cls, kernel_name: str, data: dict[str, Any]) -> "ConfigSet":
        """Build a config set from ``platform -> config_key -> kwargs`` data.

        Every leaf mapping is passed to ``helion.Config(**config_data)``.
        Platform names are lower-cased on load; entries whose platform names
        differ only by case are merged into one platform.
        """
        config_set = cls(kernel_name)
        count = 0

        for platform, platform_data in data.items():
            # Store under the normalized name so the lower-casing accessors
            # (get_config, get_config_keys, has_config) can find the entries.
            platform_configs = config_set._configs.setdefault(
                cls._normalize_platform(platform), {}
            )

            for config_key, config_data in platform_data.items():
                platform_configs[config_key] = helion.Config(**config_data)
                count += 1

        if count > 0:
            logger.debug(
                "Loaded %d configs for kernel '%s'",
                count,
                kernel_name,
            )

        return config_set

    def set_config(
        self, platform: str, config_key: str, config: "helion.Config"
    ) -> None:
        """Store ``config`` for ``platform`` / ``config_key``, replacing any
        existing entry."""
        platform = self._normalize_platform(platform)
        self._configs.setdefault(platform, {})[config_key] = config
        logger.debug(
            "Set config for kernel '%s': platform='%s', key='%s'",
            self._kernel_name,
            platform,
            config_key,
        )

    def has_config(self, platform: str, config_key: str) -> bool:
        """Return True if a config exists for ``platform`` / ``config_key``."""
        platform_dict = self._configs.get(self._normalize_platform(platform))
        if platform_dict is None:
            return False
        return config_key in platform_dict
|
||||
|
||||
|
||||
class ConfigManager:
|
||||
"""File-level configuration management for Helion kernels (global singleton)."""
|
||||
|
||||
_instance: "ConfigManager | None" = None
|
||||
_instance_base_dir: Path | None = None
|
||||
|
||||
def __new__(cls, base_dir: str | Path | None = None) -> "ConfigManager":
|
||||
resolved_base_dir = cls._resolve_base_dir(base_dir)
|
||||
|
||||
if cls._instance is not None:
|
||||
if cls._instance_base_dir != resolved_base_dir:
|
||||
raise ValueError(
|
||||
f"ConfigManager singleton already exists with base_dir "
|
||||
f"'{cls._instance_base_dir}', cannot create with different "
|
||||
f"base_dir '{resolved_base_dir}'"
|
||||
)
|
||||
return cls._instance
|
||||
|
||||
instance = super().__new__(cls)
|
||||
cls._instance = instance
|
||||
cls._instance_base_dir = resolved_base_dir
|
||||
return instance
|
||||
|
||||
def __init__(self, base_dir: str | Path | None = None):
|
||||
if hasattr(self, "_base_dir"):
|
||||
return
|
||||
|
||||
self._base_dir = self._resolve_base_dir(base_dir)
|
||||
logger.debug("ConfigManager initialized with base_dir: %s", self._base_dir)
|
||||
|
||||
@staticmethod
|
||||
def _resolve_base_dir(base_dir: str | Path | None) -> Path:
|
||||
if base_dir is not None:
|
||||
return Path(base_dir).resolve()
|
||||
return (Path(__file__).parent / "configs").resolve()
|
||||
|
||||
@classmethod
|
||||
def get_instance(cls) -> "ConfigManager":
|
||||
if cls._instance is None:
|
||||
raise RuntimeError(
|
||||
"ConfigManager instance has not been created. "
|
||||
"Call ConfigManager(base_dir=...) first to initialize."
|
||||
)
|
||||
return cls._instance
|
||||
|
||||
@classmethod
|
||||
def reset_instance(cls) -> None:
|
||||
"""For testing purposes only."""
|
||||
cls._instance = None
|
||||
cls._instance_base_dir = None
|
||||
|
||||
def get_config_file_path(self, kernel_name: str) -> Path:
|
||||
return self._base_dir / f"{kernel_name}.json"
|
||||
|
||||
def ensure_base_dir_exists(self) -> Path:
|
||||
self._base_dir.mkdir(parents=True, exist_ok=True)
|
||||
return self._base_dir
|
||||
|
||||
def ensure_base_dir_writable(self) -> None:
|
||||
self.ensure_base_dir_exists()
|
||||
test_file = self._base_dir / ".write_test"
|
||||
try:
|
||||
test_file.write_text("test")
|
||||
test_file.unlink()
|
||||
except OSError as e:
|
||||
raise OSError(
|
||||
f"Config directory '{self._base_dir}' is not writable: {e}"
|
||||
) from e
|
||||
|
||||
def load_config_set(self, kernel_name: str) -> ConfigSet:
|
||||
config_path = self.get_config_file_path(kernel_name)
|
||||
if not config_path.exists():
|
||||
return ConfigSet.from_dict(kernel_name, {})
|
||||
|
||||
try:
|
||||
with open(config_path) as f:
|
||||
data = json.load(f)
|
||||
return ConfigSet.from_dict(kernel_name, data)
|
||||
except (json.JSONDecodeError, OSError) as e:
|
||||
logger.error("Failed to load config file %s: %s", config_path, e)
|
||||
return ConfigSet.from_dict(kernel_name, {})
|
||||
|
||||
def get_platform_configs(
|
||||
self, kernel_name: str, platform: str
|
||||
) -> dict[str, helion.Config]:
|
||||
config_set = self.load_config_set(kernel_name)
|
||||
config_keys = config_set.get_config_keys(platform)
|
||||
|
||||
return {
|
||||
config_key: config_set.get_config(platform, config_key)
|
||||
for config_key in config_keys
|
||||
}
|
||||
|
||||
def save_config_set(self, config_set: ConfigSet) -> Path:
|
||||
config_path = self.get_config_file_path(config_set.kernel_name)
|
||||
config_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with open(config_path, "w") as f:
|
||||
json.dump(config_set.to_dict(), f, indent=2)
|
||||
|
||||
logger.info("Saved config to: %s", config_path)
|
||||
return config_path
|
||||
|
||||
def save_configs(
|
||||
self,
|
||||
kernel_name: str,
|
||||
platform: str,
|
||||
configs: dict[str, "helion.Config"],
|
||||
) -> Path:
|
||||
"""Save configs for a kernel/platform, merging with existing."""
|
||||
config_set = self.load_config_set(kernel_name)
|
||||
for config_key, config in configs.items():
|
||||
config_set.set_config(platform, config_key, config)
|
||||
return self.save_config_set(config_set)
|
||||
|
||||
def config_exists(self, kernel_name: str, platform: str, config_key: str) -> bool:
|
||||
config_set = self.load_config_set(kernel_name)
|
||||
return config_set.has_config(platform, config_key)
|
||||
Reference in New Issue
Block a user