# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Configuration management for Helion kernels.

This module provides centralized configuration file management for Helion custom
operations, including naming conventions, directory resolution, and file I/O.

Config File Structure
---------------------
Each kernel has a single JSON config file: {kernel_name}.json

The file uses a simplified 2-layer hierarchical structure:
{
    "h100": {                           # GPU platform
        "default": { ... },             # Fallback configuration
        "batch_32_hidden_4096": { ... },
        "batch_64_hidden_8192": { ... }
    },
    "a100": {
        "default": { ... },
        "batch_16_hidden_2048": { ... }
    }
}

Example file: silu_mul_fp8.json

Config keys should be structured strings that encode the relevant parameters
(e.g., "batch_32_hidden_4096", "seq_512_heads_16", "fp8_batch_64", etc.).

Platform names are matched case-insensitively: they are normalized to lowercase
when configs are loaded, stored, and looked up. Config keys are case-sensitive.

Classes
-------
- ConfigSet: In-memory collection of configs for a kernel with lookup/query APIs.
- ConfigManager: File-level operations for config persistence.
"""

import json
from pathlib import Path
from typing import Any

from vllm.logger import init_logger
from vllm.utils.import_utils import has_helion

if not has_helion():
    raise ImportError(
        "ConfigManager requires helion to be installed. "
        "Install it with: pip install helion"
    )

import helion

logger = init_logger(__name__)


class ConfigSet:
    """In-memory collection of Helion configs with lookup/query capabilities."""

    # Type alias for nested config structure:
    # platform -> config_key -> helion.Config
    _ConfigDict = dict[str, dict[str, "helion.Config"]]

    def __init__(self, kernel_name: str):
        """Create an empty config set for ``kernel_name``."""
        self._kernel_name = kernel_name
        self._configs: ConfigSet._ConfigDict = {}

    @staticmethod
    def _normalize_platform(platform: str) -> str:
        """Return the canonical (lowercase) form of a platform name."""
        return platform.lower()

    @property
    def kernel_name(self) -> str:
        """Name of the kernel this config set belongs to."""
        return self._kernel_name

    def get_config(self, platform: str, config_key: str) -> helion.Config:
        """Return the config stored for ``platform`` / ``config_key``.

        The platform name is matched case-insensitively, consistent with
        ``set_config``, ``has_config`` and ``get_config_keys``.

        Raises:
            KeyError: If the platform is unknown, or the platform has no entry
                for ``config_key``. The message lists what is available.
        """
        platform = self._normalize_platform(platform)
        platform_dict = self._configs.get(platform)
        if platform_dict is None:
            avail_platforms = self.get_platforms()
            # TODO(@gmagogsfm): add a CLI/env override flag so users can
            # directly specify a platform name instead of relying on
            # auto-detection, and suggest it in this error message.
            raise KeyError(
                f"Config not found for kernel '{self._kernel_name}': "
                f"platform '{platform}' not found. "
                f"Available platforms: {avail_platforms or '(none)'}. "
                f"If your GPU is a variant of a supported platform, "
                f"consider adding a mapping in _GPU_NAME_ALIASES in "
                f"vllm/kernels/helion/utils.py, or run "
                f"scripts/autotune_helion_kernels.py to generate configs "
                f"for your platform."
            )

        config = platform_dict.get(config_key)
        if config is None:
            avail_keys = self.get_config_keys(platform)
            raise KeyError(
                f"Config not found for kernel '{self._kernel_name}': "
                f"config_key '{config_key}' not found for platform '{platform}'. "
                f"Available config_keys: {avail_keys or '(none)'}"
            )

        return config

    def get_platforms(self) -> list[str]:
        """Return the sorted list of platform names that have configs."""
        return sorted(self._configs)

    def get_config_keys(self, platform: str) -> list[str]:
        """Return the sorted config keys for ``platform`` (empty if unknown)."""
        platform_dict = self._configs.get(self._normalize_platform(platform))
        if platform_dict is None:
            return []
        return sorted(platform_dict)

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-serializable ``platform -> config_key -> dict`` mapping.

        Each config is converted by round-tripping its ``to_json()`` output
        through ``json.loads``.
        """
        return {
            platform: {
                config_key: json.loads(config.to_json())
                for config_key, config in config_keys_dict.items()
            }
            for platform, config_keys_dict in self._configs.items()
        }

    @classmethod
    def from_dict(cls, kernel_name: str, data: dict[str, Any]) -> "ConfigSet":
        """Build a ConfigSet from a ``platform -> config_key -> kwargs`` mapping.

        Platform names are normalized to lowercase so that loaded configs are
        reachable through every lookup method; entries whose platform names
        differ only by case are merged into one platform. Each leaf mapping is
        passed to ``helion.Config`` as keyword arguments.
        """
        config_set = cls(kernel_name)
        count = 0

        for platform, platform_data in data.items():
            platform_configs = config_set._configs.setdefault(
                cls._normalize_platform(platform), {}
            )
            for config_key, config_data in platform_data.items():
                platform_configs[config_key] = helion.Config(**config_data)
                count += 1

        if count > 0:
            logger.debug(
                "Loaded %d configs for kernel '%s'",
                count,
                kernel_name,
            )

        return config_set

    def set_config(
        self, platform: str, config_key: str, config: "helion.Config"
    ) -> None:
        """Store ``config`` under ``platform`` / ``config_key``, replacing any
        existing entry. The platform name is normalized to lowercase."""
        platform = self._normalize_platform(platform)
        self._configs.setdefault(platform, {})[config_key] = config

        logger.debug(
            "Set config for kernel '%s': platform='%s', key='%s'",
            self._kernel_name,
            platform,
            config_key,
        )

    def has_config(self, platform: str, config_key: str) -> bool:
        """Return True if a config exists for ``platform`` / ``config_key``."""
        platform_dict = self._configs.get(self._normalize_platform(platform))
        if platform_dict is None:
            return False
        return config_key in platform_dict


class ConfigManager:
    """File-level configuration management for Helion kernels (global singleton)."""

    _instance: "ConfigManager | None" = None
    _instance_base_dir: Path | None = None

    def __new__(cls, base_dir: str | Path | None = None) -> "ConfigManager":
        """Return the singleton, creating it on first use.

        Raises:
            ValueError: If the singleton already exists and ``base_dir``
                resolves to a different directory than the one it was
                created with.
        """
        resolved_base_dir = cls._resolve_base_dir(base_dir)

        if cls._instance is not None:
            if cls._instance_base_dir != resolved_base_dir:
                raise ValueError(
                    f"ConfigManager singleton already exists with base_dir "
                    f"'{cls._instance_base_dir}', cannot create with different "
                    f"base_dir '{resolved_base_dir}'"
                )
            return cls._instance

        instance = super().__new__(cls)
        cls._instance = instance
        cls._instance_base_dir = resolved_base_dir
        return instance

    def __init__(self, base_dir: str | Path | None = None):
        """Initialize the base directory once.

        ``__init__`` runs on every ``ConfigManager(...)`` call, including those
        that return the existing singleton, so it returns early when the
        instance is already initialized.
        """
        if hasattr(self, "_base_dir"):
            return

        self._base_dir = self._resolve_base_dir(base_dir)
        logger.debug("ConfigManager initialized with base_dir: %s", self._base_dir)

    @staticmethod
    def _resolve_base_dir(base_dir: str | Path | None) -> Path:
        """Resolve ``base_dir`` to an absolute path.

        When ``base_dir`` is None, the ``configs`` directory next to this
        module is used.
        """
        if base_dir is not None:
            return Path(base_dir).resolve()
        return (Path(__file__).parent / "configs").resolve()

    @classmethod
    def get_instance(cls) -> "ConfigManager":
        """Return the existing singleton.

        Raises:
            RuntimeError: If no instance has been created yet.
        """
        if cls._instance is None:
            raise RuntimeError(
                "ConfigManager instance has not been created. "
                "Call ConfigManager(base_dir=...) first to initialize."
            )
        return cls._instance

    @classmethod
    def reset_instance(cls) -> None:
        """For testing purposes only."""
        cls._instance = None
        cls._instance_base_dir = None

    def get_config_file_path(self, kernel_name: str) -> Path:
        """Return the path of the JSON config file for ``kernel_name``."""
        return self._base_dir / f"{kernel_name}.json"

    def ensure_base_dir_exists(self) -> Path:
        """Create the base directory (and parents) if needed and return it."""
        self._base_dir.mkdir(parents=True, exist_ok=True)
        return self._base_dir

    def ensure_base_dir_writable(self) -> None:
        """Verify the base directory is writable by creating and removing a
        probe file.

        Raises:
            OSError: If the directory cannot be created, or the probe file
                cannot be written or removed.
        """
        self.ensure_base_dir_exists()
        test_file = self._base_dir / ".write_test"
        try:
            test_file.write_text("test")
            test_file.unlink()
        except OSError as e:
            raise OSError(
                f"Config directory '{self._base_dir}' is not writable: {e}"
            ) from e

    def load_config_set(self, kernel_name: str) -> ConfigSet:
        """Load the ConfigSet for ``kernel_name`` from disk.

        Loading is best-effort: a missing file yields an empty ConfigSet, and
        an unreadable, undecodable or non-JSON file is logged as an error and
        also yields an empty ConfigSet.
        """
        config_path = self.get_config_file_path(kernel_name)
        if not config_path.exists():
            return ConfigSet.from_dict(kernel_name, {})

        try:
            # JSON text is UTF-8; do not depend on the locale's default.
            with open(config_path, encoding="utf-8") as f:
                data = json.load(f)
            return ConfigSet.from_dict(kernel_name, data)
        except (json.JSONDecodeError, UnicodeDecodeError, OSError) as e:
            logger.error("Failed to load config file %s: %s", config_path, e)
            return ConfigSet.from_dict(kernel_name, {})

    def get_platform_configs(
        self, kernel_name: str, platform: str
    ) -> dict[str, helion.Config]:
        """Return ``config_key -> helion.Config`` for one kernel/platform.

        The result is empty when the kernel has no configs for ``platform``.
        """
        config_set = self.load_config_set(kernel_name)
        config_keys = config_set.get_config_keys(platform)
        return {
            config_key: config_set.get_config(platform, config_key)
            for config_key in config_keys
        }

    def save_config_set(self, config_set: ConfigSet) -> Path:
        """Write ``config_set`` to its JSON file and return the file path.

        The payload is serialized before the file is opened, so a
        serialization failure cannot truncate an existing config file.
        """
        config_path = self.get_config_file_path(config_set.kernel_name)
        payload = json.dumps(config_set.to_dict(), indent=2)

        config_path.parent.mkdir(parents=True, exist_ok=True)
        with open(config_path, "w", encoding="utf-8") as f:
            f.write(payload)

        logger.info("Saved config to: %s", config_path)
        return config_path

    def save_configs(
        self,
        kernel_name: str,
        platform: str,
        configs: dict[str, "helion.Config"],
    ) -> Path:
        """Save configs for a kernel/platform, merging with existing."""
        config_set = self.load_config_set(kernel_name)
        for config_key, config in configs.items():
            config_set.set_config(platform, config_key, config)
        return self.save_config_set(config_set)

    def config_exists(self, kernel_name: str, platform: str, config_key: str) -> bool:
        """Return True if the kernel's config file has ``platform`` / ``config_key``."""
        config_set = self.load_config_set(kernel_name)
        return config_set.has_config(platform, config_key)