transformer-1b-chat/training_code/model/config.py

"""
Configuration for 1B parameter LLaMA-style Transformer model.
Architecture: Decoder-only Transformer with RoPE, GQA, SwiGLU, RMSNorm.
"""

from dataclasses import dataclass


@dataclass
class ModelConfig:
    """Architecture hyperparameters for the decoder-only Transformer.

    The defaults describe the ~1B parameter configuration named in the module
    docstring (RoPE, GQA, SwiGLU, RMSNorm).

    Raises:
        ValueError: At construction time, if either head count is not
            positive, if ``hidden_dim`` is not a multiple of
            ``num_attention_heads``, or if ``num_attention_heads`` is not a
            multiple of ``num_kv_heads``.
    """

    vocab_size: int = 32000
    hidden_dim: int = 2048
    intermediate_dim: int = 5504       # ~2.7x hidden for SwiGLU (adjusted for param count)
    num_layers: int = 22
    num_attention_heads: int = 32
    num_kv_heads: int = 8              # GQA: 4 query heads per KV head
    max_seq_len: int = 2048
    rope_theta: float = 10000.0
    rms_norm_eps: float = 1e-5
    dropout: float = 0.0               # No dropout (modern practice for pretraining)
    tie_word_embeddings: bool = False

    def __post_init__(self) -> None:
        """Reject head layouts that cannot be split evenly.

        Without these checks ``head_dim`` would silently floor-divide (or
        raise a bare ZeroDivisionError), and the parameter estimate would be
        computed from widths that do not add up to ``hidden_dim``.
        """
        if self.num_attention_heads <= 0 or self.num_kv_heads <= 0:
            raise ValueError(
                "num_attention_heads and num_kv_heads must be positive, got "
                f"{self.num_attention_heads} and {self.num_kv_heads}"
            )
        if self.hidden_dim % self.num_attention_heads != 0:
            raise ValueError(
                f"hidden_dim ({self.hidden_dim}) must be divisible by "
                f"num_attention_heads ({self.num_attention_heads})"
            )
        if self.num_attention_heads % self.num_kv_heads != 0:
            raise ValueError(
                f"num_attention_heads ({self.num_attention_heads}) must be "
                f"divisible by num_kv_heads ({self.num_kv_heads})"
            )

    @property
    def head_dim(self) -> int:
        """Width of a single attention head (hidden_dim / num_attention_heads)."""
        return self.hidden_dim // self.num_attention_heads

    @property
    def num_params_approx(self) -> int:
        """Rough parameter count estimate.

        Counts the input embedding, the Q/K/V/O attention projections, the
        three FFN matrices (gate, up, down) and two norm vectors per layer,
        the final norm, and a separate output projection unless
        ``tie_word_embeddings`` is set. No bias terms are included.
        """
        embed = self.vocab_size * self.hidden_dim

        # Q and O both map hidden_dim <-> (heads * head_dim); K and V both
        # map hidden_dim -> (kv_heads * head_dim), so each width counts twice.
        q_width = self.head_dim * self.num_attention_heads
        kv_width = self.head_dim * self.num_kv_heads
        attn_per_layer = self.hidden_dim * (2 * q_width + 2 * kv_width)

        ffn_per_layer = 3 * self.hidden_dim * self.intermediate_dim  # gate + up + down
        norm_per_layer = 2 * self.hidden_dim
        per_layer = attn_per_layer + ffn_per_layer + norm_per_layer

        # With tied embeddings the output head adds no parameters of its own.
        output_head = 0 if self.tie_word_embeddings else self.vocab_size * self.hidden_dim

        final_norm = self.hidden_dim
        return embed + self.num_layers * per_layer + final_norm + output_head


@dataclass
class TrainConfig:
    """Default settings for a training run: paths, batch sizing, optimizer
    hyperparameters, logging cadence and system options.

    This class only declares fields and their defaults; how each value is
    consumed is decided by the training code that reads it.
    """

    # --- Filesystem locations ---------------------------------------------
    checkpoint_dir: str = "/jfs/deepak-kumar/checkpoints"
    data_cache_dir: str = "/jfs/deepak-kumar/data"
    log_dir: str = "/home/jovyan/training/logs"

    # --- Token budget and batch sizing ------------------------------------
    total_tokens: int = 20_000_000_000   # 20B tokens
    batch_size_per_gpu: int = 8
    # Effective batch = 8 * 8 * 8 = 512 sequences.
    # NOTE(review): the third factor is presumably the GPU count (8); it is
    # not a field of this config — confirm against the launcher.
    gradient_accumulation_steps: int = 8
    # NOTE(review): ModelConfig declares the same default for max_seq_len;
    # nothing visible here keeps the two values in sync.
    max_seq_len: int = 2_048

    # --- Learning-rate schedule (WSD) and optimizer -----------------------
    learning_rate: float = 3e-4
    min_lr: float = 3e-5
    warmup_steps: int = 1_000
    weight_decay: float = 0.1
    beta1: float = 0.9
    beta2: float = 0.95
    grad_clip: float = 1.0

    # --- Intervals for logging, checkpointing and evaluation --------------
    log_interval: int = 10
    save_interval: int = 1_000
    eval_interval: int = 500

    # --- System -----------------------------------------------------------
    num_workers: int = 4
    seed: int = 42
    bf16: bool = True
初始化项目，由ModelHub XC社区提供模型 Model: divakar-yadav/transformer-1b-chat Source: Original Platform 2026-06-20 17:27:58 +08:00			`"""`
			`Configuration for 1B parameter LLaMA-style Transformer model.`
			`Architecture: Decoder-only Transformer with RoPE, GQA, SwiGLU, RMSNorm.`
			`"""`

			`from dataclasses import dataclass`


			`@dataclass`
			`class ModelConfig:`
			`vocab_size: int = 32000`
			`hidden_dim: int = 2048`
			`intermediate_dim: int = 5504 # ~2.7x hidden for SwiGLU (adjusted for param count)`
			`num_layers: int = 22`
			`num_attention_heads: int = 32`
			`num_kv_heads: int = 8 # GQA: 4 query heads per KV head`
			`max_seq_len: int = 2048`
			`rope_theta: float = 10000.0`
			`rms_norm_eps: float = 1e-5`
			`dropout: float = 0.0 # No dropout (modern practice for pretraining)`
			`tie_word_embeddings: bool = False`

			`@property`
			`def head_dim(self) -> int:`
			`return self.hidden_dim // self.num_attention_heads`

			`@property`
			`def num_params_approx(self) -> int:`
			`"""Rough parameter count estimate."""`
			`embed = self.vocab_size * self.hidden_dim`
			`attn_per_layer = (`
			`self.hidden_dim * self.head_dim * self.num_attention_heads + # Q`
			`self.hidden_dim * self.head_dim * self.num_kv_heads + # K`
			`self.hidden_dim * self.head_dim * self.num_kv_heads + # V`
			`self.head_dim * self.num_attention_heads * self.hidden_dim # O`
			`)`
			`ffn_per_layer = 3 * self.hidden_dim * self.intermediate_dim # gate + up + down`
			`norm_per_layer = 2 * self.hidden_dim`
			`total = (`
			`embed +`
			`self.num_layers * (attn_per_layer + ffn_per_layer + norm_per_layer) +`
			`self.hidden_dim + # final norm`
			`(0 if self.tie_word_embeddings else self.vocab_size * self.hidden_dim)`
			`)`
			`return total`


			`@dataclass`
			`class TrainConfig:`
			`# Paths`
			`checkpoint_dir: str = "/jfs/deepak-kumar/checkpoints"`
			`data_cache_dir: str = "/jfs/deepak-kumar/data"`
			`log_dir: str = "/home/jovyan/training/logs"`

			`# Training`
			`total_tokens: int = 20_000_000_000 # 20B tokens`
			`batch_size_per_gpu: int = 8`
			`gradient_accumulation_steps: int = 8 # effective batch = 8 * 8 * 8 = 512 seqs`
			`max_seq_len: int = 2048`

			`# WSD Schedule`
			`learning_rate: float = 3e-4`
			`min_lr: float = 3e-5`
			`warmup_steps: int = 1000`
			`weight_decay: float = 0.1`
			`beta1: float = 0.9`
			`beta2: float = 0.95`
			`grad_clip: float = 1.0`

			`# Logging`
			`log_interval: int = 10`
			`save_interval: int = 1000`
			`eval_interval: int = 500`

			`# System`
			`num_workers: int = 4`
			`seed: int = 42`
			`bf16: bool = True`