# PropagationShield/handler.py

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from typing import Dict, List, Any

class EndpointHandler:
    """Custom Hugging Face Inference Endpoints handler for PropagationShield.

    The model is loaded once, 4-bit quantized, when the handler is
    constructed; every request is then served through ``__call__``.
    """

    # Defaults applied when a request omits (or nulls) a generation parameter.
    DEFAULT_MAX_NEW_TOKENS = 512
    DEFAULT_TEMPERATURE = 0.1

    def __init__(self, path=""):
        """
        Initializes the model and tokenizer.
        `path` is automatically provided by Hugging Face (it points to your repo files).
        """
        print("🚀 Initializing PropagationShield Handler...")

        self.tokenizer = AutoTokenizer.from_pretrained(path)

        # 1. Configure 4-bit quantization to prevent OOM and System RAM limits
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )

        # 2. Load the model safely
        self.model = AutoModelForCausalLM.from_pretrained(
            path,
            quantization_config=bnb_config,
            device_map="auto",
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,  # Crucial to prevent the 30GB RAM crash during boot
        )
        print("✅ PropagationShield Loaded Successfully!")

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Runs inference on the incoming request.

        Args:
            data: Request payload. ``data["inputs"]`` is either a list of chat
                messages (rendered through the tokenizer's chat template) or
                any other value, which is stringified and used as a raw
                prompt. If ``"inputs"`` is absent, the payload itself (minus
                ``"parameters"``) is stringified. ``data["parameters"]`` may
                carry ``max_new_tokens`` (default 512) and ``temperature``
                (default 0.1); a temperature <= 0 selects greedy decoding.

        Returns:
            A one-element list ``[{"generated_text": <str>}]`` holding only
            the newly generated text, stripped of surrounding whitespace.

        Raises:
            TypeError, ValueError: If ``max_new_tokens`` or ``temperature``
                cannot be converted to a number.
        """
        # Work on a shallow copy so the caller's payload is never mutated.
        payload = dict(data)
        inputs = payload.pop("inputs", payload)
        # `or {}` also covers an explicit `"parameters": null` in the request.
        parameters = payload.pop("parameters", None) or {}

        max_new_tokens = parameters.get("max_new_tokens")
        if max_new_tokens is None:
            max_new_tokens = self.DEFAULT_MAX_NEW_TOKENS
        max_new_tokens = int(max_new_tokens)

        temperature = parameters.get("temperature")
        if temperature is None:
            temperature = self.DEFAULT_TEMPERATURE
        temperature = float(temperature)
        do_sample = temperature > 0.0

        # 3. Format the prompt
        # A list is treated as chat messages [{"role": ..., "content": ...}, ...];
        # anything else is used as a raw, already formatted prompt.
        is_chat = isinstance(inputs, list)
        if is_chat:
            prompt = self.tokenizer.apply_chat_template(
                inputs, tokenize=False, add_generation_prompt=True
            )
        else:
            prompt = str(inputs)

        # 4. Tokenize
        # The chat template already renders the special tokens (e.g. BOS), so
        # they must not be added a second time when tokenizing its output.
        encoded = self.tokenizer(
            prompt, return_tensors="pt", add_special_tokens=not is_chat
        )
        device = self.model.device
        input_ids = encoded["input_ids"].to(device)

        generate_kwargs = {
            "max_new_tokens": max_new_tokens,
            "do_sample": do_sample,
            "pad_token_id": self.tokenizer.eos_token_id,
        }
        # Pass the mask explicitly: with pad == eos it cannot be inferred.
        attention_mask = encoded.get("attention_mask")
        if attention_mask is not None:
            generate_kwargs["attention_mask"] = attention_mask.to(device)
        # Temperature only applies when sampling; forwarding it on the greedy
        # path is invalid for 0 and triggers generation-config warnings.
        if do_sample:
            generate_kwargs["temperature"] = temperature

        # 5. Generate
        with torch.no_grad():
            output_ids = self.model.generate(input_ids, **generate_kwargs)

        # 6. Isolate and decode only the newly generated tokens
        generated_ids = output_ids[0][input_ids.shape[-1]:]
        generated_text = self.tokenizer.decode(generated_ids, skip_special_tokens=True)

        # Return in standard HF API format
        return [{"generated_text": generated_text.strip()}]
初始化项目，由ModelHub XC社区提供模型 Model: pragunk/PropagationShield Source: Original Platform 2026-04-28 05:15:06 +08:00			`import torch`
			`from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig`
			`from typing import Dict, List, Any`

			`class EndpointHandler:`
			`def __init__(self, path=""):`
			`"""`
			`Initializes the model and tokenizer.`
			`path` is automatically provided by Hugging Face (it points to your repo files).
			`"""`
			`print("🚀 Initializing PropagationShield Handler...")`

			`self.tokenizer = AutoTokenizer.from_pretrained(path)`

			`# 1. Configure 4-bit quantization to prevent OOM and System RAM limits`
			`bnb_config = BitsAndBytesConfig(`
			`load_in_4bit=True,`
			`bnb_4bit_use_double_quant=True,`
			`bnb_4bit_quant_type="nf4",`
			`bnb_4bit_compute_dtype=torch.float16`
			`)`

			`# 2. Load the model safely`
			`self.model = AutoModelForCausalLM.from_pretrained(`
			`path,`
			`quantization_config=bnb_config,`
			`device_map="auto",`
			`torch_dtype=torch.float16,`
			`low_cpu_mem_usage=True, # Crucial to prevent the 30GB RAM crash during boot`
			`)`
			`print("✅ PropagationShield Loaded Successfully!")`

			`def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:`
			`"""`
			`Runs inference on the incoming request.`
			`"""`
			`# Parse incoming data`
			`inputs = data.pop("inputs", data)`
			`parameters = data.pop("parameters", {})`

			`max_new_tokens = parameters.get("max_new_tokens", 512)`
			`temperature = parameters.get("temperature", 0.1)`

			`# 3. Format the prompt`
			`# If the user sends a list of messages [{"role": "system", "content": "..."}, ...]`
			`if isinstance(inputs, list):`
			`prompt = self.tokenizer.apply_chat_template(`
			`inputs, tokenize=False, add_generation_prompt=True`
			`)`
			`# If the user sends a raw formatted string`
			`else:`
			`prompt = str(inputs)`

			`# 4. Tokenize`
			`input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.model.device)`

			`# 5. Generate`
			`with torch.no_grad():`
			`output_ids = self.model.generate(`
			`input_ids,`
			`max_new_tokens=max_new_tokens,`
			`temperature=temperature,`
			`do_sample=True if temperature > 0.0 else False,`
			`pad_token_id=self.tokenizer.eos_token_id`
			`)`

			`# 6. Isolate and decode only the newly generated tokens`
			`generated_ids = output_ids[0][input_ids.shape[-1]:]`
			`generated_text = self.tokenizer.decode(generated_ids, skip_special_tokens=True)`

			`# Return in standard HF API format`
			`return [{"generated_text": generated_text.strip()}]`