import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


class EndpointHandler:
    """Text-generation request handler around a causal LM and its tokenizer.

    An instance is callable: it takes a request payload (a dict) and returns
    a single-element list holding the generated text.
    """

    # Guardrail against huge payloads: maximum prompt length, in characters,
    # that is passed on to the tokenizer.
    MAX_PROMPT_CHARS = 12000

    def __init__(self, path=""):
        """Load the tokenizer and the model from ``path``.

        Args:
            path: Location handed to ``from_pretrained`` for both the
                tokenizer and the model.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            path,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map="auto",
        ).eval()

    @staticmethod
    def _chat_messages(data):
        """Return the non-empty ``messages`` list of ``data``, or ``None``."""
        messages = data.get("messages", None)
        if isinstance(messages, list) and len(messages) > 0:
            return messages
        return None

    def _build_prompt(self, data):
        """Build the prompt text for a request.

        Prefer OpenAI-style chat messages:
          {
            "messages": [
              {"role":"system","content":"..."},
              {"role":"user","content":"..."}
            ]
          }

        Fallback to HF-style raw inputs:
          { "inputs": "..." }

        Args:
            data: Request payload.

        Returns:
            The chat-templated prompt when ``messages`` is a non-empty list,
            otherwise the value of ``inputs`` (``""`` when missing or falsy).
        """
        messages = self._chat_messages(data)
        if messages is not None:
            # Qwen-native formatting (matches your local_run_*.py behavior)
            return self.tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
            )

        # fallback
        return data.get("inputs", "") or ""

    def __call__(self, data):
        """Generate a completion for one request.

        Args:
            data: Request payload. Reads ``messages`` or ``inputs`` for the
                prompt and an optional ``parameters`` dict with
                ``max_new_tokens`` (default 256), ``temperature``
                (default 0.7) and ``top_p`` (default 0.9).

        Returns:
            ``[{"generated_text": <str>}]`` containing only the newly
            generated text, with special tokens removed and surrounding
            whitespace stripped.

        Raises:
            ValueError: If the request tokenizes to an empty prompt.
        """
        params = data.get("parameters", {}) or {}
        max_new_tokens = int(params.get("max_new_tokens", 256))
        temperature = float(params.get("temperature", 0.7))
        top_p = float(params.get("top_p", 0.9))

        prompt = self._build_prompt(data)

        # Guardrail truncation to avoid huge payloads. Keep the END of the
        # prompt: the model continues from the last tokens, and for a chat
        # prompt that is where the latest turn and the generation prompt
        # live. Cutting the tail instead would make the model continue a
        # chopped-off message rather than answer it.
        overflow = max(len(prompt) - self.MAX_PROMPT_CHARS, 0)
        prompt = prompt[overflow:]

        # A chat-templated prompt already spells out the special tokens it
        # needs, so the tokenizer must not add its own on top. Raw ``inputs``
        # keep the tokenizer's default behavior.
        is_templated = self._chat_messages(data) is not None
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            add_special_tokens=not is_templated,
        ).to(self.model.device)

        prompt_len = inputs["input_ids"].shape[1]
        if prompt_len == 0:
            # Nothing for generate() to continue from; fail with a clear
            # message instead of an opaque error from inside the model.
            raise ValueError(
                "Empty prompt: provide a non-empty 'messages' list or 'inputs' string."
            )

        # A non-positive temperature selects deterministic (greedy) decoding;
        # the sampling knobs are only forwarded when sampling is enabled.
        do_sample = temperature > 0.0
        gen_kwargs = dict(
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            use_cache=True,
        )
        if do_sample:
            gen_kwargs.update(dict(temperature=temperature, top_p=top_p))

        with torch.inference_mode():
            out = self.model.generate(**inputs, **gen_kwargs)

        # The output sequence starts with the prompt tokens; slice them off
        # so only the newly generated tokens are decoded.
        gen = out[0, prompt_len:]
        return [{"generated_text": self.tokenizer.decode(gen, skip_special_tokens=True).strip()}]