InstructPalmyra-20b/handler.py

import torch
from typing import Dict, List, Any
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Device index in the convention used by transformers pipelines:
# 0 when a CUDA device is available, -1 otherwise.
if torch.cuda.is_available():
    device = 0
else:
    device = -1


# Prompt template wrapped around each incoming instruction; the
# ``{instruction}`` placeholder is filled in with ``str.format``.
format_input = "".join(
    (
        "Below is an instruction that describes a task. ",
        "Write a response that appropriately completes the request.\n\n",
        "### Instruction:\n{instruction}\n\n",
        "### Response:",
    )
)


class EndpointHandler:
    """Request handler that wraps a ``text-generation`` pipeline.

    The tokenizer and model are loaded once at construction time. Each call
    formats the incoming instruction with the module-level ``format_input``
    template, runs generation, and returns only the text that follows the
    ``### Response:`` marker.
    """

    # Marker separating the prompt from the model's answer in generated text.
    _RESPONSE_MARKER = "### Response:"

    def __init__(self, path=""):
        """Load the tokenizer and model from *path* and build the pipeline.

        Args:
            path: Location handed to both ``from_pretrained`` calls.
        """
        tokenizer = AutoTokenizer.from_pretrained(path)
        model = AutoModelForCausalLM.from_pretrained(
            path,
            device_map="auto",
            torch_dtype=torch.float16,
        )
        # The model has already been placed by ``device_map="auto"``. Passing
        # an explicit ``device`` to the pipeline as well conflicts with that
        # placement (recent transformers releases reject the combination), so
        # the pipeline is left to use the device the model already lives on.
        self.pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_length=256,
        )

    @classmethod
    def _extract_response(cls, generated_text: str) -> str:
        """Return the answer portion of *generated_text*, stripped.

        When the response marker is present, the segment right after its
        first occurrence is returned. When it is absent (for example because
        the pipeline was asked not to echo the prompt), the whole text is
        returned instead of raising ``IndexError``.
        """
        segments = generated_text.split(cls._RESPONSE_MARKER)
        answer = segments[1] if len(segments) > 1 else segments[0]
        return answer.strip()

    def __call__(self, data: Any) -> List[Dict[str, str]]:
        """Generate a response for the instruction contained in *data*.

        Args:
            data: Mapping with the instruction under ``"inputs"`` and,
                optionally, keyword arguments for the pipeline call under
                ``"parameters"``. When ``"inputs"`` is missing, the remaining
                payload itself is used as the instruction.

        Returns:
            One ``{"generated_text": ...}`` dict per prediction returned by
            the pipeline, holding only the extracted response text.
        """
        # Work on a shallow copy so the caller's payload is not mutated.
        payload = dict(data)
        inputs = payload.pop("inputs", payload)
        parameters = payload.pop("parameters", None)

        text_input = format_input.format(instruction=inputs)

        # Forward any per-request generation kwargs to the pipeline.
        call_kwargs = {} if parameters is None else parameters
        prediction = self.pipeline(text_input, **call_kwargs)

        # Keep only the model's answer, dropping the echoed prompt.
        return [
            {"generated_text": self._extract_response(pred["generated_text"])}
            for pred in prediction
        ]
初始化项目，由ModelHub XC社区提供模型 Model: Writer/InstructPalmyra-20b Source: Original Platform 2026-06-08 12:02:26 +08:00
			`import torch`
			`from typing import Dict, List, Any`
			`from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline`

			`# check for GPU`
			`device = 0 if torch.cuda.is_available() else -1`


			`format_input = (`
			`"Below is an instruction that describes a task. "`
			`"Write a response that appropriately completes the request.\n\n"`
			`"### Instruction:\n{instruction}\n\n### Response:"`
			`)`


			`class EndpointHandler:`
			`def __init__(self, path=""):`
			`# load the model`
			`tokenizer = AutoTokenizer.from_pretrained(path)`
			`model = AutoModelForCausalLM.from_pretrained(`
			`path,`
			`device_map="auto",`
			`torch_dtype=torch.float16,`
			`)`
			`# create inference pipeline`
			`self.pipeline = pipeline(`
			`"text-generation",`
			`model=model,`
			`tokenizer=tokenizer,`
			`device=device,`
			`max_length=256,`
			`)`

			`def __call__(self, data: Any) -> List[List[Dict[str, float]]]:`
			`inputs = data.pop("inputs", data)`
			`parameters = data.pop("parameters", None)`

			`text_input = format_input.format(instruction=inputs)`

			`# pass inputs with all kwargs in data`
			`if parameters is not None:`
			`prediction = self.pipeline(text_input, **parameters)`
			`else:`
			`prediction = self.pipeline(text_input)`

			`# postprocess the prediction`
			`output = [`
			`{"generated_text": pred["generated_text"].split("### Response:")[1].strip()}`
			`for pred in prediction`
			`]`

			`return output`