初始化项目，由ModelHub XC社区提供模型

Model: lamm-mit/BioinspiredZephyr-7B Source: Original Platform
2026-05-31 11:20:18 +08:00
commit 0c4d325ab4
16 changed files with 91754 additions and 0 deletions
--- a/README.md
+++ b/README.md
@@ -0,0 +1,160 @@
+---
+license: apache-2.0
+---
+### BioinspiredZephyr-7B: Large Language Model for the Mechanics of Biological and Bio-Inspired Materials 
+
+To accelerate discovery and guide insights, we report an open-source autoregressive transformer large language model (LLM), trained on expert knowledge in the biological materials field, especially focused on mechanics and structural properties.
+
+The model is finetuned with a corpus of over a thousand peer-reviewed articles in the field of structural biological and bio-inspired materials and can be prompted to recall information, assist with research tasks, and function as an engine for creativity.
+
+The model is based on HuggingFaceH4/zephyr-7b-beta.
+
+![image/png](https://cdn-uploads.huggingface.co/production/uploads/623ce1c6b66fedf374859fe7/bsqBByauWBZ0Y8PspthR8.png)
+
+This model is based on work reported in https://doi.org/10.1002/advs.202306724.
+
+This repository includes both Hugging Face transformers files and GGUF files (in several quantized versions; q5_K_M is recommended).
+
+#### Hugging Face transformers files: Loading and inference
+
+```
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from accelerate import infer_auto_device_map
+
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    trust_remote_code=True,
+    device_map="auto", #device_map="cuda:0",
+    torch_dtype=  torch.bfloat16,
+    # use_flash_attention_2=True,
+)
+
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+```
+Chat template
+```
+messages = [
+    {"role": "system", "content": "You are a friendly materials scientist."},
+    {"role": "user", "content": "What is the strongest spider silk material?"},
+    {"role": "assistant", "content": "Sample response."},
+]
+prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+```
+
+The resulting prompt string is: `'<|system|>\nYou are a friendly materials scientist.</s>\n<|user|>\nWhat is the strongest spider silk material?</s>\n<|assistant|>\nSample response.</s>\n<|assistant|>\n'`
+
+```
+device='cuda'
+def generate_response (text_input="Biological materials offer amazing possibilities, such as",
+                      num_return_sequences=1,
+                      temperature=1.,  
+                      max_new_tokens=127,
+                      num_beams=1,
+                      top_k = 50,
+                      top_p =0.9,repetition_penalty=1.,eos_token_id=2,verbatim=False,
+                      exponential_decay_length_penalty_fac=None,
+                      ):
+
+    inputs = tokenizer.encode(text_input,  add_special_tokens  =False,  return_tensors ='pt')
+    if verbatim:
+        print ("Length of input, tokenized: ", inputs.shape, inputs)
+    with torch.no_grad():
+          outputs = model.generate(input_ids=inputs.to(device), 
+                                   max_new_tokens=max_new_tokens,
+                                   temperature=temperature, #value used to modulate the next token probabilities.
+                                   num_beams=num_beams,
+                                   top_k = top_k,
+                                   top_p =top_p,
+                                   num_return_sequences = num_return_sequences, eos_token_id=eos_token_id,
+                                   do_sample =True, 
+                                   repetition_penalty=repetition_penalty,
+                                  )
+    return tokenizer.batch_decode(outputs[:,inputs.shape[1]:].detach().cpu().numpy(), skip_special_tokens=True)
+
+```
+Then:
+```
+messages = [
+    {"role": "system", "content": "You are a friendly materials scientist."},
+    {"role": "user", "content": "What is the strongest spider silk material?"},
+]
+prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+output_text=generate_response (text_input=prompt, eos_token_id=eos_token,
+                                num_return_sequences=1, repetition_penalty=1.,
+                                top_p=0.9, top_k=512,  
+                                temperature=0.1,max_new_tokens=512, verbatim=False,
+                               )
+print (output_text)
+```
+
+#### GGUF files: Loading and inference
+
+```
+from llama_cpp import Llama
+
+model_path='./BioinspiredZephyr-7B/ggml-model-q5_K_M.gguf'
+chat_format="mistral-instruct"
+
+llm = Llama(model_path=model_path,
+            n_gpu_layers=-1,verbose= True, 
+            n_ctx=10000,
+            #main_gpu=0,
+            chat_format=chat_format,
+            #split_mode=llama_cpp.LLAMA_SPLIT_LAYER
+            )
+```
+
+Or, download directly from Hugging Face:
+
+```
+from llama_cpp import Llama
+
+model_path='lamm-mit/BioinspiredZephyr-7B/ggml-model-q5_K_M.gguf'
+chat_format="mistral-instruct"
+
+llm = Llama.from_pretrained(
+    repo_id=model_path,
+    filename="*q5_K_M.gguf",
+    verbose=True,
+    n_gpu_layers=-1, 
+    n_ctx=10000,
+    #main_gpu=0,
+    chat_format=chat_format,
+)
+```
+For inference:
+```
+def generate_BioinspiredZephyr_7B(system_prompt='You are an expert in biological materials, mechanics and related topics.',
+                                  prompt="What is spider silk?",
+                                  temperature=0.0,
+                                  max_tokens=10000,  
+                                  ):
+    if system_prompt==None:
+        messages=[
+            {"role": "user", "content": prompt},
+            ]
+    else:
+        messages=[
+            {"role": "system",  "content": system_prompt},
+            {"role": "user", "content": prompt},
+        ]
+
+    result=llm.create_chat_completion(
+            messages=messages,
+            temperature=temperature,
+            max_tokens=max_tokens,
+        )
+
+start_time = time.time()
+result=generate_BioinspiredZephyr_7B(system_prompt='You respond accurately.', 
+                        prompt="What is graphene? Answer with detail.",
+                        max_tokens=512, temperature=0.7,  )
+print (result)
+deltat=time.time() - start_time
+print("--- %s seconds ---" % deltat)
+toked=tokenizer(res)
+print ("Tokens per second (generation): ", len (toked['input_ids'])/deltat)
+```
+
+arXiv: https://arxiv.org/abs/2309.08788