# CosmosLLaMa GGUFs

## Objective
Real-time applications call for quantized models, so we provide our models in the GGUF format. These models build on the GGML project, which aims to democratize the use of large models. Depending on the quantization type, there are 20+ model files.

### Features
* Hugging Face lists the quantization details for each file in the panel on the right of the model page.
* All models have been tested in `llama.cpp` environments, with both `llama-cli` and `llama-server` (see *Querying `llama-server`* below).
* Furthermore, a YouTube video has been made that introduces the basics of running these models with LM Studio.

### Code Example
Usage example with `llama-cpp-python`:

```py
from llama_cpp import Llama

# Recommended inference parameters. The prefix/suffix strings follow the
# Llama 3 chat template. Some keys (e.g. "memory_f16", "penalize_nl") are
# LM Studio preset values and are not consumed by llama-cpp-python.
inference_params = {
    "n_threads": 4,
    "n_predict": -1,  # <= 0 means: generate until the context is full
    "top_k": 40,
    "min_p": 0.05,
    "top_p": 0.95,
    "temp": 0.8,
    "repeat_penalty": 1.1,
    "input_prefix": "<|start_header_id|>user<|end_header_id|>\n\n",
    "input_suffix": "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
    "antiprompt": [],
    # Turkish system prompt: "You are an AI assistant. The user will give you
    # a task. Your aim is to complete the task as faithfully as you can."
    "pre_prompt": "Sen bir yapay zeka asistanısın. Kullanıcı sana bir görev verecek. Amacın görevi olabildiğince sadık bir şekilde tamamlamak.",
    "pre_prompt_suffix": "<|eot_id|>",
    "pre_prompt_prefix": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n",
    "seed": -1,
    "tfs_z": 1,
    "typical_p": 1,
    "repeat_last_n": 64,
    "frequency_penalty": 0,
    "presence_penalty": 0,
    "n_keep": 0,
    "logit_bias": {},
    "mirostat": 0,
    "mirostat_tau": 5,
    "mirostat_eta": 0.1,
    "memory_f16": True,
    "multiline_input": False,
    "penalize_nl": True,
}

# Download the matching GGUF file from the Hub and initialize the model
llama = Llama.from_pretrained(
    repo_id="ytu-ce-cosmos/Turkish-Llama-8b-Instruct-v0.1-GGUF",
    filename="*Q4_K.gguf",
    n_threads=inference_params["n_threads"],
    verbose=False,
)

# Example input: "What is the capital of Türkiye?"
user_input = "Türkiye'nin başkenti neresidir?"

# Construct the Llama 3 prompt: system block, user block, assistant header
prompt = (
    f"{inference_params['pre_prompt_prefix']}{inference_params['pre_prompt']}"
    f"{inference_params['pre_prompt_suffix']}"
    f"{inference_params['input_prefix']}{user_input}{inference_params['input_suffix']}"
)

# Generate the response with the recommended sampling settings
response = llama(
    prompt,
    max_tokens=inference_params["n_predict"],
    temperature=inference_params["temp"],
    top_k=inference_params["top_k"],
    top_p=inference_params["top_p"],
    min_p=inference_params["min_p"],
    repeat_penalty=inference_params["repeat_penalty"],
)

# Output the response
print(response["choices"][0]["text"])
```

The prompt does not have to be assembled by hand; a `create_chat_completion` alternative is sketched below.

The quantization was done with `llama.cpp`; in our experience, this method gives the most stable results.

As expected, the highest-bit models show the best inference quality. However, inference time is broadly similar across the low-bit models.

Each model's memory footprint can be estimated from the quantization docs of either [Hugging Face](https://huggingface.co/docs/transformers/main/en/quantization/overview) or [llama.cpp](https://github.com/ggerganov/llama.cpp/tree/master/examples/quantize); a back-of-envelope sketch is given in *Estimating memory footprint* below.
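### Alternative: `create_chat_completion`

As a convenience, `llama-cpp-python` can apply the chat template stored in the GGUF metadata itself, so the Llama 3 special tokens never have to be typed out. The snippet below is a minimal sketch equivalent to the example above; the shortened system prompt is illustrative.

```py
from llama_cpp import Llama

llama = Llama.from_pretrained(
    repo_id="ytu-ce-cosmos/Turkish-Llama-8b-Instruct-v0.1-GGUF",
    filename="*Q4_K.gguf",
    verbose=False,
)

# The chat template is read from the GGUF metadata, so plain role/content
# messages suffice; no special tokens are needed.
response = llama.create_chat_completion(
    messages=[
        # Turkish for "You are an AI assistant."
        {"role": "system", "content": "Sen bir yapay zeka asistanısın."},
        # Turkish for "What is the capital of Türkiye?"
        {"role": "user", "content": "Türkiye'nin başkenti neresidir?"},
    ],
    temperature=0.8,
    top_p=0.95,
)
print(response["choices"][0]["message"]["content"])
```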
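### Querying `llama-server`

Since the models were also tested with `llama-server`, here is a minimal sketch of querying a running server from Python. It assumes a server you have started separately with one of these GGUF files, listening on `llama-server`'s default `localhost:8080`; the endpoint is the server's OpenAI-compatible chat API, and the sampling values are illustrative.

```py
import requests

# Assumption: a llama-server instance is already running on localhost:8080
# (the default) with one of the GGUF files from this repository loaded.
URL = "http://localhost:8080/v1/chat/completions"

payload = {
    "messages": [
        # Turkish for "You are an AI assistant."
        {"role": "system", "content": "Sen bir yapay zeka asistanısın."},
        # Turkish for "What is the capital of Türkiye?"
        {"role": "user", "content": "Türkiye'nin başkenti neresidir?"},
    ],
    "temperature": 0.8,
    "top_p": 0.95,
}

response = requests.post(URL, json=payload, timeout=120)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])
```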
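### Estimating memory footprint

The back-of-envelope estimate mentioned above multiplies the parameter count by the approximate bits per weight of the quantization type. The bits-per-weight figures below are rough assumptions for common `llama.cpp` quant formats, and ~8.03B is the parameter count of the Llama 3 8B base model; consult the linked docs for exact numbers.

```py
# size_bytes ≈ parameter_count * bits_per_weight / 8 (plus runtime overhead
# for the KV cache and activations).

# Approximate bits per weight for a few llama.cpp quant types (assumed
# round figures; see the llama.cpp quantize docs for exact values).
BITS_PER_WEIGHT = {
    "Q2_K": 2.6,
    "Q4_K": 4.5,
    "Q6_K": 6.6,
    "Q8_0": 8.5,
    "F16": 16.0,
}

N_PARAMS = 8.03e9  # Llama 3 8B has ~8.03 billion parameters

for quant, bpw in BITS_PER_WEIGHT.items():
    size_gb = N_PARAMS * bpw / 8 / 1e9
    print(f"{quant:>5}: ~{size_gb:.1f} GB")

# Example: Q4_K -> 8.03e9 * 4.5 / 8 ≈ 4.5 GB, in line with the ~4-5 GB
# file sizes typically seen for 8B Q4 GGUFs.
```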
## Contact
*Feel free to contact us whenever you run into any problems :)*

COSMOS AI Research Group, Yildiz Technical University, Computer Engineering Department  
https://cosmos.yildiz.edu.tr/  
cosmos@yildiz.edu.tr