commit 4b95bb587b60ae68fc74e2e58a19975d02f74f66 Author: ModelHub XC Date: Sat Apr 11 10:50:01 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: Chat2DB/Chat2DB-SQL-7B Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..7bc225d --- /dev/null +++ b/.gitattributes @@ -0,0 +1,34 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..99bb36c --- /dev/null +++ 
b/README.md @@ -0,0 +1,63 @@ +# Chat2DB-GLM + + +## 简介 + +Chat2DB-GLM是[Chat2DB](https://github.com/chat2db/Chat2DB/)开源项目的组成部分,旨在提供一个高效的途径,将自然语言查询转换为结构化的SQL语句。此次开源的[Chat2DB-SQL-7B](https://huggingface.co/Chat2DB/Chat2DB-SQL-7B)模型,拥有7B参数,基于CodeLlama进行了精心微调。这一模型专为自然语言转SQL任务设计,支持多种SQL方言,并且具有高达16k的上下文长度处理能力。 + +## 方言支持 + +Chat2DB-SQL-7B模型支持广泛的SQL方言,包括但不限于MySQL、PostgreSQL、SQLite,以及其他通用的SQL方言。这一跨方言支持能力确保了模型的广泛适用性和灵活性。 + +## 模型效果 + +Chat2DB-SQL-7B模型在多个方言和SQL关键部分上都展现出了优异的性能。以下是模型在不同的SQL关键部分的表现概览,以通用SQL为例,基于Spider数据集进行的评测结果展示了模型在处理SQL各个关键部分和各类SQL函数(如日期函数、字符串函数等)上的能力。 + +| 方言 | select | where | group | order | function | total | +|:-----------|:------:|:-----:|:-----:|------:|:--------:|------:| +| Generic SQL | 91.5 | 83.7 | 80.5 | 98.2 | 96.2 | 77.3 | + +## 模型局限性与使用须知 + +Chat2DB-SQL-7B主要针对方言MySQL、PostgreSQL和通用SQL进行了微调。尽管对于其他SQL方言,此模型仍可提供基本的转换能力,但在处理特定方言的特殊函数(如日期函数、字符串函数等)时,可能会出现误差。随着数据集的变化,模型的性能也可能会有所不同。 + +请注意,此模型主要用于学术研究和学习目的。虽然我们努力确保模型输出的准确性,但不保证其在生产环境中的表现。使用此模型所产生的任何潜在损失,本项目及其贡献者概不负责。我们鼓励用户在使用模型时谨慎评估其在特定用例中的适用性。 + +## 模型推理 + +您可以通过transformers加载模型,参考如下样例代码段使用Chat2DB-SQL-7B模型,模型表现会随着prompt不同而有所不同,请尽量使用以下样例中的prompt范式。以下代码块中的model_path可以替换成您的本地模型路径。 + +```python +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline +model_path = "Chat2DB/Chat2DB-SQL-7B" # 此处可换成模型的本地路径 +tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) +model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto",trust_remote_code=True, torch_dtype=torch.float16,use_cache=True) +pipe = pipeline( "text-generation",model=model,tokenizer=tokenizer,return_full_text=False,max_new_tokens=100) +prompt = "### Database Schema\n\n['CREATE TABLE \"stadium\" (\\n\"Stadium_ID\" int,\\n\"Location\" text,\\n\"Name\" text,\\n\"Capacity\" int,\\n\"Highest\" int,\\n\"Lowest\" int,\\n\"Average\" int,\\nPRIMARY KEY (\"Stadium_ID\")\\n);', 'CREATE TABLE \"singer\" (\\n\"Singer_ID\" int,\\n\"Name\" text,\\n\"Country\" text,\\n\"Song_Name\" 
text,\\n\"Song_release_year\" text,\\n\"Age\" int,\\n\"Is_male\" bool,\\nPRIMARY KEY (\"Singer_ID\")\\n);', 'CREATE TABLE \"concert\" (\\n\"concert_ID\" int,\\n\"concert_Name\" text,\\n\"Theme\" text,\\n\"Stadium_ID\" text,\\n\"Year\" text,\\nPRIMARY KEY (\"concert_ID\"),\\nFOREIGN KEY (\"Stadium_ID\") REFERENCES \"stadium\"(\"Stadium_ID\")\\n);', 'CREATE TABLE \"singer_in_concert\" (\\n\"concert_ID\" int,\\n\"Singer_ID\" text,\\nPRIMARY KEY (\"concert_ID\",\"Singer_ID\"),\\nFOREIGN KEY (\"concert_ID\") REFERENCES \"concert\"(\"concert_ID\"),\\nFOREIGN KEY (\"Singer_ID\") REFERENCES \"singer\"(\"Singer_ID\")\\n);']\n\n\n### Task \n\n基于提供的database schema信息,How many singers do we have?[SQL]\n" +response = pipe(prompt)[0]["generated_text"] +print(response) +``` + +## 硬件要求 + +| 模型 | 最低GPU显存(推理) | 最低GPU显存(参数高效微调) | +|:--------------:|:-----------------:|:-------------------------:| +| Chat2DB-SQL-7B | 14GB | 20GB | + + + +## 贡献指南 +我们欢迎并鼓励社区成员对Chat2DB-GLM项目进行贡献。无论是通过报告问题、提出新功能,还是直接提交代码修复和改进,您的帮助都是非常宝贵的。 + +如果您有兴趣贡献,请遵循我们的贡献指南: + +- 报告问题:通过GitHub Issues报告遇到的任何问题或错误。 +- 提交拉取请求:如果您想直接为代码库贡献,请先fork仓库,然后提交拉取请求(PR)。 +- 改进文档:欢迎提供最佳实践、示例代码、文档改进等。 + + +## 许可证 +本项目中的模型权重受Code Llama的自定义商业许可证约束。详情请访问:[自定义商业许可证](https://llama.meta.com/llama-downloads/)。 + +在使用此软件之前,请确保您已充分理解许可证的条款。 diff --git a/config.json b/config.json new file mode 100644 index 0000000..b8e9527 --- /dev/null +++ b/config.json @@ -0,0 +1,26 @@ +{ + "_name_or_path": "/root/models/cl1-mys-pg/", + "architectures": [ + "LlamaForCausalLM" + ], + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 11008, + "max_position_embeddings": 16384, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 1000000, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.33.2", + 
"use_cache": true, + "vocab_size": 32016 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..7a03f6d --- /dev/null +++ b/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "transformers_version": "4.33.2" +} diff --git a/pytorch_model-00001-of-00002.bin b/pytorch_model-00001-of-00002.bin new file mode 100644 index 0000000..692e9a5 --- /dev/null +++ b/pytorch_model-00001-of-00002.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30f3bd3b5d0dfc4dd1c3b4e79fccf01d49d0d7f51792b2813b11fe40db14143e +size 9976751681 diff --git a/pytorch_model-00002-of-00002.bin b/pytorch_model-00002-of-00002.bin new file mode 100644 index 0000000..1ca60c9 --- /dev/null +++ b/pytorch_model-00002-of-00002.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bbd54d311b128dab2cc14408117c973a5b8fa339ad323ddeba5bde5edee72a8 +size 3500442334 diff --git a/pytorch_model.bin.index.json b/pytorch_model.bin.index.json new file mode 100644 index 0000000..c026802 --- /dev/null +++ b/pytorch_model.bin.index.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ca9b8f5a8f500eafd4bdcbd52898a75834f2f5d596630cdca4da3c30034b625 +size 23950 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..d85ba6c --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "</s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "<unk>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000..f6722e8 --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6 +size 500058 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..f6450f4 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,38 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "bos_token": { + "__type": "AddedToken", + "content": "<s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "clean_up_tokenization_spaces": false, + "eos_token": { + "__type": "AddedToken", + "content": "</s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "legacy": null, + "model_max_length": 1000000000000000019884624838656, + "pad_token": null, + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": { + "__type": "AddedToken", + "content": "<unk>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "use_default_system_prompt": true +}