commit 5df0a328f0c353fb46a2d575997bb2203859063c Author: ModelHub XC Date: Thu May 28 03:58:16 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: Youmnaaaa/Semantic-model Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..859b01c --- /dev/null +++ b/.gitattributes @@ -0,0 +1,37 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +model/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff 
--git a/1_Pooling/config.json b/1_Pooling/config.json new file mode 100644 index 0000000..5bd0b52 --- /dev/null +++ b/1_Pooling/config.json @@ -0,0 +1,10 @@ +{ + "word_embedding_dimension": 384, + "pooling_mode_cls_token": false, + "pooling_mode_mean_tokens": true, + "pooling_mode_max_tokens": false, + "pooling_mode_mean_sqrt_len_tokens": false, + "pooling_mode_weightedmean_tokens": false, + "pooling_mode_lasttoken": false, + "include_prompt": true +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..6bedb7f --- /dev/null +++ b/README.md @@ -0,0 +1,156 @@ +--- +language: +- multilingual +- ar +- bg +- ca +- cs +- da +- de +- el +- en +- es +- et +- fa +- fi +- fr +- gl +- gu +- he +- hi +- hr +- hu +- hy +- id +- it +- ja +- ka +- ko +- ku +- lt +- lv +- mk +- mn +- mr +- ms +- my +- nb +- nl +- pl +- pt +- ro +- ru +- sk +- sl +- sq +- sr +- sv +- th +- tr +- uk +- ur +- vi +license: apache-2.0 +library_name: sentence-transformers +tags: +- sentence-transformers +- feature-extraction +- sentence-similarity +- transformers +language_bcp47: +- fr-ca +- pt-br +- zh-cn +- zh-tw +pipeline_tag: sentence-similarity +--- + +# sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 + +This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search. 
+ + + +## Usage (Sentence-Transformers) + +Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed: + +``` +pip install -U sentence-transformers +``` + +Then you can use the model like this: + +```python +from sentence_transformers import SentenceTransformer +sentences = ["This is an example sentence", "Each sentence is converted"] + +model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2') +embeddings = model.encode(sentences) +print(embeddings) +``` + + + +## Usage (HuggingFace Transformers) +Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings. + +```python +from transformers import AutoTokenizer, AutoModel +import torch + + +# Mean Pooling - Take attention mask into account for correct averaging +def mean_pooling(model_output, attention_mask): + token_embeddings = model_output[0] #First element of model_output contains all token embeddings + input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) + + +# Sentences we want sentence embeddings for +sentences = ['This is an example sentence', 'Each sentence is converted'] + +# Load model from HuggingFace Hub +tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2') +model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2') + +# Tokenize sentences +encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt') + +# Compute token embeddings +with torch.no_grad(): + model_output = model(**encoded_input) + +# Perform pooling. In this case, mean pooling. 
+sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']) + +print("Sentence embeddings:") +print(sentence_embeddings) +``` + + + +## Full Model Architecture +``` +SentenceTransformer( + (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel + (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False}) +) +``` + +## Citing & Authors + +This model was trained by [sentence-transformers](https://www.sbert.net/). + +If you find this model helpful, feel free to cite our publication [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](https://arxiv.org/abs/1908.10084): +```bibtex +@inproceedings{reimers-2019-sentence-bert, + title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks", + author = "Reimers, Nils and Gurevych, Iryna", + booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing", + month = "11", + year = "2019", + publisher = "Association for Computational Linguistics", + url = "http://arxiv.org/abs/1908.10084", +} +``` \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..cca09ef --- /dev/null +++ b/config.json @@ -0,0 +1,30 @@ +{ + "add_cross_attention": false, + "architectures": [ + "BertModel" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": null, + "classifier_dropout": null, + "dtype": "float32", + "eos_token_id": null, + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 384, + "initializer_range": 0.02, + "intermediate_size": 1536, + "is_decoder": false, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "position_embedding_type": "absolute", + 
"tie_word_embeddings": true, + "transformers_version": "5.0.0", + "type_vocab_size": 2, + "use_cache": true, + "vocab_size": 250037 +} diff --git a/config_sentence_transformers.json b/config_sentence_transformers.json new file mode 100644 index 0000000..4f21ef5 --- /dev/null +++ b/config_sentence_transformers.json @@ -0,0 +1,14 @@ +{ + "__version__": { + "sentence_transformers": "5.3.0", + "transformers": "5.0.0", + "pytorch": "2.10.0+cpu" + }, + "model_type": "SentenceTransformer", + "prompts": { + "query": "", + "document": "" + }, + "default_prompt_name": null, + "similarity_fn_name": "cosine" +} \ No newline at end of file diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..0b7d1a1 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40576ad50be15cc77304f9f1404ba8e56aab722790ac98e01b0b03838e3639c4 +size 470637392 diff --git a/model/1_Pooling/config.json b/model/1_Pooling/config.json new file mode 100644 index 0000000..94d474b --- /dev/null +++ b/model/1_Pooling/config.json @@ -0,0 +1,5 @@ +{ + "embedding_dimension": 384, + "pooling_mode": "mean", + "include_prompt": true +} \ No newline at end of file diff --git a/model/README.md b/model/README.md new file mode 100644 index 0000000..6bedb7f --- /dev/null +++ b/model/README.md @@ -0,0 +1,156 @@ +--- +language: +- multilingual +- ar +- bg +- ca +- cs +- da +- de +- el +- en +- es +- et +- fa +- fi +- fr +- gl +- gu +- he +- hi +- hr +- hu +- hy +- id +- it +- ja +- ka +- ko +- ku +- lt +- lv +- mk +- mn +- mr +- ms +- my +- nb +- nl +- pl +- pt +- ro +- ru +- sk +- sl +- sq +- sr +- sv +- th +- tr +- uk +- ur +- vi +license: apache-2.0 +library_name: sentence-transformers +tags: +- sentence-transformers +- feature-extraction +- sentence-similarity +- transformers +language_bcp47: +- fr-ca +- pt-br +- zh-cn +- zh-tw +pipeline_tag: sentence-similarity +--- + +# sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 + 
+This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search. + + + +## Usage (Sentence-Transformers) + +Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed: + +``` +pip install -U sentence-transformers +``` + +Then you can use the model like this: + +```python +from sentence_transformers import SentenceTransformer +sentences = ["This is an example sentence", "Each sentence is converted"] + +model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2') +embeddings = model.encode(sentences) +print(embeddings) +``` + + + +## Usage (HuggingFace Transformers) +Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings. 
+ +```python +from transformers import AutoTokenizer, AutoModel +import torch + + +# Mean Pooling - Take attention mask into account for correct averaging +def mean_pooling(model_output, attention_mask): + token_embeddings = model_output[0] #First element of model_output contains all token embeddings + input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) + + +# Sentences we want sentence embeddings for +sentences = ['This is an example sentence', 'Each sentence is converted'] + +# Load model from HuggingFace Hub +tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2') +model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2') + +# Tokenize sentences +encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt') + +# Compute token embeddings +with torch.no_grad(): + model_output = model(**encoded_input) + +# Perform pooling. In this case, mean pooling. +sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']) + +print("Sentence embeddings:") +print(sentence_embeddings) +``` + + + +## Full Model Architecture +``` +SentenceTransformer( + (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel + (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False}) +) +``` + +## Citing & Authors + +This model was trained by [sentence-transformers](https://www.sbert.net/). 
+ +If you find this model helpful, feel free to cite our publication [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](https://arxiv.org/abs/1908.10084): +```bibtex +@inproceedings{reimers-2019-sentence-bert, + title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks", + author = "Reimers, Nils and Gurevych, Iryna", + booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing", + month = "11", + year = "2019", + publisher = "Association for Computational Linguistics", + url = "http://arxiv.org/abs/1908.10084", +} +``` \ No newline at end of file diff --git a/model/config.json b/model/config.json new file mode 100644 index 0000000..cca09ef --- /dev/null +++ b/model/config.json @@ -0,0 +1,30 @@ +{ + "add_cross_attention": false, + "architectures": [ + "BertModel" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": null, + "classifier_dropout": null, + "dtype": "float32", + "eos_token_id": null, + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 384, + "initializer_range": 0.02, + "intermediate_size": 1536, + "is_decoder": false, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "position_embedding_type": "absolute", + "tie_word_embeddings": true, + "transformers_version": "5.0.0", + "type_vocab_size": 2, + "use_cache": true, + "vocab_size": 250037 +} diff --git a/model/config_sentence_transformers.json b/model/config_sentence_transformers.json new file mode 100644 index 0000000..77b94b3 --- /dev/null +++ b/model/config_sentence_transformers.json @@ -0,0 +1,14 @@ +{ + "__version__": { + "pytorch": "2.10.0+cpu", + "sentence_transformers": "5.4.1", + "transformers": "5.0.0" + }, + "default_prompt_name": null, + "model_type": "SentenceTransformer", + "prompts": { + "document": "", + "query": "" + }, + "similarity_fn_name": 
"cosine" +} \ No newline at end of file diff --git a/model/model.safetensors b/model/model.safetensors new file mode 100644 index 0000000..0b7d1a1 --- /dev/null +++ b/model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40576ad50be15cc77304f9f1404ba8e56aab722790ac98e01b0b03838e3639c4 +size 470637392 diff --git a/model/modules.json b/model/modules.json new file mode 100644 index 0000000..45d2436 --- /dev/null +++ b/model/modules.json @@ -0,0 +1,14 @@ +[ + { + "idx": 0, + "name": "0", + "path": "", + "type": "sentence_transformers.base.modules.transformer.Transformer" + }, + { + "idx": 1, + "name": "1", + "path": "1_Pooling", + "type": "sentence_transformers.sentence_transformer.modules.pooling.Pooling" + } +] \ No newline at end of file diff --git a/model/sentence_bert_config.json b/model/sentence_bert_config.json new file mode 100644 index 0000000..d2cd158 --- /dev/null +++ b/model/sentence_bert_config.json @@ -0,0 +1,10 @@ +{ + "transformer_task": "feature-extraction", + "modality_config": { + "text": { + "method": "forward", + "method_output_name": "last_hidden_state" + } + }, + "module_output_name": "token_embeddings" +} \ No newline at end of file diff --git a/model/tokenizer.json b/model/tokenizer.json new file mode 100644 index 0000000..e342094 --- /dev/null +++ b/model/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cad551d5600a84242d0973327029452a1e3672ba6313c2a3c3d69c4310e12719 +size 17082987 diff --git a/model/tokenizer_config.json b/model/tokenizer_config.json new file mode 100644 index 0000000..9ef1360 --- /dev/null +++ b/model/tokenizer_config.json @@ -0,0 +1,23 @@ +{ + "backend": "tokenizers", + "bos_token": "", + "cls_token": "", + "do_lower_case": true, + "eos_token": "", + "is_local": false, + "mask_token": "", + "max_length": 128, + "model_max_length": 128, + "pad_to_multiple_of": null, + "pad_token": "", + "pad_token_type_id": 0, + "padding_side": "right", + 
"sep_token": "", + "stride": 0, + "strip_accents": null, + "tokenize_chinese_chars": true, + "tokenizer_class": "TokenizersBackend", + "truncation_side": "right", + "truncation_strategy": "longest_first", + "unk_token": "" +} diff --git a/modules.json b/modules.json new file mode 100644 index 0000000..f7640f9 --- /dev/null +++ b/modules.json @@ -0,0 +1,14 @@ +[ + { + "idx": 0, + "name": "0", + "path": "", + "type": "sentence_transformers.models.Transformer" + }, + { + "idx": 1, + "name": "1", + "path": "1_Pooling", + "type": "sentence_transformers.models.Pooling" + } +] \ No newline at end of file diff --git a/places_clean.csv b/places_clean.csv new file mode 100644 index 0000000..6174279 --- /dev/null +++ b/places_clean.csv @@ -0,0 +1,10 @@ +place_id,name,category,rating,review_count,lat,lon,sub_category,location,address,price_range,opening_hours,description,phone,name_clean,category_clean,sub_category_clean,location_clean,address_clean,price_range_clean,search_text_clean,interaction_count,popularity_score +29,sultan elsham,Restaurant,3.0,1,29.081734,31.098136,,,,,,,,sultan elsham,restaurant,,,,,sultan elsham restaurant,59.0,0.686046511627907 +28,sultan elsham,Restaurant,0.0,0,29.047673,31.117448,,,,,,,,sultan elsham,restaurant,,,,,sultan elsham restaurant,35.0,0.4069767441860465 +26,صيدلية د.عبدالله محمد,Pharmacy,0.0,0,29.066206,31.086601,,,,,,,,صيدليه د عبدالله محمد,pharmacy,,,,,صيدليه د عبدالله محمد pharmacy,0.0,0.0 +24,sultan elsham,Restaurant,2.5,2,29.044374,31.1223,,,,,,,,sultan elsham,restaurant,,,,,sultan elsham restaurant,86.0,1.0 +32,Bolivar,Restaurant & Café,0.0,0,29.067624,31.110061,,,,,,,,bolivar,restaurant café,,,,,bolivar restaurant café,17.0,0.19767441860465115 +33,Antika,Cafe,0.0,0,29.078952,31.112817,,,,,,,,antika,cafe,,,,,antika cafe,20.0,0.23255813953488372 +15,On Public,Restaurant,0.0,0,29.062997,31.100496,,,,,,,,on public,restaurant,,,,,on public restaurant,14.0,0.16279069767441862 +30,صيدلية د.عبدالله 
محمد,Pharmacy,2.0,1,29.070685,31.098249,,,,,,,,صيدليه د عبدالله محمد,pharmacy,,,,,صيدليه د عبدالله محمد pharmacy,21.0,0.2441860465116279 +31,هايبر عباد الرحمن,supermarket,0.0,0,29.024021,31.101646,,,,,,,,هايبر عباد الرحمن,supermarket,,,,,هايبر عباد الرحمن supermarket,23.0,0.26744186046511625 diff --git a/semantic_data.pkl b/semantic_data.pkl new file mode 100644 index 0000000..16094ce --- /dev/null +++ b/semantic_data.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a51e6b74d082bdc05eb0776b2b6ee4d6ee422d41f2b13213a673957ee1f923be +size 32659 diff --git a/sentence_bert_config.json b/sentence_bert_config.json new file mode 100644 index 0000000..c0e22f7 --- /dev/null +++ b/sentence_bert_config.json @@ -0,0 +1,4 @@ +{ + "max_seq_length": 128, + "do_lower_case": false +} \ No newline at end of file diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..e342094 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cad551d5600a84242d0973327029452a1e3672ba6313c2a3c3d69c4310e12719 +size 17082987 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..4669596 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,17 @@ +{ + "bos_token": "", + "cls_token": "", + "do_lower_case": true, + "eos_token": "", + "mask_token": "", + "model_max_length": 128, + "pad_token": "", + "pad_token_type_id": 0, + "padding_side": "right", + "sep_token": "", + "strip_accents": null, + "tokenize_chinese_chars": true, + "tokenizer_class": "XLMRobertaTokenizer", + "truncation_side": "right", + "unk_token": "" +} \ No newline at end of file