commit 1474267557ad50acef5a6321458f000c288686d1 Author: ModelHub XC Date: Wed May 13 15:53:24 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: Sahajtomar/french_semantic Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..07f0db3 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,16 @@ +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tar.gz filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text diff --git a/0_Transformer/config.json b/0_Transformer/config.json new file mode 100644 index 0000000..f3055d6 --- /dev/null +++ b/0_Transformer/config.json @@ -0,0 +1,27 @@ +{ + "_name_or_path": "camembert/camembert-large", + "architectures": [ + "CamembertModel" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 0, + "eos_token_id": 2, + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 514, + "model_type": "camembert", + "num_attention_heads": 16, + "num_hidden_layers": 24, + "output_past": true, + "pad_token_id": 1, + "position_embedding_type": "absolute", + "transformers_version": "4.5.0", + "type_vocab_size": 1, + "use_cache": true, + "vocab_size": 32005 +} diff --git a/0_Transformer/pytorch_model.bin b/0_Transformer/pytorch_model.bin new file mode 100644 index 
0000000..0525170 --- /dev/null +++ b/0_Transformer/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88aa459c9739a9cf47eae06afbfd6591b4897c587531d1c6b6dd305c2e4a9fd8 +size 1346812279 diff --git a/0_Transformer/sentence_bert_config.json b/0_Transformer/sentence_bert_config.json new file mode 100644 index 0000000..826687b --- /dev/null +++ b/0_Transformer/sentence_bert_config.json @@ -0,0 +1,4 @@ +{ + "max_seq_length": null, + "do_lower_case": false +} \ No newline at end of file diff --git a/0_Transformer/sentencepiece.bpe.model b/0_Transformer/sentencepiece.bpe.model new file mode 100644 index 0000000..b7246b0 --- /dev/null +++ b/0_Transformer/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f98f266fdc548c94216aaadc13ffaaafacf0c8793303e2195322d954549ea261 +size 808767 diff --git a/0_Transformer/special_tokens_map.json b/0_Transformer/special_tokens_map.json new file mode 100644 index 0000000..f063608 --- /dev/null +++ b/0_Transformer/special_tokens_map.json @@ -0,0 +1 @@ +{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}, "additional_special_tokens": ["<s>NOTUSED", "</s>NOTUSED"]} \ No newline at end of file diff --git a/0_Transformer/tokenizer_config.json b/0_Transformer/tokenizer_config.json new file mode 100644 index 0000000..1ab44bc --- /dev/null +++ b/0_Transformer/tokenizer_config.json @@ -0,0 +1 @@ +{"bos_token": "<s>", "eos_token": "</s>", "sep_token": "</s>", "cls_token": "<s>", "unk_token": "<unk>", "pad_token": "<pad>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "additional_special_tokens": ["<s>NOTUSED", "</s>NOTUSED"], "special_tokens_map_file": null, "name_or_path": "camembert/camembert-large"} \ No newline at end of file diff --git a/1_Pooling/config.json 
b/1_Pooling/config.json new file mode 100644 index 0000000..c95142e --- /dev/null +++ b/1_Pooling/config.json @@ -0,0 +1,7 @@ +{ + "word_embedding_dimension": 1024, + "pooling_mode_cls_token": false, + "pooling_mode_mean_tokens": true, + "pooling_mode_max_tokens": false, + "pooling_mode_mean_sqrt_len_tokens": false +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..a662d8a --- /dev/null +++ b/README.md @@ -0,0 +1,36 @@ +--- +language: fr +tags: +- semantic +- sentence-transformers +- sentence-similarity +- fr +datasets: +- sts +--- +# French STS +## STS dev (french) +87.4% +## STS test (french) +85.8% +#### STS pipeline +```python +!pip install -U sentence-transformers +from sentence_transformers import SentenceTransformer, util +model = SentenceTransformer('..model_path..') +sentences1 = ["J'aime mon téléphone", + "Mon téléphone n'est pas bon.", + "Votre téléphone portable est superbe."] + +sentences2 = ["Est-ce qu'il neige demain?", + "Récemment, de nombreux ouragans ont frappé les États-Unis", + "Le réchauffement climatique est réel",] +embeddings1 = model.encode(sentences1, convert_to_tensor=True) +embeddings2 = model.encode(sentences2, convert_to_tensor=True) +cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2) +for i in range(len(sentences1)): + for j in range(len(sentences2)): + print(cosine_scores[i][j]) +""" +""" +``` \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..e903939 --- /dev/null +++ b/config.json @@ -0,0 +1,3 @@ +{ + "__version__": "1.0.4" +} \ No newline at end of file diff --git a/modules.json b/modules.json new file mode 100644 index 0000000..f223a42 --- /dev/null +++ b/modules.json @@ -0,0 +1,14 @@ +[ + { + "idx": 0, + "name": "0", + "path": "0_Transformer", + "type": "sentence_transformers.models.Transformer" + }, + { + "idx": 1, + "name": "1", + "path": "1_Pooling", + "type": "sentence_transformers.models.Pooling" + } +] \ No 
newline at end of file diff --git a/similarity_evaluation_sts-dev_results.csv b/similarity_evaluation_sts-dev_results.csv new file mode 100644 index 0000000..9f7bb7a --- /dev/null +++ b/similarity_evaluation_sts-dev_results.csv @@ -0,0 +1,5 @@ +epoch,steps,cosine_pearson,cosine_spearman,euclidean_pearson,euclidean_spearman,manhattan_pearson,manhattan_spearman,dot_pearson,dot_spearman +0,-1,0.8637469125056083,0.8632614651072978,0.8453936664624152,0.8492903002476015,0.8449168387030057,0.8486647152951936,0.8209821074199751,0.8237147244632509 +1,-1,0.8678614404426834,0.8667458959821454,0.8547963868083709,0.8555799201285157,0.8542416151454125,0.8549764158923009,0.8380883904484107,0.8348539185207023 +2,-1,0.8740840204620244,0.8728015900675534,0.8611634365655535,0.8629708834338609,0.8609967994360048,0.8627099269702087,0.8483860456839277,0.8455799379123732 +3,-1,0.8744578309597704,0.8730785971619869,0.8624775706547827,0.8644809663305459,0.8623524115539193,0.864286517918685,0.8514432955875211,0.8483253736946224 diff --git a/similarity_evaluation_sts-test_results.csv b/similarity_evaluation_sts-test_results.csv new file mode 100644 index 0000000..84bbe09 --- /dev/null +++ b/similarity_evaluation_sts-test_results.csv @@ -0,0 +1,2 @@ +epoch,steps,cosine_pearson,cosine_spearman,euclidean_pearson,euclidean_spearman,manhattan_pearson,manhattan_spearman,dot_pearson,dot_spearman +-1,-1,0.85801451596893,0.8554828594121078,0.848554409521124,0.853157512282925,0.8485465810034418,0.8531020401425077,0.8291477964682629,0.8225415231180051