This repository has been archived on 2025-08-26. You can view files and clone it, but cannot push or open issues or pull requests.
Files
enginex-mr_series-sherpa-onnx/sherpa-onnx/csrc/cppjieba-test.cc

145 lines
5.0 KiB
C++
Raw Normal View History

// sherpa-onnx/csrc/cppjieba-test.cc
//
// Copyright (c) 2024 Xiaomi Corporation
#include <iostream>
#include <regex>  // NOLINT
#include <sstream>
#include <string>
#include <vector>

#include "cppjieba/Jieba.hpp"
#include "gtest/gtest.h"
#include "sherpa-onnx/csrc/file-utils.h"
#include "sherpa-onnx/csrc/macros.h"
namespace sherpa_onnx {
// Paths of the cppjieba resource files used by the tests below. They are
// relative paths, so the "dict" directory must sit in the directory the test
// binary is run from. Please download the dict files from
// https://github.com/csukuangfj/cppjieba/releases/download/sherpa-onnx-2024-04-19/dict.tar.bz2
constexpr const char *kDictPath = "./dict/jieba.dict.utf8";
constexpr const char *kHmmPath = "./dict/hmm_model.utf8";
constexpr const char *kUserDictPath = "./dict/user.dict.utf8";
constexpr const char *kIdfPath = "./dict/idf.utf8";
constexpr const char *kStopWordPath = "./dict/stop_words.utf8";
// Demo-style smoke test for cppjieba. It exercises Cut (with HMM),
// CutForSearch, InsertUserWord and the CutForSearch overload that fills
// cppjieba::Word entries, and prints every result to stdout. The test makes no
// assertions; it only checks that the calls run. If the dictionary file is not
// present, it logs a message and returns early.
TEST(CppJieBa, Case1) {
  if (!FileExists(kDictPath)) {
    SHERPA_ONNX_LOGE("%s does not exist. Skipping test", kDictPath);
    return;
  }

  cppjieba::Jieba jieba(kDictPath, kHmmPath, kUserDictPath, kIdfPath,
                        kStopWordPath);

  // Output buffers reused by the segmentation calls below.
  std::vector<std::string> words;
  std::vector<cppjieba::Word> jiebawords;

  // Mixed Chinese/English input; the segments are printed joined by "/".
  std::string s = "他来到了网易杭研大厦。How are you?";
  std::cout << s << std::endl;
  std::cout << "[demo] Cut With HMM" << std::endl;
  jieba.Cut(s, words, true);
  std::cout << limonp::Join(words.begin(), words.end(), "/") << std::endl;

  // Search-mode segmentation; the segments are printed joined by "/".
  s = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
  std::cout << s << std::endl;
  std::cout << "[demo] CutForSearch" << std::endl;
  jieba.CutForSearch(s, words);
  std::cout << limonp::Join(words.begin(), words.end(), "/") << std::endl;

  // Segment the same phrase before and after registering it as a user word,
  // so the two printed lines can be compared.
  std::cout << "[demo] Insert User Word" << std::endl;
  jieba.Cut("男默女泪", words);
  std::cout << limonp::Join(words.begin(), words.end(), "/") << std::endl;
  jieba.InsertUserWord("男默女泪");
  jieba.Cut("男默女泪", words);
  std::cout << limonp::Join(words.begin(), words.end(), "/") << std::endl;

  // s still holds the CutForSearch sentence from above. Each printed entry
  // carries the word and its offset into s, for example
  //   [{"word": "小明", "offset": 0}, {"word": "硕士", "offset": 6},
  //    {"word": "毕业", "offset": 12}, ...]
  // NOTE(review): the sample offsets suggest byte offsets into the UTF-8
  // string; confirm against cppjieba.
  std::cout << "[demo] CutForSearch Word With Offset" << std::endl;
  jieba.CutForSearch(s, jiebawords, true);
  std::cout << jiebawords << std::endl;

  // see more test at
  // https://github.com/yanyiwu/cppjieba/blob/master/test/demo.cpp
}
// Segments a long sentence, then a sentence whose punctuation has been
// stripped, and prints each segmentation joined by "_". The test makes no
// assertions; it only checks that the calls run. If the dictionary file is not
// present, it logs a message and returns early.
TEST(CppJieBa, Case2) {
  if (!FileExists(kDictPath)) {
    SHERPA_ONNX_LOGE("%s does not exist. Skipping test", kDictPath);
    return;
  }

  cppjieba::Jieba jieba(kDictPath, kHmmPath, kUserDictPath, kIdfPath,
                        kStopWordPath);

  // Prints the given segments on one line, separated by "_".
  const auto print_joined = [](const std::vector<std::string> &segments) {
    std::ostringstream os;
    std::string sep;
    for (const auto &w : segments) {
      os << sep << w;
      sep = "_";
    }
    std::cout << os.str() << "\n";
  };

  std::string s =
      "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如"
      "涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感"
      "受着生命的奇迹与温柔";

  std::vector<std::string> words;
  const bool is_hmm = true;

  jieba.Cut(s, words, is_hmm);
  print_joined(words);

  s = "这里有:红的、绿的、蓝的;各种各样的颜色都有!你想要什么呢?测试.";

  // Remove every punctuation mark that occurs in s in a single pass. The
  // pattern must not contain empty alternatives: an empty alternative matches
  // the empty string at every position instead of a punctuation mark.
  // std::regex works on bytes here, so the multi-byte "、" is matched as a
  // literal byte sequence rather than inside the character class.
  const std::regex punct_re("[:;.?!]|、");
  const std::string s2 = std::regex_replace(s, punct_re, "");

  std::cout << s << "\n" << s2 << "\n";

  words.clear();
  jieba.Cut(s2, words, is_hmm);
  print_joined(words);
}
} // namespace sherpa_onnx