Fix splitting utf8 string into words (#385)
This commit is contained in:
@@ -1,7 +1,7 @@
|
|||||||
cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
|
cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
|
||||||
project(sherpa-onnx)
|
project(sherpa-onnx)
|
||||||
|
|
||||||
set(SHERPA_ONNX_VERSION "1.8.4")
|
set(SHERPA_ONNX_VERSION "1.8.5")
|
||||||
|
|
||||||
# Disable warning about
|
# Disable warning about
|
||||||
#
|
#
|
||||||
@@ -175,8 +175,6 @@ if(SHERPA_ONNX_ENABLE_WEBSOCKET)
|
|||||||
include(asio)
|
include(asio)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
include(utfcpp)
|
|
||||||
|
|
||||||
add_subdirectory(sherpa-onnx)
|
add_subdirectory(sherpa-onnx)
|
||||||
|
|
||||||
if(SHERPA_ONNX_ENABLE_C_API)
|
if(SHERPA_ONNX_ENABLE_C_API)
|
||||||
|
|||||||
@@ -1,45 +0,0 @@
|
|||||||
function(download_utfcpp)
|
|
||||||
include(FetchContent)
|
|
||||||
|
|
||||||
set(utfcpp_URL "https://github.com/nemtrif/utfcpp/archive/refs/tags/v3.2.5.tar.gz")
|
|
||||||
set(utfcpp_URL2 "https://huggingface.co/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/utfcpp-3.2.5.tar.gz")
|
|
||||||
set(utfcpp_HASH "SHA256=14fd1b3c466814cb4c40771b7f207b61d2c7a0aa6a5e620ca05c00df27f25afd")
|
|
||||||
|
|
||||||
# If you don't have access to the Internet,
|
|
||||||
# please pre-download utfcpp
|
|
||||||
set(possible_file_locations
|
|
||||||
$ENV{HOME}/Downloads/utfcpp-3.2.5.tar.gz
|
|
||||||
${PROJECT_SOURCE_DIR}/utfcpp-3.2.5.tar.gz
|
|
||||||
${PROJECT_BINARY_DIR}/utfcpp-3.2.5.tar.gz
|
|
||||||
/tmp/utfcpp-3.2.5.tar.gz
|
|
||||||
/star-fj/fangjun/download/github/utfcpp-3.2.5.tar.gz
|
|
||||||
)
|
|
||||||
|
|
||||||
foreach(f IN LISTS possible_file_locations)
|
|
||||||
if(EXISTS ${f})
|
|
||||||
set(utfcpp_URL "${f}")
|
|
||||||
file(TO_CMAKE_PATH "${utfcpp_URL}" utfcpp_URL)
|
|
||||||
message(STATUS "Found local downloaded utfcpp: ${utfcpp_URL}")
|
|
||||||
set(utfcpp_URL2)
|
|
||||||
break()
|
|
||||||
endif()
|
|
||||||
endforeach()
|
|
||||||
|
|
||||||
FetchContent_Declare(utfcpp
|
|
||||||
URL
|
|
||||||
${utfcpp_URL}
|
|
||||||
${utfcpp_URL2}
|
|
||||||
URL_HASH ${utfcpp_HASH}
|
|
||||||
)
|
|
||||||
|
|
||||||
FetchContent_GetProperties(utfcpp)
|
|
||||||
if(NOT utfcpp_POPULATED)
|
|
||||||
message(STATUS "Downloading utfcpp from ${utfcpp_URL}")
|
|
||||||
FetchContent_Populate(utfcpp)
|
|
||||||
endif()
|
|
||||||
message(STATUS "utfcpp is downloaded to ${utfcpp_SOURCE_DIR}")
|
|
||||||
# add_subdirectory(${utfcpp_SOURCE_DIR} ${utfcpp_BINARY_DIR} EXCLUDE_FROM_ALL)
|
|
||||||
include_directories(${utfcpp_SOURCE_DIR})
|
|
||||||
endfunction()
|
|
||||||
|
|
||||||
download_utfcpp()
|
|
||||||
@@ -16,7 +16,7 @@
|
|||||||
#include <utility>
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "source/utf8.h"
|
#include "sherpa-onnx/csrc/macros.h"
|
||||||
|
|
||||||
// This file is copied/modified from
|
// This file is copied/modified from
|
||||||
// https://github.com/kaldi-asr/kaldi/blob/master/src/util/text-utils.cc
|
// https://github.com/kaldi-asr/kaldi/blob/master/src/util/text-utils.cc
|
||||||
@@ -163,56 +163,39 @@ template bool SplitStringToFloats(const std::string &full, const char *delim,
|
|||||||
std::vector<double> *out);
|
std::vector<double> *out);
|
||||||
|
|
||||||
std::vector<std::string> SplitUtf8(const std::string &text) {
|
std::vector<std::string> SplitUtf8(const std::string &text) {
|
||||||
char *begin = const_cast<char *>(text.c_str());
|
const uint8_t *begin = reinterpret_cast<const uint8_t *>(text.c_str());
|
||||||
char *end = begin + text.size();
|
const uint8_t *end = begin + text.size();
|
||||||
|
|
||||||
std::vector<std::string> ans;
|
std::vector<std::string> ans;
|
||||||
std::string buf;
|
|
||||||
|
|
||||||
while (begin < end) {
|
auto start = begin;
|
||||||
uint32_t code = utf8::next(begin, end);
|
while (start < end) {
|
||||||
|
uint8_t c = *start;
|
||||||
|
uint8_t i = 0x80;
|
||||||
|
int32_t num_bytes = 0;
|
||||||
|
|
||||||
// 1. is punctuation
|
// see
|
||||||
if (std::ispunct(code)) {
|
// https://en.wikipedia.org/wiki/UTF-8
|
||||||
if (!buf.empty()) {
|
for (; c & i; i >>= 1) {
|
||||||
ans.push_back(std::move(buf));
|
++num_bytes;
|
||||||
}
|
|
||||||
|
|
||||||
char s[5] = {0};
|
|
||||||
utf8::append(code, s);
|
|
||||||
ans.push_back(s);
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2. is space
|
if (num_bytes == 0) {
|
||||||
if (std::isspace(code)) {
|
// this is an ascii
|
||||||
if (!buf.empty()) {
|
ans.emplace_back(reinterpret_cast<const char *>(start), 1);
|
||||||
ans.push_back(std::move(buf));
|
++start;
|
||||||
}
|
} else if (2 <= num_bytes && num_bytes <= 4) {
|
||||||
continue;
|
ans.emplace_back(reinterpret_cast<const char *>(start), num_bytes);
|
||||||
|
start += num_bytes;
|
||||||
|
} else {
|
||||||
|
SHERPA_ONNX_LOGE("Invalid byte at position: %d",
|
||||||
|
static_cast<int32_t>(start - begin));
|
||||||
|
// skip this byte
|
||||||
|
++start;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 3. is alpha
|
|
||||||
if (std::isalpha(code)) {
|
|
||||||
buf.push_back(code);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!buf.empty()) {
|
|
||||||
ans.push_back(std::move(buf));
|
|
||||||
}
|
|
||||||
|
|
||||||
// for others
|
|
||||||
|
|
||||||
char s[5] = {0};
|
|
||||||
utf8::append(code, s);
|
|
||||||
ans.push_back(s);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!buf.empty()) {
|
|
||||||
ans.push_back(std::move(buf));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return ans;
|
return ans;
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace sherpa_onnx
|
} // namespace sherpa_onnx
|
||||||
|
|||||||
Reference in New Issue
Block a user