Support printing input text and words after splitting (#376)
This commit is contained in:
@@ -76,7 +76,9 @@ static std::vector<int32_t> ConvertTokensToIds(
|
||||
}
|
||||
|
||||
Lexicon::Lexicon(const std::string &lexicon, const std::string &tokens,
|
||||
const std::string &punctuations, const std::string &language) {
|
||||
const std::string &punctuations, const std::string &language,
|
||||
bool debug /*= false*/)
|
||||
: debug_(debug) {
|
||||
InitLanguage(language);
|
||||
InitTokens(tokens);
|
||||
InitLexicon(lexicon);
|
||||
@@ -102,6 +104,20 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIdsChinese(
|
||||
const std::string &text) const {
|
||||
std::vector<std::string> words = SplitUtf8(text);
|
||||
|
||||
if (debug_) {
|
||||
fprintf(stderr, "Input text in string: %s\n", text.c_str());
|
||||
fprintf(stderr, "Input text in bytes:");
|
||||
for (uint8_t c : text) {
|
||||
fprintf(stderr, " %02x", c);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "After splitting to words:");
|
||||
for (const auto &w : words) {
|
||||
fprintf(stderr, " %s", w.c_str());
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
std::vector<int64_t> ans;
|
||||
|
||||
auto sil = token2id_.at("sil");
|
||||
@@ -134,6 +150,21 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIdsEnglish(
|
||||
ToLowerCase(&text);
|
||||
|
||||
std::vector<std::string> words = SplitUtf8(text);
|
||||
|
||||
if (debug_) {
|
||||
fprintf(stderr, "Input text (lowercase) in string: %s\n", text.c_str());
|
||||
fprintf(stderr, "Input text in bytes:");
|
||||
for (uint8_t c : text) {
|
||||
fprintf(stderr, " %02x", c);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "After splitting to words:");
|
||||
for (const auto &w : words) {
|
||||
fprintf(stderr, " %s", w.c_str());
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
int32_t blank = token2id_.at(" ");
|
||||
|
||||
std::vector<int64_t> ans;
|
||||
|
||||
Reference in New Issue
Block a user