The bark_tokenize_input function
- Bark has no explicit language-selection option, yet the official bark.cpp build cannot handle Chinese input.
- bark_tokenize_input calls the bert_tokenize function; it is this tokenization step that fails on Chinese text, and that is the reason Chinese is not supported.
void bark_tokenize_input(struct bark_context * ctx, const char * text) {
    auto & model = ctx->model.text_model;
    bark_vocab * vocab = &ctx->model.vocab;

    int32_t block_size = model.hparams.block_size;
    int32_t max_ctx_size = std::min(block_size, 256);
    int32_t n_tokens;

    // tokenize the prompt into at most 256 text tokens
    bark_sequence tokens(max_ctx_size);
    bert_tokenize(vocab, text, tokens.data(), &n_tokens, max_ctx_size);

    // shift the text tokens into the text-encoding id range
    for (int i = 0; i < (int) tokens.size(); i++)
        tokens[i] += TEXT_ENCODING_OFFSET;

    // pad the text part up to 256 tokens, or warn if it was truncated
    if (n_tokens < max_ctx_size) {
        for (int i = n_tokens; i < max_ctx_size; i++)
            tokens[i] = TEXT_PAD_TOKEN;
    } else if (n_tokens > max_ctx_size) {
        fprintf(stderr, "%s: input sequence is too long (%d > 256), truncating sequence", __func__, n_tokens);
    }

    tokens.resize(max_ctx_size);

    // semantic history: 256 pad tokens followed by the final "infer" token
    for (int i = 0; i < 256; i++)
        tokens.push_back(SEMANTIC_PAD_TOKEN);
    tokens.push_back(SEMANTIC_INFER_TOKEN);

    assert(tokens.size() == 256 + 256 + 1);

    ctx->tokens = tokens;

    printf("%s: prompt: '%s'\n", __func__, text);
    printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, ctx->tokens.size());
    for (int i = 0; i < std::min(8, (int) ctx->tokens.size()); i++) {
        printf("%d ", ctx->tokens[i]);
    }
    printf("\n");
}
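- To make the resulting token layout concrete, here is a minimal self-contained sketch that mimics the padding logic above. The constant values are placeholders (the real ones are defined in bark.cpp), and the three initial "text tokens" are made up:

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

// placeholder constants: the real values live in bark.cpp
constexpr int32_t TEXT_ENCODING_OFFSET = 10000;
constexpr int32_t TEXT_PAD_TOKEN       = 101;
constexpr int32_t SEMANTIC_PAD_TOKEN   = 102;
constexpr int32_t SEMANTIC_INFER_TOKEN = 103;

int main() {
    // pretend bert_tokenize produced three text tokens
    std::vector<int32_t> tokens = {7, 8, 9};
    const int32_t max_ctx_size = 256;

    // shift the text tokens into the text-encoding id range
    for (auto & t : tokens) t += TEXT_ENCODING_OFFSET;

    // pad the text part up to 256 entries
    tokens.resize(max_ctx_size, TEXT_PAD_TOKEN);

    // append 256 semantic-history pads plus the final "infer" token
    tokens.insert(tokens.end(), 256, SEMANTIC_PAD_TOKEN);
    tokens.push_back(SEMANTIC_INFER_TOKEN);

    // resulting layout: [256 text tokens][256 semantic pads][1 infer token] = 513 entries
    assert(tokens.size() == 256 + 256 + 1);
    std::printf("total tokens: %zu, first id: %d\n", tokens.size(), tokens[0]);
    return 0;
}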
The vocabulary object
- The vocabulary object is built from vocab.txt.
After the previous step completes, WordPiece processing is applied as well (a toy walk-through follows the code below).
// apply wordpiece
for (const auto &word : words) {
    // skip empty words
    if (word.size() == 0)
        continue;

    std::string prefix = "";   // start with an empty prefix
    int i = 0;                 // position inside the current word
    int n = word.size();       // word length

loop:
    while (i < n) {
        // stop if the token buffer is full
        if (t >= n_max_tokens - 1)
            break;

        int j = n;
        while (j > i) {
            // greedily try the longest substring word[i, j) with the current prefix
            auto it = token_map->find(prefix + word.substr(i, j - i));
            if (it != token_map->end()) {
                // found a vocabulary entry: emit its id and continue after it
                tokens[t++] = it->second;
                i = j;
                prefix = "##";   // subsequent pieces are continuation pieces
                goto loop;
            }
            --j;   // shrink the candidate substring
        }

        // j == i means no substring starting at i is in the vocabulary
        if (j == i) {
            fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
            prefix = "##";
            ++i;   // skip one character and keep going
        }
    }
}
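- For intuition, here is a standalone toy version of the same greedy longest-match WordPiece loop. The three-entry vocabulary and the word "playing" are invented for illustration; the ids are arbitrary:

#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
    // toy vocabulary standing in for vocab.txt
    std::map<std::string, int> token_map = {
        {"play", 1}, {"##ing", 2}, {"##er", 3},
    };

    std::string word = "playing";
    std::vector<int> pieces;

    std::string prefix = "";
    size_t i = 0, n = word.size();
    while (i < n) {
        size_t j = n;
        bool found = false;
        // try the longest remaining substring first, then shrink it
        while (j > i) {
            auto it = token_map.find(prefix + word.substr(i, j - i));
            if (it != token_map.end()) {
                pieces.push_back(it->second);
                i = j;
                prefix = "##";   // later pieces are continuations
                found = true;
                break;
            }
            --j;
        }
        if (!found) {
            // unknown character: report it and skip one position, like bert_tokenize does
            fprintf(stderr, "unknown token '%s'\n", word.substr(i, 1).c_str());
            prefix = "##";
            ++i;
        }
    }

    // prints: playing -> 1 2   ("play" + "##ing")
    printf("%s ->", word.c_str());
    for (int id : pieces) printf(" %d", id);
    printf("\n");
    return 0;
}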
The bert_tokenize function
- bert_tokenize tokenizes the input sentence into the caller's token buffer, which bark_tokenize_input then stores in ctx->tokens.
- The implementation is as follows:
void bert_tokenize(
        const bark_vocab * vocab,
        const char * text,
        int32_t * tokens,
        int32_t * n_tokens,
        int32_t n_max_tokens) {
    std::string str = text;
    std::vector<std::string> words;

    int32_t t = 0;

    auto * token_map = &vocab->token_to_id;

    // split the text into words
    {
        str = strip_accents(text);

        std::string pat = R"([[:punct:]]|[[:alpha:]]+|[[:digit:]]+)";

        std::regex re(pat);
        std::smatch m;

        while (std::regex_search(str, m, re)) {
            for (std::string x : m)
                words.push_back(x);
            str = m.suffix();
        }
    }

    // apply wordpiece
    for (const auto &word : words) {
        if (word.size() == 0)
            continue;

        std::string prefix = "";
        int i = 0;
        int n = word.size();

    loop:
        while (i < n) {
            if (t >= n_max_tokens - 1)
                break;

            int j = n;
            while (j > i) {
                auto it = token_map->find(prefix + word.substr(i, j - i));
                if (it != token_map->end()) {
                    tokens[t++] = it->second;
                    i = j;
                    prefix = "##";
                    goto loop;
                }
                --j;
            }

            if (j == i) {
                fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
                prefix = "##";
                ++i;
            }
        }
    }

    *n_tokens = t;
}
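- To see what the splitting step hands to the WordPiece loop in the normal English case, the regex can be exercised on its own. A minimal standalone sketch (the prompt is the example string from examples/main.cpp; it prints this / is / an / audio):

#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main() {
    std::string str = "this is an audio";
    std::vector<std::string> words;

    std::regex re(R"([[:punct:]]|[[:alpha:]]+|[[:digit:]]+)");
    std::smatch m;

    // same splitting loop as in bert_tokenize
    while (std::regex_search(str, m, re)) {
        for (std::string x : m)
            words.push_back(x);
        str = m.suffix();
    }

    for (const auto & w : words)
        std::cout << w << std::endl;   // this, is, an, audio
    return 0;
}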
Splitting the text into words
- The word-splitting step uses the regular expression below. It cannot segment Chinese sentences, which is why inference does not work correctly for Chinese input (a short standalone demo follows the code).
// split the text into words
{
    // strip accents / diacritics from the text
    str = strip_accents(text);

    // regex pattern matching punctuation, runs of letters, and runs of digits
    std::string pat = R"([[:punct:]]|[[:alpha:]]+|[[:digit:]]+)";

    std::regex re(pat);
    std::smatch m;

    // repeatedly match the pattern against the remaining text
    while (std::regex_search(str, m, re)) {
        // append each match to the word list
        for (std::string x : m)
            words.push_back(x);
        // continue with the unmatched remainder
        str = m.suffix();
    }
}
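- The failure is easy to reproduce outside bark.cpp by feeding a Chinese sentence to the same pattern. The sentence below is just an illustrative example; on a typical default-locale build the loop extracts zero words, although the exact behavior of std::regex on UTF-8 bytes is locale and implementation dependent:

#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main() {
    std::string str = "你好世界";   // UTF-8 encoded Chinese sentence
    std::vector<std::string> words;

    std::regex re(R"([[:punct:]]|[[:alpha:]]+|[[:digit:]]+)");
    std::smatch m;

    // same loop as bert_tokenize's splitting step
    while (std::regex_search(str, m, re)) {
        for (std::string x : m)
            words.push_back(x);
        str = m.suffix();
    }

    // typically prints 0: the UTF-8 bytes of CJK characters do not fall into
    // [[:punct:]], [[:alpha:]] or [[:digit:]] under the default locale,
    // so nothing ever reaches the wordpiece step
    std::cout << "words extracted: " << words.size() << std::endl;
    return 0;
}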
A simple modification and test run
// examples/main.cpp
struct bark_params {
    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());

    // user prompt
    std::string prompt = "你 好";  // was: "this is an audio"
    //std::string prompt = "this is an audio";

    // paths
    std::string model_path = "./bark.cpp/ggml_weights";
    std::string dest_wav_path = "output.wav";

    int32_t seed = 0;
};
// bark.cpp
// split the text into words
{
    str = strip_accents(text);

    // original regex-based splitting, disabled:
    // std::string pat = R"([[:punct:]]|[[:alpha:]]+|[[:digit:]]+)";
    //
    // std::regex re(pat);
    // std::smatch m;
    //
    // while (std::regex_search(str, m, re)) {
    //     for (std::string x : m)
    //         words.push_back(x);
    //     str = m.suffix();
    // }

    // split the string on whitespace instead (needs <sstream> and <iostream>)
    std::istringstream iss(str);

    // read each whitespace-separated piece into `words`
    // (reuse the `words` vector already declared at the top of bert_tokenize;
    //  redeclaring it here would shadow the outer one and leave it empty)
    std::string word;
    while (iss >> word) {
        words.push_back(word);
    }

    // print the split result for debugging
    std::cout << "split result:" << std::endl;
    for (const auto & w : words) {
        std::cout << w << std::endl;
    }
}
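- Note that with this change the prompt must arrive pre-segmented: the code now only splits on whitespace and performs no real Chinese word segmentation, which is why the prompt in bark_params is written as "你 好" rather than "你好". A standalone sketch of what the modified splitter produces for that prompt:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main() {
    // the prompt from bark_params: characters separated by spaces by hand
    std::string str = "你 好";

    std::vector<std::string> words;
    std::istringstream iss(str);
    std::string word;
    while (iss >> word)          // stops cleanly at the end of the input
        words.push_back(word);

    // prints the two pieces "你" and "好"; each is then looked up
    // by the wordpiece loop against the entries loaded from vocab.txt
    for (const auto & w : words)
        std::cout << w << std::endl;
    return 0;
}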