WordMap 类从分词库文件中读入分词，
并将分词存入 std::unordered_map<std::string, int> 中。
#pragma once
#include<istream>
#include<unordered_map>
#include<string>
#include<ctime>

/// Loads a word-segmentation dictionary from a text file into a hash map.
///
/// Construct with the dictionary path, then call init(). The parsed entries
/// are exposed through the public m_map member.
class WordMap
{
public:
    /// Remembers the dictionary path; the file is not opened until init().
    WordMap(const std::string& filename);
    ~WordMap();

    /// Reads the file named by m_filename line by line and fills m_map.
    /// @return true if the file could be opened, false otherwise.
    bool init();

    /// Maps each word to the integer found in the third column of its line.
    std::unordered_map<std::string, int> m_map;
    /// Path of the dictionary file, as passed to the constructor.
    std::string m_filename;

private:
    /// Number of seconds that must elapse between two progress messages
    /// printed by init().
    /// NOTE: inside this class the name hides ::difftime from <ctime>.
    time_t difftime;

    /// Formats the hour, minute and second fields of *nowtime as "h:m:s".
    std::string timestr(tm* nowtime);
};
#include"wordmap.h"
#include<fstream>
#include<iostream>
#include<sstream>
#include<ctime>

namespace
{
    // Converts a timestamp to broken-down local time.
    // MSVC's localtime_s and POSIX's localtime_r do the same job but take
    // their arguments in opposite order, so the call is selected per platform.
    tm toLocalTime(time_t stamp)
    {
        tm parts{};
#if defined(_WIN32)
        localtime_s(&parts, &stamp);
#else
        localtime_r(&stamp, &parts);
#endif
        return parts;
    }
}

/// Stores the dictionary path and sets the progress-report interval to
/// 5 seconds. The file itself is opened by init().
WordMap::WordMap(const std::string& filename)
    : m_filename(filename), difftime(5)
{
}

WordMap::~WordMap()
{
}

/// Loads the dictionary file into m_map.
///
/// Each line is expected to hold three whitespace-separated fields:
/// an integer, the word, and a second integer. The word becomes the key and
/// the second integer the value. Lines whose first two fields cannot be
/// parsed (for example blank lines) are skipped; a missing third field is
/// stored as 0.
///
/// Start and end times are written to std::cout, plus a progress line
/// whenever more than `difftime` seconds have passed since the previous one.
///
/// @return false (after reporting on std::cerr) if the file cannot be
///         opened, true otherwise.
bool WordMap::init()
{
    std::ifstream input(m_filename);
    if (!input.is_open())
    {
        std::cerr << "can't open file:" << m_filename << std::endl;
        return false;
    }

    time_t last;
    time(&last);
    tm nowtime = toLocalTime(last);
    std::cout << "开始初始化分词库,当前时间" << timestr(&nowtime) << std::endl;

    std::string line;
    while (std::getline(input, line))
    {
        std::istringstream fields(line);
        int num = 0;
        int num2 = 0;
        std::string word;
        // Without this check a failed extraction would leave `word` empty and
        // the value unset, inserting a bogus "" entry into the map.
        if (!(fields >> num >> word))
        {
            continue;
        }
        fields >> num2;
        m_map[word] = num2;

        time_t cur;
        time(&cur);
        if (cur - last > difftime)
        {
            std::cout << "已初始化分词个数:" << m_map.size() << std::endl;
            last = cur;
        }
    }

    time_t finished;
    time(&finished);
    nowtime = toLocalTime(finished);
    std::cout << "结束初始化分词库,当前时间" << timestr(&nowtime) << std::endl;
    return true;
}

/// Formats the time-of-day fields of *nowtime as "hour:minute:second"
/// (values are not zero-padded).
std::string WordMap::timestr(tm* nowtime)
{
    std::ostringstream out;
    out << nowtime->tm_hour << ":" << nowtime->tm_min << ":" << nowtime->tm_sec;
    // str() already yields a temporary; wrapping it in std::move gains nothing.
    return out.str();
}
FindWord 类从文本文件中读入内容，并对文本进行分词。分词方法详见：
http://yangshangchuan.iteye.com/blog/2031813
以下是实现：
#pragma once #include<string> using std::string; #include<vector> using std::vector; #include"wordmap.h" class FindWord { public:FindWord() {};~FindWord() {};vector<string> GetKeyWords(const string& filename,const WordMap& wordmap); private:int wsize = 5;bool ischinese(const char* c); public:int getlocalfindstring(const string& ostring, int begpos); };
@ -0,0 +1,71 @@ #include "findword.h" #include<fstream> #include<sstream> #include<iostream> using std::ifstream; using std::istringstream; vector<string> FindWord::GetKeyWords(const string & filename, const WordMap& wordmap) {vector<string> l_keyword;ifstream inputfile(filename);if (!inputfile.is_open()){std::cerr << "cann't not open file:" << filename;return l_keyword;}string sinput;string last;while (std::getline(inputfile, sinput)){last = sinput;int begpos = 0;int length;while ((length = getlocalfindstring(last, begpos)) != 0){int movelen = ischinese(&last[begpos]) ? 2:1;int findlen = -1;while (movelen<=length){string ls = last.substr(begpos, movelen);auto res = wordmap.m_map.find(ls);if (res != wordmap.m_map.end()){findlen = movelen;}movelen += ischinese(&last[begpos + movelen]) ? 2 : 1;}if (findlen != -1){l_keyword.push_back(last.substr(begpos, findlen));begpos = begpos + findlen;}else {begpos += length;}}}return l_keyword; }bool FindWord::ischinese(const char* c) {unsigned char cur = *c;unsigned char next = *(c + 1);if (next == 0)return false;return (cur >= 0xB0 && cur <= 0xF7) && (next >= 0xA1 && next <= 0xFE); }int FindWord::getlocalfindstring(const string& ostring,int begpos) {int size = wsize;int endpos = begpos;while (size > 0 && ostring[endpos]){if (ischinese(&ostring[endpos])) {endpos++;}size--;endpos++;}return endpos-begpos; }
样例程序
@ -0,0 +1,16 @@ #include"wordmap.h" #include<iostream> #include<string> #include"findword.h" using std::string; int main() {WordMap m_wordmap("../../../word/word1.txt");FindWord m_findword;if (!m_wordmap.init()) { return 0; };vector<string> res= m_findword.GetKeyWords("../../../inputfile/1999.txt", m_wordmap);for (auto elems : res)std::cout << elems << " ";return 0; }
github:https://github.com/wuzhuorui/kjct.git