题意
You are given a string, s, and a list of words, words, that are all of the same length. Find all starting indices of substring(s) in s that is a concatenation of each word in words exactly once and without any intervening characters.
For example, given:
s: "barfoothefoobarman"
words: ["foo", "bar"]
You should return the indices: [0,9].
(order does not matter).
Subscribe to see which companies asked this question
大概来说,就是给定一个字符串和一个单词数组,找出字符串中所有满足条件的子串的起始下标:该子串必须由单词数组中的全部单词各使用一次、连续拼接而成(中间不能夹杂其他字符),单词出现的先后顺序不作要求;其中单词数组中每个单词的长度都是相同的,并且单词可以重复出现;
思路
其实最简单的思路就是对字符串进行逐位遍历:先找到第一个匹配的单词,而这又需要到单词数组中去遍历查找,也就是说其时间复杂度为 O(字符串的长度 × 单词数组中单词的个数)。这种方法虽然较为简单,但开销比较大,同时需要注意的细节也比较多。所以在参考了一些代码之后,我发现了一些更好的方法——包括双 map、使用队列、使用 Trie 树等等;
实现
我的实现(最简单容易理解)
/**
 * Brute-force search: tries every start offset in `s` and checks whether the
 * next words.size() fixed-width chunks use each entry of `words` exactly once.
 *
 * @param s     text to search
 * @param words words to concatenate; the chunk width is taken from words[0],
 *              so all words are expected to have that same length
 * @return starting indices, in ascending order, of every window of `s` that
 *         is a concatenation of all entries of `words` in any order; empty
 *         when `words` is empty, when words[0] is the empty string, or when
 *         `s` is shorter than one full concatenation
 */
std::vector<int> findSubstring1(std::string s, std::vector<std::string>& words) {
    std::vector<int> result;
    if (words.empty() || words[0].empty()) {
        return result;
    }

    const std::size_t word_len = words[0].length();
    const std::size_t total_len = word_len * words.size();
    if (s.length() < total_len) {
        return result;
    }

    // How many times each distinct word has to occur inside a valid window.
    std::map<std::string, std::size_t> required;
    for (const std::string& word : words) {
        ++required[word];
    }

    // Reused for every start offset; clear() avoids rebuilding the container.
    std::map<std::string, std::size_t> seen;
    for (std::size_t start = 0; start + total_len <= s.length(); ++start) {
        seen.clear();
        bool is_valid = true;
        for (std::size_t k = 0; k < words.size(); ++k) {
            const std::string chunk = s.substr(start + k * word_len, word_len);
            const auto needed = required.find(chunk);
            // Reject unknown chunks and chunks used more often than supplied.
            // The window holds exactly words.size() chunks, so "never too
            // many" already implies "each word exactly as often as required".
            if (needed == required.end() || ++seen[chunk] > needed->second) {
                is_valid = false;
                break;
            }
        }
        if (is_valid) {
            result.push_back(static_cast<int>(start));
        }
    }
    return result;
}
双map(最基础的优化)
/**
 * Sliding-window search backed by two hash maps.
 *
 * For each of the `wl` possible alignments (wl = length of words[0]) the text
 * is read in chunks of `wl` characters while a window [left, j] of chunks is
 * maintained. `dict` holds how often each word is required and `tdict` how
 * often it currently occurs inside the window.
 *
 * @param s     text to search
 * @param words words to concatenate; the chunk width is taken from words[0]
 * @return starting indices of every window of `s` that is a concatenation of
 *         all entries of `words` in any order. Indices are grouped by
 *         alignment (ascending within each alignment), so the list as a whole
 *         is not necessarily sorted. Empty when `s` or `words` is empty.
 */
std::vector<int> findSubstring2(std::string s, std::vector<std::string>& words) {
    std::vector<int> ans;
    const int n = static_cast<int>(s.size());
    const int cnt = static_cast<int>(words.size());
    if (n <= 0 || cnt <= 0) {
        return ans;
    }

    // Required multiplicity of every word.
    std::unordered_map<std::string, int> dict;
    for (const std::string& word : words) {
        ++dict[word];
    }

    const int wl = static_cast<int>(words[0].length());
    for (int i = 0; i < wl; ++i) {
        int left = i;   // start index of the current window
        int count = 0;  // chunks in the window that count towards a match
        std::unordered_map<std::string, int> tdict;

        for (int j = i; j <= n - wl; j += wl) {
            const std::string str = s.substr(j, wl);
            const auto wanted = dict.find(str);
            if (wanted == dict.end()) {
                // Not a word at all: no window may span this chunk, so the
                // next candidate window starts right after it.
                count = 0;
                tdict.clear();
                left = j + wl;
                continue;
            }

            if (++tdict[str] <= wanted->second) {
                ++count;
            } else {
                // `str` now occurs once too often: drop chunks from the left
                // until the surplus occurrence has left the window.
                while (tdict[str] > wanted->second) {
                    const std::string dropped = s.substr(left, wl);
                    // Every chunk in the window is a dictionary word, so at()
                    // cannot throw here and never inserts into `dict`.
                    if (--tdict[dropped] < dict.at(dropped)) {
                        --count;
                    }
                    left += wl;
                }
            }

            if (count == cnt) {
                ans.push_back(left);
                // Slide by one word so overlapping matches are found too.
                --tdict[s.substr(left, wl)];
                --count;
                left += wl;
            }
        }
    }
    return ans;
}
使用队列
/**
 * Queue-based search. This variant is more intricate than the two-map one:
 * every distinct word owns a queue holding the positions of its most recent
 * occurrences (all slots start at -1, one slot per required occurrence).
 * The front of a queue therefore tells whether the word is being seen for
 * the first time (-1) or where its oldest tracked occurrence was.
 */
typedef std::unordered_map<std::string, std::queue<int>> wordItr;

/**
 * @param s     text to search
 * @param words words to concatenate; the chunk width is taken from words[0]
 * @return starting indices of every window of `s` that is a concatenation of
 *         all entries of `words` in any order. Indices are grouped by
 *         alignment (ascending within each alignment). Empty when `words` or
 *         `s` is empty or `s` is shorter than a single word.
 */
std::vector<int> findSubstring3(std::string s, std::vector<std::string>& words) {
    std::vector<int> res;
    if (words.empty() || s.empty()) {
        return res;
    }

    const int wordlen = static_cast<int>(words[0].size());
    const int textlen = static_cast<int>(s.size());
    const int wordcnt = static_cast<int>(words.size());
    if (textlen < wordlen) {
        return res;
    }

    // One -1 slot per required occurrence of each word.
    wordItr pristine;
    for (const std::string& word : words) {
        pristine[word].push(-1);
    }

    for (int i = 0; i < wordlen; i++) {
        int curWordCnt = 0;            // length (in words) of the current valid run
        wordItr wordHash = pristine;   // fresh position queues for this alignment

        for (int j = i; j <= textlen - wordlen; j += wordlen) {
            const wordItr::iterator it = wordHash.find(s.substr(j, wordlen));
            if (it == wordHash.end()) {
                // Not a word: the run is broken.
                curWordCnt = 0;
                continue;
            }

            std::queue<int>& positions = it->second;
            const int lastPos = positions.front();
            if (lastPos == -1) {
                // A required occurrence of this word has not been used yet.
                curWordCnt++;
            } else if (curWordCnt * wordlen < j - lastPos) {
                // The oldest tracked occurrence lies before the current run,
                // so this one simply extends the run.
                curWordCnt++;
            } else {
                // The oldest tracked occurrence is inside the run: restart
                // the run just after it.
                curWordCnt = (j - lastPos) / wordlen;
            }

            // Replace the oldest tracked position with the current one.
            positions.pop();
            positions.push(j);

            if (curWordCnt == wordcnt) {
                res.push_back(j - wordlen * (wordcnt - 1));
            }
        }
    }
    return res;
}
Trie树
/**
 * Trie-based approach. It may be harder to come up with than the hash-based
 * ones above: instead of hashing whole words, the words are stored in a trie
 * and the text is matched against it character by character.
 */

/** A trie node with one child slot per letter 'a'..'z'. */
class TrieNode {
public:
    TrieNode* child[26];  // child[c - 'a'] is the node for letter c, or nullptr
    int cnt;              // incremented once for every inserted word ending here

    // child() value-initializes the array, i.e. every slot starts as nullptr.
    TrieNode() : child(), cnt(0) {}
};

/**
 * Trie over the letters 'a'..'z'. Owns all of its nodes and releases them on
 * destruction; copying is disabled because a copy would share the nodes.
 */
class Trie {
    TrieNode* root;

    // Frees `node` together with everything reachable from it.
    static void destroy(TrieNode* node) {
        if (!node) {
            return;
        }
        for (TrieNode* next : node->child) {
            destroy(next);
        }
        delete node;
    }

public:
    Trie() : root(new TrieNode()) {}
    ~Trie() { destroy(root); }
    Trie(const Trie&) = delete;
    Trie& operator=(const Trie&) = delete;

    /** @return the root node (never nullptr). */
    TrieNode* getRoot() { return root; }

    /** Inserts every entry of `words` via addWord(). */
    void buildTrie(std::vector<std::string> words) {
        for (std::string& word : words) {
            addWord(word);
        }
    }

    /**
     * Inserts `word` and bumps the counter of its final node, so inserting
     * the same word twice is recorded as two occurrences.
     *
     * A word containing any character outside 'a'..'z' is not inserted at
     * all: such a character has no slot in TrieNode::child.
     */
    void addWord(std::string& word) {
        // Validate first so a rejected word leaves no partial path behind.
        for (const char ch : word) {
            if (ch < 'a' || ch > 'z') {
                return;
            }
        }

        TrieNode* cur = root;
        for (const char ch : word) {
            TrieNode*& next = cur->child[ch - 'a'];
            if (!next) {
                next = new TrieNode();
            }
            cur = next;
        }
        cur->cnt++;
    }
};

// Trie consulted by isSubString1(); namespace-scope pointers start as nullptr.
Trie* trie;
/*** 利用递归将字符串中的所有单词用trie树进行查找,找不到则表明不符合* 我觉得除了递归以外,也可以通过两个遍历,最外层为遍历单词的个数,移动单词长度,* 最内层循环为对每一个单词的进行Trie树的匹配;** @param s <#s description#>* @param start <#start description#>* @param end <#end description#>** @return <#return value description#>*/
bool isSubString1(string& s, int start, int end) {TrieNode* node = trie->getRoot();int idx;for (int i = start; i < end; i++) {idx = s[i] - 'a';if (!node->child[idx]) {return false;}node = node->child[idx];// 表明已经达到单词的末尾if (node->cnt > 0) {node->cnt--; //标记为已经使用if (i + 1 == end || isSubString1(s, i+1, end)) {node->cnt++; //标记为未使用return true;}node->cnt++; //标记为未使用}}return false;
}/*** 这个方法比较巧妙,利用trie树去匹配字符串中的所有单词** @param s <#s description#>* @param words <#words description#>** @return <#return value description#>*/
vector<int> findSubstring4(string s, vector<string>& words) {trie = new Trie();trie->buildTrie(words);int length = (int)words[0].size() * words.size();vector<int> result;for (int i = 0; i < s.length() - length; i++) {if (isSubString1(s, i, i+length)) {result.push_back(i);}}return result;
}
总结
我觉得无论是什么方法,都逃不掉对字符串的遍历,对单词的匹配,就是看这个过程可以进行多大的优化。