30 Substring with Concatenation of All Words

You are given a string, s, and a list of words, words, that are all of the same length. Find all starting indices of substring(s) in s that is a concatenation of each word in words exactly once and without any intervening characters.

For example, given:

s: "barfoothefoobarman"
words: ["foo", "bar"]

You should return the indices: [0,9]. (order does not matter).

The Idea: The following is a brute force procedure. First I conclude the size of the complete substring I should expect. Call is cmpl_s. Since all the words are the same length, and I know the amount - I can determine this. Then iterating through s, I take each cmpl_s and split it into equal size chunks. Finally, I confirm that all of these words exists within the word set uniquely. If so, the index i within the first level for loop gets recorded into the returning vector.

Complexity: O(N^2) time, O(N) space

unordered_map<string, int> getWordFreq(const vector<string> &words) {
    unordered_map<string, int> word_freq;
    word_freq.reserve(words.size());

    for (const string &s : words) {
        if (word_freq.find(s) != word_freq.end())
            word_freq[s]++;
        else word_freq.insert({ s, 1 });
    }

    return word_freq;
}

vector<int> findSubstring(string s, vector<string>& words) {
    vector<int> substringSeq;
    if (words.empty()) return substringSeq;

    const size_t word_length = words.at(0).length();
    unordered_map<string, int> word_freq = getWordFreq(words);
    unordered_map<string, int> word_freq_cp = word_freq;

    const int total_word_dist = words.size() * word_length;
    const int end = s.length() - total_word_dist + 1;

    for (int i = 0; i < end; i++) {
        string cmplt_words = s.substr(i, total_word_dist);

        for (int j = 0; j < total_word_dist; j += word_length) {
            string cur_word = cmplt_words.substr(j, word_length);
            if (word_freq_cp.find(cur_word) == word_freq_cp.end())
                break;
            else {
                word_freq_cp[cur_word]--;
                if (word_freq_cp[cur_word] == 0)
                    word_freq_cp.erase(cur_word);
            }
        }

        if (word_freq_cp.empty())
            substringSeq.push_back(i);

        word_freq_cp = word_freq;
    }

    return substringSeq;
}

Testing

int main()
{
    //           .        .
    string s1 = "barfoothefoobarman";
    vector<string> words1 = {"foo", "bar" };
    print(findSubstring(s1, words1));

    //           .     .
    string s2 = "barfoobarfoo";
    vector<string> words2 = { "foo", "bar" };
    print(findSubstring(s2, words2));

    //           0123456789012345678901234567890123456789
    //                     .        .     .       .
    string s3 = "fooybarmanfoobarmanfoobarbarfoooofoobarr";
    vector<string> words3 = { "foo", "bar" };
    print(findSubstring(s3, words3));

    //           012345678901234567890123
    //                 .  .  .
    string s4 = "barfoofoobarthefoobarman";
    vector<string> words4 = { "foo", "bar", "the" };
    print(findSubstring(s4, words4));

    //           012345678901234567890123
    //                   .           
    string s5 = "wordgoodgoodgoodbestword";
    vector<string> words5 = { "word","good","best","good" };
    print(findSubstring(s5, words5));
}

Last updated