Word Distance
17.11 Word Distance: You have a large text file containing words. Given any two words, find the shortest distance (in terms of number of words) between them in the file. If the operation wi" be repeated many t imes for the same file (but different pairs of words), can you optimize your solution?
Complexity: O(n^3) preprocessing - for every combinational relationship between words i and j, find the minimum distance within their occurances, O(1) postprocessing - through matrix lookup. O(n^2) space from matrix
The Idea: First read in the file contents and build a dictionary of each word and where they occur. Build a matrix that provides lookup by id strings (aka 2d hash table), then for every combinational relationship between words compute the minimum distance. Computing the minimum distance requires a linear traversal between both lists within the assumption that the occurances for each word are sorted. Simply increment the list that will ensure a closer absolute difference. Then flip the matrix to provide the user-friendly lookup (word1,word2) == (word2,word1).
class Word_Distance
{
public:
Word_Distance(const string file_path) {
read_file(file_path);
init_matrix();
find_min_distances();
}
int find_min_distance(const string &word1, const string &word2)
{
if (word_matrix.find(word1) == word_matrix.end()
&& word_matrix.find(word2) == word_matrix.end()) {
cout << "word(s) not found" << endl;
return -1;
}
return word_matrix[word1][word2];
}
private:
unordered_map<string, vector<int>> word_occ;
unordered_map<string, unordered_map<string, int>> word_matrix;
const void read_file(const string file_path) {
ifstream file;
file.open(file_path);
if (file.is_open()) {
string word;
int index = 0;
while (file >> word) {
if (word_occ.find(word) == word_occ.end()) {
word_occ.insert({ word, vector<int>() });
}
word_occ[word].push_back(index);
index++;
}
}
}
void init_matrix() {
for (auto &m : word_occ) {
word_matrix.insert({ m.first, unordered_map <string, int>() });
}
for (auto &w : word_matrix) {
for (auto &w2 : word_matrix) {
word_matrix[w.first].insert({ w2.first, 0 });
}
}
}
void find_min_distances() {
vector<string> comb_pair;
comb_pair.reserve(word_occ.size());
for (auto &w : word_occ) {
comb_pair.push_back(w.first);
}
// do as little computations as possible
for (int i = 0; i < comb_pair.size(); i++) {
for (int j = i + 1; j < comb_pair.size(); j++) {
int sol = _find_min_distance(comb_pair[i], comb_pair[j]);
word_matrix[comb_pair[i]][comb_pair[j]] = sol;
word_matrix[comb_pair[j]][comb_pair[i]] = sol;
}
}
}
int _find_min_distance(const string &word1, const string &word2) {
// l1 and l2 are assumed sorted
vector<int> l1 = word_occ[word1];
vector<int> l2 = word_occ[word2];
int min_pair_dist = INT_MAX;
int iter1 = 0, iter2 = 0;
while (iter1 < l1.size() || iter2 < l2.size()) {
int cur_diff = abs(l1[iter1] - l2[iter2]);
min_pair_dist = min(min_pair_dist, cur_diff);
if (l1[iter1] < l2[iter2] && iter1 < l1.size() - 1)
iter1++;
else if (l1[iter1] >= l2[iter2] && iter2 < l2.size() - 1)
iter2++;
else break; // guarenteed to maintain minimum value
}
return min_pair_dist;
}
};
int main()
{
const string filepath = "../infile/rand.txt";
Word_Distance wd(filepath);
assert(wd.find_min_distance("the", "the") == 0);
assert(wd.find_min_distance("ten", "years") == 1);
assert(wd.find_min_distance("sisters", "enhanced") == 2);
assert(wd.find_min_distance("sisters", "each") == 3);
assert(wd.find_min_distance("race", "established") == 5);
assert(wd.find_min_distance("established", "race") == 5);
}Infile
Last updated
Was this helpful?