1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
| Map<string, double> kGramsIn(const string& str, int kGramLength) { Map<string, double> res; if (kGramLength <= 0) { error("kGramLength should be greater than zero."); } else { for (int i = 0; i + kGramLength - 1 < str.length(); ++i) { string tmp = ""; for (int j = 0; j < kGramLength; ++j) { tmp += str[i + j]; } res[tmp] += 1; } return res; } }
/**
 * Scales a frequency map so that the sum of the squares of its values is 1.
 *
 * @param input Map of keys to numeric weights.
 * @return A new map with the same keys, each value divided by the square root
 *         of the sum of squares of all input values.
 *
 * Calls error() when the sum of squares is exactly zero (an empty map or a
 * map whose values are all zero), since such a map cannot be scaled.
 */
Map<string, double> normalize(const Map<string, double>& input) {
    double squareSum = 0.0;
    for (const auto& key : input) {
        const double value = input[key];
        squareSum += value * value;
    }

    // Exact comparison is intentional: only a genuinely zero vector is rejected.
    if (squareSum == 0.0) {
        error("The input map is meaningless.");
    }

    // The magnitude does not change inside the loop, so compute it once.
    const double magnitude = sqrt(squareSum);

    Map<string, double> result;
    for (const auto& key : input) {
        result[key] = input[key] / magnitude;
    }
    return result;
}
/**
 * Returns the entries of source that remain after the lowest-ranked ones are
 * dropped, keeping at most numToKeep entries.
 *
 * @param source    Map of k-grams to their weights.
 * @param numToKeep Maximum number of entries to retain; must not be negative.
 * @return A map holding the retained k-grams with their original weights. If
 *         numToKeep is at least source.size(), every entry is retained.
 *
 * Calls error() when numToKeep is negative.
 *
 * NOTE(review): this relies on PriorityQueue::dequeue() handing back entries
 * in ascending priority order, so the first entries dequeued are the ones with
 * the smallest weights -- confirm against the PriorityQueue in use.
 */
Map<string, double> topKGramsIn(const Map<string, double>& source, int numToKeep) {
    if (numToKeep < 0) {
        error("numToKeep should be positive.");
    }

    // Load every k-gram into the queue, prioritised by its weight.
    PriorityQueue<string> byWeight;
    for (const auto& gram : source) {
        byWeight.enqueue(gram, source[gram]);
    }

    // Phase 1: throw away the surplus entries that come out of the queue first.
    for (int surplus = source.size() - numToKeep;
         surplus > 0 && byWeight.size() > 0;
         --surplus) {
        byWeight.dequeue();
    }

    // Phase 2: everything still queued is kept with its original weight.
    Map<string, double> kept;
    while (byWeight.size() > 0) {
        const string gram = byWeight.dequeue();
        kept[gram] = source[gram];
    }
    return kept;
}
/**
 * Computes the dot product of two profiles over the keys they share.
 *
 * @param lhs First profile (key -> weight).
 * @param rhs Second profile (key -> weight).
 * @return Sum of lhs[key] * rhs[key] for every key present in both maps;
 *         0.0 when the maps have no key in common.
 *
 * This function does not normalize its arguments, so the result is a true
 * cosine similarity only if both profiles were already scaled to unit length
 * by the caller.
 */
double cosineSimilarityOf(const Map<string, double>& lhs, const Map<string, double>& rhs) {
    // Accumulate directly while scanning lhs; no intermediate set of shared
    // keys is needed. NOTE(review): assumes Map iterates keys in the same
    // (sorted) order a Set of those keys would, so the summation order is
    // unchanged -- confirm for the Map implementation in use.
    double dotProduct = 0.0;
    for (const auto& key : lhs) {
        if (rhs.containsKey(key)) {
            dotProduct += lhs[key] * rhs[key];
        }
    }
    return dotProduct;
}
/**
 * Picks the corpus whose profile is most similar to textProfile.
 *
 * @param textProfile Profile of the text being classified.
 * @param corpora     Candidate corpora; each supplies a `name` and a `profile`.
 * @return The name of the corpus with the highest cosineSimilarityOf() score
 *         against textProfile. On ties, the corpus encountered first while
 *         iterating corpora wins.
 *
 * Calls error() when corpora is empty.
 */
string guessLanguageOf(const Map<string, double>& textProfile, const Set<Corpus>& corpora) {
    if (corpora.size() == 0) {
        error("We need more corpus.");
    }

    string bestName;
    double bestSimilarity = 0.0;
    // Tracks whether any corpus has been accepted yet. Without it, a text whose
    // similarity to every corpus is zero (or negative) would never beat the
    // initial 0.0 and the function would return an empty name.
    bool haveCandidate = false;

    // Iterate by reference: copying a Corpus would copy its whole profile map.
    for (const auto& corpus : corpora) {
        const double similarity = cosineSimilarityOf(textProfile, corpus.profile);
        if (!haveCandidate || bestSimilarity < similarity) {
            bestName = corpus.name;
            bestSimilarity = similarity;
            haveCandidate = true;
        }
    }
    return bestName;
}
|