#include #include #include "../include/WordUtils.h" #include #include #include #include #include #include map> WordUtils::load_data(const char *path) { const char del[] = {' ', '.', ',', '!', '?', ';', ':', '(', ')', '\n', '\r', '\t'}; vector data = split_string(load_file(path), del); map> word_frequencies = {}; for (long unsigned int i = 0; i + 1 < data.size(); i++) { transform(data[i].begin(), data[i].end(), data[i].begin(), [](unsigned char c){ return tolower(c); }); transform(data[i+1].begin(), data[i+1].end(), data[i+1].begin(), [](unsigned char c){ return tolower(c); }); word_frequencies[data[i]][data[i+1]]++; } return word_frequencies; } vector WordUtils::predict_next_word(const string& input, const map>& word_frequencies, size_t count) { auto it = word_frequencies.find(input); if (it == word_frequencies.end()) return {input}; const map nextWords = it->second; vector> sortedWords(nextWords.begin(), nextWords.end()); // Sort by frequency (descending) sort(sortedWords.begin(), sortedWords.end(),[](const auto& a, const auto& b) { return a.second > b.second; }); vector results; // Take up to `count` most common words for (size_t i = 0; i < min(count, sortedWords.size()); ++i) { results.push_back(sortedWords[i].first); } return results; } vector WordUtils::create_tag(const map>& word_frequencies, unsigned int hash) { const vector beginning_sequences = {"the", "but", "with"}; const string start = "\nDrip\n"; const string end = "\n"; vector tags; const string hashes = to_string(hash); tags.push_back(start); int predict_num = 5; // Tags for (int i = 0; i < hashes.size(); i++) { vector inner_tags; minstd_rand generator((int)hashes[i]); uniform_int_distribution outer_distribution(0, beginning_sequences.size() - 1); vector temp_words = predict_next_word(beginning_sequences[outer_distribution(generator)], word_frequencies, predict_num); uniform_int_distribution outer_2_distribution(0, temp_words.size() - 1); inner_tags.push_back(temp_words[outer_2_distribution(generator)]); // Words per tag for (int j = 0; j < 25; j++) { temp_words = predict_next_word(inner_tags[j], word_frequencies, predict_num); uniform_int_distribution inner_distribution(0, temp_words.size() - 1); if (temp_words.size() != 0) { temp_words = predict_next_word(inner_tags[j], word_frequencies, predict_num); uniform_int_distribution inner_2_distribution(0, temp_words.size() - 1); inner_tags.push_back(temp_words[inner_2_distribution(generator)]); } else { temp_words = predict_next_word(beginning_sequences[outer_distribution(generator)], word_frequencies, predict_num); uniform_int_distribution inner_3_distribution(0, temp_words.size() - 1); inner_tags.push_back(temp_words[inner_3_distribution(generator)]); } } string temp_string = "

"; for (int l = 0; l < inner_tags.size(); l++) { temp_string += inner_tags[l] + " "; } temp_string += ".

"; tags.push_back(temp_string); } return tags; } vector WordUtils::split_string(const string& input, const char *delimiters) { vector data; size_t start = 0; size_t end = 0; // Create a string from the delimiters array string delimiter_string = " .,!?;:()\n\r\t"; while ((end = input.find_first_of(delimiter_string, start)) != string::npos) { if (end > start) { data.push_back(input.substr(start, end - start)); } start = end + 1; } // Add the last token, if any if (start < input.length()) { data.push_back(input.substr(start)); } return data; } string WordUtils::load_file(const char *path) { ifstream file(path); string temp; string data; while (getline(file, temp)) { data += temp; } file.close(); return data; } string WordUtils::extract_url(const string& input) { int first_line_end = input.find("\n"); if (first_line_end == string::npos) return ""; string first_line = input.substr(0, first_line_end); int method_end = first_line.find(' '); if (method_end == string::npos) return ""; int path_end = first_line.find(' ', method_end + 1); if (path_end == string::npos) return ""; return first_line.substr(method_end + 1, path_end - method_end - 1); } unsigned int WordUtils::hash_url(const string& input) { unsigned int hash = 0; for (char c : input) { hash += static_cast(c); } return hash; }