175 lines
5.1 KiB
C++
175 lines
5.1 KiB
C++
#include <string>
|
|
#include <fstream>
|
|
#include "../include/WordUtils.h"
|
|
#include <iostream>
|
|
#include <algorithm>
|
|
#include <cctype>
|
|
#include <cstring>
|
|
#include <cstdint>
|
|
#include <random>
|
|
|
|
// Builds a first-order Markov model from a text file: maps each word to the
// words that immediately follow it, with a count per (word, successor) pair.
//
// @param path  filesystem path of the training text file
// @return map of word -> (following word -> occurrence count); empty when the
//         file yields fewer than two tokens
map<string, map<string, int>> WordUtils::load_data(const char *path) {
    // Delimiter set as a proper null-terminated string literal. The original
    // was a char array with no terminating '\0', which is unsafe to hand
    // around through a `const char*` parameter.
    const char del[] = " .,!?;:()\n\r\t";

    vector<string> data = split_string(load_file(path), del);

    // Lowercase every token exactly once. The original lowered data[i] and
    // data[i+1] inside the pair loop, re-transforming almost every word twice.
    for (string& word : data) {
        transform(word.begin(), word.end(), word.begin(),
                  [](unsigned char c) { return tolower(c); });
    }

    map<string, map<string, int>> word_frequencies;

    // Count each adjacent (word, next-word) pair.
    for (size_t i = 0; i + 1 < data.size(); i++) {
        word_frequencies[data[i]][data[i + 1]]++;
    }

    return word_frequencies;
}
|
|
|
|
// Returns up to `count` of the most frequent successors of `input`, ordered
// by descending frequency. If `input` was never seen in the training data,
// returns {input} so callers always receive a non-empty vector.
//
// @param input             word whose successors are requested
// @param word_frequencies  model produced by load_data()
// @param count             maximum number of predictions to return
// @return most common next words, best first
vector<string> WordUtils::predict_next_word(const string& input, const map<string, map<string, int>>& word_frequencies, size_t count) {
    auto it = word_frequencies.find(input);

    if (it == word_frequencies.end()) return {input};

    // Bind by reference — the original copied the entire inner map here.
    const map<string, int>& nextWords = it->second;

    vector<pair<string, int>> sortedWords(nextWords.begin(), nextWords.end());

    // Sort by frequency (descending). stable_sort keeps equally frequent
    // words in the map's alphabetical order, so results are deterministic
    // (plain sort leaves tie order unspecified).
    stable_sort(sortedWords.begin(), sortedWords.end(),
                [](const auto& a, const auto& b) { return a.second > b.second; });

    vector<string> results;
    results.reserve(min(count, sortedWords.size()));

    // Take up to `count` most common words.
    for (size_t i = 0; i < min(count, sortedWords.size()); ++i) {
        results.push_back(sortedWords[i].first);
    }

    return results;
}
|
|
|
|
// Generates HTML fragments whose <p> paragraphs are Markov-chain text seeded
// deterministically from the decimal digits of `hash` — the same hash always
// produces the same page.
//
// @param word_frequencies  model produced by load_data()
// @param hash              numeric seed; one paragraph is emitted per digit
// @return vector of fragments: the doctype/head/body opener followed by one
//         "<p>...</p>" string per digit. NOTE(review): no closing
//         "</body></html>" fragment is emitted here (the original declared
//         one but never pushed it) — presumably the caller appends it.
vector<string> WordUtils::create_tag(const map<string, map<string, int>>& word_frequencies, unsigned int hash) {
    const vector<string> beginning_sequences = {"the", "but", "with"};
    const string start = "<!DOCTYPE html>\n<html><head><title>Drip</title></head><body>\n";
    vector<string> tags;

    const string hashes = to_string(hash);

    tags.push_back(start);

    const int predict_num = 5;     // candidate successors requested per prediction
    const int words_per_tag = 25;  // words appended to each paragraph after the seed

    // One paragraph per decimal digit; each digit seeds its own PRNG.
    for (size_t i = 0; i < hashes.size(); i++) {
        vector<string> inner_tags;

        minstd_rand generator(static_cast<int>(hashes[i]));

        uniform_int_distribution<int> outer_distribution(0, static_cast<int>(beginning_sequences.size()) - 1);

        // Seed the paragraph from a randomly chosen beginning word.
        vector<string> temp_words = predict_next_word(beginning_sequences[outer_distribution(generator)], word_frequencies, predict_num);

        uniform_int_distribution<int> outer_2_distribution(0, static_cast<int>(temp_words.size()) - 1);

        inner_tags.push_back(temp_words[outer_2_distribution(generator)]);

        // Words per tag. Each iteration draws exactly one value from the
        // generator, matching the original's draw sequence (the original's
        // unused `inner_distribution` consumed nothing).
        for (int j = 0; j < words_per_tag; j++) {
            temp_words = predict_next_word(inner_tags[j], word_frequencies, predict_num);

            if (!temp_words.empty()) {
                // The original re-ran the identical prediction here and also
                // constructed a distribution with (0, size-1) BEFORE the empty
                // check — UB when size == 0. Construct only after the guard.
                uniform_int_distribution<int> pick(0, static_cast<int>(temp_words.size()) - 1);
                inner_tags.push_back(temp_words[pick(generator)]);
            } else {
                // Dead end: restart the chain from a random beginning word.
                temp_words = predict_next_word(beginning_sequences[outer_distribution(generator)], word_frequencies, predict_num);

                uniform_int_distribution<int> pick(0, static_cast<int>(temp_words.size()) - 1);
                inner_tags.push_back(temp_words[pick(generator)]);
            }
        }

        string temp_string = "<p>";
        for (size_t l = 0; l < inner_tags.size(); l++) {
            temp_string += inner_tags[l] + " ";
        }

        temp_string += ".</p>";

        tags.push_back(temp_string);
    }

    return tags;
}
|
|
|
|
// Tokenizes `input` on whitespace and common punctuation, skipping empty
// tokens (so runs of consecutive delimiters produce nothing).
//
// NOTE(review): the `delimiters` parameter is ignored in favor of the
// hard-coded set below; the set happens to match what load_data passes in.
// Confirm whether honoring the parameter is intended before changing this —
// at least one caller passes a char array that is not null-terminated.
//
// @param input       text to split
// @param delimiters  currently unused (see note above)
// @return the non-empty tokens, in order of appearance
vector<string> WordUtils::split_string(const string& input, const char *delimiters) {
    // Hard-coded separator set.
    const string separators = " .,!?;:()\n\r\t";

    vector<string> tokens;
    size_t pos = 0;

    while (pos < input.length()) {
        const size_t hit = input.find_first_of(separators, pos);

        if (hit == string::npos) {
            // No more delimiters — the rest of the input is the final token.
            tokens.push_back(input.substr(pos));
            break;
        }

        if (hit > pos) {
            tokens.push_back(input.substr(pos, hit - pos));
        }

        pos = hit + 1;
    }

    return tokens;
}
|
|
|
|
// Reads the entire file at `path` into a string.
//
// Bug fixed: getline strips the '\n', and the original concatenated lines
// with no separator — gluing the last word of each line to the first word of
// the next line. A newline is re-inserted after every line so line breaks
// still act as token boundaries for split_string().
//
// @param path  file to read; an unreadable/missing file yields ""
// @return file contents with lines separated by '\n'
string WordUtils::load_file(const char *path) {
    ifstream file(path);
    string data;
    string line;

    while (getline(file, line)) {
        data += line;
        data += '\n';  // restore the separator getline consumed
    }

    // No explicit close(): ifstream releases the handle via RAII.
    return data;
}
|
|
|
|
// Extracts the request path from the first line of an HTTP request,
// e.g. "GET /foo HTTP/1.1\r\n..." -> "/foo".
//
// @param input  raw HTTP request text
// @return the path component, or "" when the request line is malformed
//         (no newline, or fewer than two spaces on the first line)
string WordUtils::extract_url(const string& input) {
    // Use size_t throughout: the original stored string::find results in
    // `int`, relying on implementation-defined narrowing of string::npos
    // for the == npos comparisons to work.
    const size_t first_line_end = input.find('\n');
    if (first_line_end == string::npos) return "";

    const string first_line = input.substr(0, first_line_end);

    const size_t method_end = first_line.find(' ');
    if (method_end == string::npos) return "";

    const size_t path_end = first_line.find(' ', method_end + 1);
    if (path_end == string::npos) return "";

    return first_line.substr(method_end + 1, path_end - method_end - 1);
}
|
|
|
|
unsigned int WordUtils::hash_url(const string& input) {
|
|
unsigned int hash = 0;
|
|
for (char c : input) {
|
|
hash += static_cast<unsigned int>(c);
|
|
}
|
|
return hash;
|
|
} |