AI-Tarpit-Reimagined/src/WordUtils.cpp

175 lines
5.1 KiB
C++

#include <string>
#include <fstream>
#include "../include/WordUtils.h"
#include <iostream>
#include <algorithm>
#include <cctype>
#include <cstring>
#include <cstdint>
#include <random>
// Builds a bigram frequency table from the text file at `path`.
//
// The file contents are obtained from load_file(), tokenised by split_string()
// and lower-cased. For every pair of adjacent tokens (w1, w2) the counter
// result[w1][w2] is incremented.
//
// path:    location of the text file to read.
// returns: map from a word to the words observed directly after it, each with
//          its occurrence count. Empty when fewer than two tokens were found.
map<string, map<string, int>> WordUtils::load_data(const char *path) {
    // A string literal is null-terminated, unlike a brace-initialised char
    // array, so it is safe for any code that treats the pointer as a C string.
    const char del[] = " .,!?;:()\n\r\t";
    vector<string> data = split_string(load_file(path), del);

    // Lower-case every token exactly once instead of re-transforming each
    // token twice while walking the adjacent pairs.
    for (string& word : data) {
        transform(word.begin(), word.end(), word.begin(),
                  [](unsigned char c) { return static_cast<char>(tolower(c)); });
    }

    map<string, map<string, int>> word_frequencies;
    for (size_t i = 0; i + 1 < data.size(); ++i) {
        ++word_frequencies[data[i]][data[i + 1]];
    }
    return word_frequencies;
}
// Returns the most frequent successors of `input`.
//
// input:            word to look up (exact match against the table keys).
// word_frequencies: table mapping a word to its successors and their counts.
// count:            maximum number of successors to return.
// returns: up to `count` successors ordered by descending count; successors
//          with equal counts are ordered alphabetically. When `input` has no
//          entry in the table, a one-element vector holding `input` itself.
vector<string> WordUtils::predict_next_word(const string& input, const map<string, map<string, int>>& word_frequencies, size_t count) {
    const auto it = word_frequencies.find(input);
    if (it == word_frequencies.end()) return {input};

    // Bind by reference: copying the whole successor map on every call is
    // wasted work.
    const map<string, int>& nextWords = it->second;
    vector<pair<string, int>> ranked(nextWords.begin(), nextWords.end());
    const size_t limit = min(count, ranked.size());

    // Only the first `limit` entries are needed, so a partial sort suffices.
    // The comparator is a total order (count, then word), which makes the
    // result independent of the sort implementation's handling of ties.
    partial_sort(ranked.begin(), ranked.begin() + limit, ranked.end(),
                 [](const pair<string, int>& a, const pair<string, int>& b) {
                     if (a.second != b.second) return a.second > b.second;
                     return a.first < b.first;
                 });

    vector<string> results;
    results.reserve(limit);
    for (size_t i = 0; i < limit; ++i) {
        results.push_back(ranked[i].first);
    }
    return results;
}
// Generates the opening of an HTML page followed by pseudo-random paragraphs.
//
// One "<p>...</p>" element is produced per decimal digit of `hash`. The random
// generator for a paragraph is seeded with that digit's character code, so the
// output is fully determined by `hash` and `word_frequencies`. Each paragraph
// holds 26 words: one successor of a random starter word, then 25 words each
// chosen among the top successors of the previous word.
//
// word_frequencies: table used by predict_next_word() to propose successors.
// hash:             value whose decimal digits drive paragraph generation.
// returns: the page header followed by one string per paragraph.
//
// NOTE(review): the closing "</body></html>" markup is not appended here (the
// previous version declared it but never used it) - confirm whether the caller
// is expected to terminate the document.
vector<string> WordUtils::create_tag(const map<string, map<string, int>>& word_frequencies, unsigned int hash) {
    const vector<string> beginning_sequences = {"the", "but", "with"};
    const string start = "<!DOCTYPE html>\n<html><head><title>Drip</title></head><body>\n";
    const size_t predict_num = 5;
    const size_t words_per_tag = 26;

    const string hashes = to_string(hash);
    vector<string> tags;
    tags.reserve(hashes.size() + 1);
    tags.push_back(start);

    for (const char digit : hashes) {
        minstd_rand generator(static_cast<int>(digit));
        uniform_int_distribution<int> starter_distribution(0, static_cast<int>(beginning_sequences.size()) - 1);

        // Successors of a randomly chosen starter word. Never empty: falls back
        // to the starter itself, because a uniform_int_distribution whose upper
        // bound is below its lower bound has undefined behaviour.
        auto successors_of_random_starter = [&]() {
            const string& starter = beginning_sequences[starter_distribution(generator)];
            vector<string> candidates = predict_next_word(starter, word_frequencies, predict_num);
            if (candidates.empty()) candidates.push_back(starter);
            return candidates;
        };

        // Uniformly picks an index into a non-empty candidate list.
        auto pick_index = [&generator](size_t size) {
            uniform_int_distribution<int> choice(0, static_cast<int>(size) - 1);
            return static_cast<size_t>(choice(generator));
        };

        vector<string> inner_tags;
        inner_tags.reserve(words_per_tag);

        vector<string> candidates = successors_of_random_starter();
        inner_tags.push_back(candidates[pick_index(candidates.size())]);

        // Chain the remaining words, each following the previous one.
        for (size_t j = 0; j + 1 < words_per_tag; ++j) {
            candidates = predict_next_word(inner_tags[j], word_frequencies, predict_num);
            if (candidates.empty()) {
                // Dead end: restart the chain from a random starter word.
                candidates = successors_of_random_starter();
            }
            inner_tags.push_back(candidates[pick_index(candidates.size())]);
        }

        string paragraph = "<p>";
        for (const string& word : inner_tags) {
            paragraph += word;
            paragraph += ' ';
        }
        paragraph += ".</p>";
        tags.push_back(paragraph);
    }
    return tags;
}
// Splits `input` into tokens, where a token is a maximal run of characters
// that are not separators. Empty tokens are never produced.
//
// input:      text to tokenise.
// delimiters: currently NOT consulted; the separator set is fixed to
//             space . , ! ? ; : ( ) newline, carriage return and tab.
// returns: the tokens in order of appearance.
vector<string> WordUtils::split_string(const string& input, const char *delimiters) {
    const string separators = " .,!?;:()\n\r\t";
    vector<string> tokens;

    // Jump to the first character of each token, then to the separator that
    // ends it.
    size_t token_begin = input.find_first_not_of(separators);
    while (token_begin != string::npos) {
        const size_t token_end = input.find_first_of(separators, token_begin);
        if (token_end == string::npos) {
            // Final token runs to the end of the input.
            tokens.push_back(input.substr(token_begin));
            break;
        }
        tokens.push_back(input.substr(token_begin, token_end - token_begin));
        token_begin = input.find_first_not_of(separators, token_end);
    }
    return tokens;
}
// Reads the text file at `path` and returns its contents.
//
// Lines are read with getline(), which strips the line break, so a '\n' is
// appended after every line. Without it the last word of one line and the
// first word of the next would be fused into a single token downstream. A
// final line that lacks a trailing line break also receives one.
//
// path:    location of the file to read.
// returns: the file contents, or an empty string when `path` is null or the
//          file cannot be opened.
string WordUtils::load_file(const char *path) {
    if (path == nullptr) return "";

    ifstream file(path);  // closed automatically when it goes out of scope
    string data;
    string line;
    while (getline(file, line)) {
        data += line;
        data += '\n';
    }
    return data;
}
// Extracts the second space-separated field from the first line of `input`,
// e.g. "/index.html" from a request line such as "GET /index.html HTTP/1.1".
//
// input:   text whose first line (terminated by '\n') is examined.
// returns: the characters between the first and second space of the first
//          line, or an empty string when `input` contains no '\n' or its first
//          line has fewer than two spaces.
string WordUtils::extract_url(const string& input) {
    // Positions are kept as size_t: narrowing find() results to int and then
    // comparing against string::npos only works by accident of conversion and
    // fails for very long inputs.
    const size_t first_line_end = input.find('\n');
    if (first_line_end == string::npos) return "";
    const string first_line = input.substr(0, first_line_end);

    const size_t method_end = first_line.find(' ');
    if (method_end == string::npos) return "";

    const size_t path_end = first_line.find(' ', method_end + 1);
    if (path_end == string::npos) return "";

    return first_line.substr(method_end + 1, path_end - method_end - 1);
}
unsigned int WordUtils::hash_url(const string& input) {
unsigned int hash = 0;
for (char c : input) {
hash += static_cast<unsigned int>(c);
}
return hash;
}