diff --git a/Backend/Backend.csproj b/Backend/Backend.csproj index 5c26c51..95c5c1e 100644 --- a/Backend/Backend.csproj +++ b/Backend/Backend.csproj @@ -16,6 +16,7 @@ + diff --git a/Backend/Handler/ContentFilter.cs b/Backend/Handler/ContentFilter.cs index 8d78691..b6930b6 100644 --- a/Backend/Handler/ContentFilter.cs +++ b/Backend/Handler/ContentFilter.cs @@ -161,15 +161,15 @@ public class ContentFilter for (int i = 0; i < ports.Length; i++) { - string? html = ""; - if (ports[i] == 80) { if (string.IsNullOrWhiteSpace(url1)) continue; - + try { - html = HttpClientHelper.GetHtml(url1, 80).GetAwaiter().GetResult(); + (string, string) temp = HttpClientHelper.GetTitleAndDescription(url1, 80).GetAwaiter().GetResult(); + title1 = temp.Item1; + description1 = temp.Item2; } catch { @@ -179,10 +179,12 @@ public class ContentFilter else { if (string.IsNullOrWhiteSpace(url2)) continue; - + try { - html = HttpClientHelper.GetHtml(url2, 443).GetAwaiter().GetResult(); + (string, string) temp = HttpClientHelper.GetTitleAndDescription(url1, 443).GetAwaiter().GetResult(); + title2 = temp.Item1; + description2 = temp.Item2; } catch { @@ -190,12 +192,6 @@ public class ContentFilter } } - if (string.IsNullOrWhiteSpace(html)) continue; - - if (ports[i] == 80 && string.IsNullOrWhiteSpace(title1)) { FilterHelper.GetTitle(html, out title1); } - if (ports[i] == 443 && string.IsNullOrWhiteSpace(title2)) { FilterHelper.GetTitle(html ,out title2); } - if (ports[i] == 80 && string.IsNullOrWhiteSpace(description1)) { FilterHelper.GetDescription(html, out description1); } - if (ports[i] == 443 && string.IsNullOrWhiteSpace(description2)) { FilterHelper.GetDescription(html, out description2); } if (ports[i] == 80 && !robotsTxt1) { robotsTxt1 = HttpClientHelper.HasRobotsTxt(url1, 80).GetAwaiter().GetResult(); } if (ports[i] == 443 && !robotsTxt2) { robotsTxt2 = HttpClientHelper.HasRobotsTxt(url2, 443).GetAwaiter().GetResult(); } } diff --git a/Backend/Handler/ThreadHandler.cs b/Backend/Handler/ThreadHandler.cs index f92d076..784c270 100644 --- a/Backend/Handler/ThreadHandler.cs +++ b/Backend/Handler/ThreadHandler.cs @@ -59,7 +59,7 @@ public class ThreadHandler { Thread.Sleep(5000); // Let the database handler instantiate and warm up first. - List wait = _ipScanner.Start(32); + List wait = _ipScanner.Start(128); for (int i = 0; i < wait.Count; i++) { diff --git a/Backend/Helper/HttpClientHelper.cs b/Backend/Helper/HttpClientHelper.cs index 25c597a..444c16c 100644 --- a/Backend/Helper/HttpClientHelper.cs +++ b/Backend/Helper/HttpClientHelper.cs @@ -1,10 +1,18 @@ +using System.Text.RegularExpressions; +using HtmlAgilityPack; + namespace Backend.Helper; -public static class HttpClientHelper +public static partial class HttpClientHelper { + // Reddit, for example, will block the GET request if you don't have a user agent. private const string UserAgentHeader = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"; + private const string TitlePattern = "(.*)"; + private const string DescriptionPattern = " GetHtml(string url, int port) + public static async Task<(string, string)> GetTitleAndDescription(string url, int port) { using HttpClient client = new(); @@ -29,15 +37,48 @@ public static class HttpClientHelper } catch { - return ""; + return ("", ""); } if (!response.IsSuccessStatusCode) { - return ""; + return ("", ""); + } + + string html = await response.Content.ReadAsStringAsync(); + + int firstIndex = 0; + int lastIndex = 0; + + if (html.Contains(StartHeadTag) && html.Contains(EndHeadTag)) + { + firstIndex = html.IndexOf(StartHeadTag, StringComparison.Ordinal); + lastIndex = html.IndexOf(EndHeadTag, StringComparison.Ordinal); } - return await response.Content.ReadAsStringAsync(); + string head = html.AsSpan().Slice(firstIndex, lastIndex).ToString(); + html = ""; + + string title = ""; + string description = ""; + + Regex titleRegex = TitleRegEx(); + Match titleMatch = titleRegex.Match(head); + + if (titleMatch.Success) + { + title = titleMatch.Groups[1].Value; + } + + Regex descriptionRegex = DexcriptionRegEx(); + Match descriptionMatch = descriptionRegex.Match(head); + + if (descriptionMatch.Success) + { + description = descriptionMatch.Groups[1].Value; + } + + return (title, description); } public static async Task HasRobotsTxt(string url, int port) @@ -70,4 +111,9 @@ public static class HttpClientHelper return response is not null && response.IsSuccessStatusCode; } + + [GeneratedRegex(TitlePattern)] + private static partial Regex TitleRegEx(); + [GeneratedRegex(DescriptionPattern)] + private static partial Regex DexcriptionRegEx(); } \ No newline at end of file diff --git a/RSE.sln b/RSE.sln index f528fd9..7851f8c 100644 --- a/RSE.sln +++ b/RSE.sln @@ -8,8 +8,6 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Manager", "Manager\Manager. EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Proxy", "Proxy\Proxy.csproj", "{55208481-5203-4B25-A20D-4EF644F76773}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Shared", "Shared\Shared.csproj", "{DEB1411C-F45A-40DA-92F8-D9B9929DBA5B}" -EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -32,9 +30,5 @@ Global {55208481-5203-4B25-A20D-4EF644F76773}.Debug|Any CPU.Build.0 = Debug|Any CPU {55208481-5203-4B25-A20D-4EF644F76773}.Release|Any CPU.ActiveCfg = Release|Any CPU {55208481-5203-4B25-A20D-4EF644F76773}.Release|Any CPU.Build.0 = Release|Any CPU - {DEB1411C-F45A-40DA-92F8-D9B9929DBA5B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {DEB1411C-F45A-40DA-92F8-D9B9929DBA5B}.Debug|Any CPU.Build.0 = Debug|Any CPU - {DEB1411C-F45A-40DA-92F8-D9B9929DBA5B}.Release|Any CPU.ActiveCfg = Release|Any CPU - {DEB1411C-F45A-40DA-92F8-D9B9929DBA5B}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection EndGlobal diff --git a/RSE.sln.DotSettings.user b/RSE.sln.DotSettings.user index a5706a8..37aa8f6 100644 --- a/RSE.sln.DotSettings.user +++ b/RSE.sln.DotSettings.user @@ -1,9 +1,11 @@  ForceIncluded + ForceIncluded ForceIncluded ForceIncluded ForceIncluded ForceIncluded ForceIncluded ForceIncluded + ForceIncluded ForceIncluded \ No newline at end of file diff --git a/Shared/Class1.cs b/Shared/Class1.cs deleted file mode 100644 index eb85986..0000000 --- a/Shared/Class1.cs +++ /dev/null @@ -1,5 +0,0 @@ -namespace Shared; - -public class Class1 -{ -} \ No newline at end of file diff --git a/Shared/Shared.csproj b/Shared/Shared.csproj deleted file mode 100644 index 3a63532..0000000 --- a/Shared/Shared.csproj +++ /dev/null @@ -1,9 +0,0 @@ - - - - net8.0 - enable - enable - - -