Title and description extraction now only look in the head section of the HTML document #40

Merged
owner merged 1 commits from OptimizeTitleAndDescriptionExtraction into main 2024-12-23 13:10:49 +00:00
8 changed files with 63 additions and 38 deletions

View File

@ -16,6 +16,7 @@
<ItemGroup> <ItemGroup>
<PackageReference Include="FuzzySharp" Version="2.0.2" /> <PackageReference Include="FuzzySharp" Version="2.0.2" />
<PackageReference Include="HtmlAgilityPack" Version="1.11.71" />
<PackageReference Include="NetMQ" Version="4.0.1.13" /> <PackageReference Include="NetMQ" Version="4.0.1.13" />
</ItemGroup> </ItemGroup>

View File

@ -161,15 +161,15 @@ public class ContentFilter
for (int i = 0; i < ports.Length; i++) for (int i = 0; i < ports.Length; i++)
{ {
string? html = "";
if (ports[i] == 80) if (ports[i] == 80)
{ {
if (string.IsNullOrWhiteSpace(url1)) continue; if (string.IsNullOrWhiteSpace(url1)) continue;
try try
{ {
html = HttpClientHelper.GetHtml(url1, 80).GetAwaiter().GetResult(); (string, string) temp = HttpClientHelper.GetTitleAndDescription(url1, 80).GetAwaiter().GetResult();
title1 = temp.Item1;
description1 = temp.Item2;
} }
catch catch
{ {
@ -179,10 +179,12 @@ public class ContentFilter
else else
{ {
if (string.IsNullOrWhiteSpace(url2)) continue; if (string.IsNullOrWhiteSpace(url2)) continue;
try try
{ {
html = HttpClientHelper.GetHtml(url2, 443).GetAwaiter().GetResult(); (string, string) temp = HttpClientHelper.GetTitleAndDescription(url1, 443).GetAwaiter().GetResult();
title2 = temp.Item1;
description2 = temp.Item2;
} }
catch catch
{ {
@ -190,12 +192,6 @@ public class ContentFilter
} }
} }
if (string.IsNullOrWhiteSpace(html)) continue;
if (ports[i] == 80 && string.IsNullOrWhiteSpace(title1)) { FilterHelper.GetTitle(html, out title1); }
if (ports[i] == 443 && string.IsNullOrWhiteSpace(title2)) { FilterHelper.GetTitle(html ,out title2); }
if (ports[i] == 80 && string.IsNullOrWhiteSpace(description1)) { FilterHelper.GetDescription(html, out description1); }
if (ports[i] == 443 && string.IsNullOrWhiteSpace(description2)) { FilterHelper.GetDescription(html, out description2); }
if (ports[i] == 80 && !robotsTxt1) { robotsTxt1 = HttpClientHelper.HasRobotsTxt(url1, 80).GetAwaiter().GetResult(); } if (ports[i] == 80 && !robotsTxt1) { robotsTxt1 = HttpClientHelper.HasRobotsTxt(url1, 80).GetAwaiter().GetResult(); }
if (ports[i] == 443 && !robotsTxt2) { robotsTxt2 = HttpClientHelper.HasRobotsTxt(url2, 443).GetAwaiter().GetResult(); } if (ports[i] == 443 && !robotsTxt2) { robotsTxt2 = HttpClientHelper.HasRobotsTxt(url2, 443).GetAwaiter().GetResult(); }
} }

View File

@ -59,7 +59,7 @@ public class ThreadHandler
{ {
Thread.Sleep(5000); // Let the database handler instantiate and warm up first. Thread.Sleep(5000); // Let the database handler instantiate and warm up first.
List<WaitHandle[]> wait = _ipScanner.Start(32); List<WaitHandle[]> wait = _ipScanner.Start(128);
for (int i = 0; i < wait.Count; i++) for (int i = 0; i < wait.Count; i++)
{ {

View File

@ -1,10 +1,18 @@
using System.Text.RegularExpressions;
using HtmlAgilityPack;
namespace Backend.Helper; namespace Backend.Helper;
public static class HttpClientHelper public static partial class HttpClientHelper
{ {
// Reddit, for example, will block the GET request if you don't have a user agent.
private const string UserAgentHeader = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"; private const string UserAgentHeader = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
private const string TitlePattern = "<title>(.*)</title>";
private const string DescriptionPattern = "<meta name=\"description\" content=\"(.*?)\"";
private const string StartHeadTag = "<head>";
private const string EndHeadTag = "</head>";
public static async Task<string> GetHtml(string url, int port) public static async Task<(string, string)> GetTitleAndDescription(string url, int port)
{ {
using HttpClient client = new(); using HttpClient client = new();
@ -29,15 +37,48 @@ public static class HttpClientHelper
} }
catch catch
{ {
return ""; return ("", "");
} }
if (!response.IsSuccessStatusCode) if (!response.IsSuccessStatusCode)
{ {
return ""; return ("", "");
}
string html = await response.Content.ReadAsStringAsync();
int firstIndex = 0;
int lastIndex = 0;
if (html.Contains(StartHeadTag) && html.Contains(EndHeadTag))
{
firstIndex = html.IndexOf(StartHeadTag, StringComparison.Ordinal);
lastIndex = html.IndexOf(EndHeadTag, StringComparison.Ordinal);
} }
return await response.Content.ReadAsStringAsync(); string head = html.AsSpan().Slice(firstIndex, lastIndex).ToString();
html = "";
string title = "";
string description = "";
Regex titleRegex = TitleRegEx();
Match titleMatch = titleRegex.Match(head);
if (titleMatch.Success)
{
title = titleMatch.Groups[1].Value;
}
Regex descriptionRegex = DexcriptionRegEx();
Match descriptionMatch = descriptionRegex.Match(head);
if (descriptionMatch.Success)
{
description = descriptionMatch.Groups[1].Value;
}
return (title, description);
} }
public static async Task<bool> HasRobotsTxt(string url, int port) public static async Task<bool> HasRobotsTxt(string url, int port)
@ -70,4 +111,9 @@ public static class HttpClientHelper
return response is not null && response.IsSuccessStatusCode; return response is not null && response.IsSuccessStatusCode;
} }
[GeneratedRegex(TitlePattern)]
private static partial Regex TitleRegEx();
[GeneratedRegex(DescriptionPattern)]
private static partial Regex DexcriptionRegEx();
} }

View File

@ -8,8 +8,6 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Manager", "Manager\Manager.
EndProject EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Proxy", "Proxy\Proxy.csproj", "{55208481-5203-4B25-A20D-4EF644F76773}" Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Proxy", "Proxy\Proxy.csproj", "{55208481-5203-4B25-A20D-4EF644F76773}"
EndProject EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Shared", "Shared\Shared.csproj", "{DEB1411C-F45A-40DA-92F8-D9B9929DBA5B}"
EndProject
Global Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU Debug|Any CPU = Debug|Any CPU
@ -32,9 +30,5 @@ Global
{55208481-5203-4B25-A20D-4EF644F76773}.Debug|Any CPU.Build.0 = Debug|Any CPU {55208481-5203-4B25-A20D-4EF644F76773}.Debug|Any CPU.Build.0 = Debug|Any CPU
{55208481-5203-4B25-A20D-4EF644F76773}.Release|Any CPU.ActiveCfg = Release|Any CPU {55208481-5203-4B25-A20D-4EF644F76773}.Release|Any CPU.ActiveCfg = Release|Any CPU
{55208481-5203-4B25-A20D-4EF644F76773}.Release|Any CPU.Build.0 = Release|Any CPU {55208481-5203-4B25-A20D-4EF644F76773}.Release|Any CPU.Build.0 = Release|Any CPU
{DEB1411C-F45A-40DA-92F8-D9B9929DBA5B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{DEB1411C-F45A-40DA-92F8-D9B9929DBA5B}.Debug|Any CPU.Build.0 = Debug|Any CPU
{DEB1411C-F45A-40DA-92F8-D9B9929DBA5B}.Release|Any CPU.ActiveCfg = Release|Any CPU
{DEB1411C-F45A-40DA-92F8-D9B9929DBA5B}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection EndGlobalSection
EndGlobal EndGlobal

View File

@ -1,9 +1,11 @@
<wpf:ResourceDictionary xml:space="preserve" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" xmlns:s="clr-namespace:System;assembly=mscorlib" xmlns:ss="urn:shemas-jetbrains-com:settings-storage-xaml" xmlns:wpf="http://schemas.microsoft.com/winfx/2006/xaml/presentation"> <wpf:ResourceDictionary xml:space="preserve" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" xmlns:s="clr-namespace:System;assembly=mscorlib" xmlns:ss="urn:shemas-jetbrains-com:settings-storage-xaml" xmlns:wpf="http://schemas.microsoft.com/winfx/2006/xaml/presentation">
<s:String x:Key="/Default/CodeInspection/ExcludedFiles/FilesAndFoldersToSkip2/=7020124F_002D9FFC_002D4AC3_002D8F3D_002DAAB8E0240759_002Ff_003A02000002pdb1Low_002Ecs_002Fl_003A_002E_002E_003F_002E_002E_003F_002E_002E_003F_002E_002E_003F_002Econfig_003FJetBrains_003FRider2024_002E2_003Fresharper_002Dhost_003FDecompilerCache_003FILViewer_003F51b607df472a454cb6ed940749bbfdfd6000_003Fde_003F2c578873_003F02000002pdb1Low_002Ecs/@EntryIndexedValue">ForceIncluded</s:String> <s:String x:Key="/Default/CodeInspection/ExcludedFiles/FilesAndFoldersToSkip2/=7020124F_002D9FFC_002D4AC3_002D8F3D_002DAAB8E0240759_002Ff_003A02000002pdb1Low_002Ecs_002Fl_003A_002E_002E_003F_002E_002E_003F_002E_002E_003F_002E_002E_003F_002Econfig_003FJetBrains_003FRider2024_002E2_003Fresharper_002Dhost_003FDecompilerCache_003FILViewer_003F51b607df472a454cb6ed940749bbfdfd6000_003Fde_003F2c578873_003F02000002pdb1Low_002Ecs/@EntryIndexedValue">ForceIncluded</s:String>
<s:String x:Key="/Default/CodeInspection/ExcludedFiles/FilesAndFoldersToSkip2/=7020124F_002D9FFC_002D4AC3_002D8F3D_002DAAB8E0240759_002Ff_003A02000009pdb7Low_002Ecs_002Fl_003A_002E_002E_003F_002E_002E_003F_002E_002E_003F_002E_002E_003F_002Econfig_003FJetBrains_003FRider2024_002E2_003Fresharper_002Dhost_003FDecompilerCache_003FILViewer_003Fbbcfe225942e4131bc589e82ae4b92ab9800_003Fc0_003Fa20d693d_003F02000009pdb7Low_002Ecs/@EntryIndexedValue">ForceIncluded</s:String>
<s:String x:Key="/Default/CodeInspection/ExcludedFiles/FilesAndFoldersToSkip2/=7020124F_002D9FFC_002D4AC3_002D8F3D_002DAAB8E0240759_002Ff_003A0200000Cpdb6Low_002Ecs_002Fl_003A_002E_002E_003F_002E_002E_003F_002E_002E_003F_002E_002E_003F_002Econfig_003FJetBrains_003FRider2024_002E2_003Fresharper_002Dhost_003FDecompilerCache_003FILViewer_003Fb53c196a821648e4ae3b142a6ae58d7b9400_003Fa8_003F21a43479_003F0200000Cpdb6Low_002Ecs/@EntryIndexedValue">ForceIncluded</s:String> <s:String x:Key="/Default/CodeInspection/ExcludedFiles/FilesAndFoldersToSkip2/=7020124F_002D9FFC_002D4AC3_002D8F3D_002DAAB8E0240759_002Ff_003A0200000Cpdb6Low_002Ecs_002Fl_003A_002E_002E_003F_002E_002E_003F_002E_002E_003F_002E_002E_003F_002Econfig_003FJetBrains_003FRider2024_002E2_003Fresharper_002Dhost_003FDecompilerCache_003FILViewer_003Fb53c196a821648e4ae3b142a6ae58d7b9400_003Fa8_003F21a43479_003F0200000Cpdb6Low_002Ecs/@EntryIndexedValue">ForceIncluded</s:String>
<s:String x:Key="/Default/CodeInspection/ExcludedFiles/FilesAndFoldersToSkip2/=7020124F_002D9FFC_002D4AC3_002D8F3D_002DAAB8E0240759_002Ff_003A02000011pdb3Low_002Ecs_002Fl_003A_002E_002E_003F_002E_002E_003F_002E_002E_003F_002E_002E_003F_002Econfig_003FJetBrains_003FRider2024_002E2_003Fresharper_002Dhost_003FDecompilerCache_003FILViewer_003F7c3ed02c2ce44598b7f304f8ac45e58f8600_003F6d_003F99b875d1_003F02000011pdb3Low_002Ecs/@EntryIndexedValue">ForceIncluded</s:String> <s:String x:Key="/Default/CodeInspection/ExcludedFiles/FilesAndFoldersToSkip2/=7020124F_002D9FFC_002D4AC3_002D8F3D_002DAAB8E0240759_002Ff_003A02000011pdb3Low_002Ecs_002Fl_003A_002E_002E_003F_002E_002E_003F_002E_002E_003F_002E_002E_003F_002Econfig_003FJetBrains_003FRider2024_002E2_003Fresharper_002Dhost_003FDecompilerCache_003FILViewer_003F7c3ed02c2ce44598b7f304f8ac45e58f8600_003F6d_003F99b875d1_003F02000011pdb3Low_002Ecs/@EntryIndexedValue">ForceIncluded</s:String>
<s:String x:Key="/Default/CodeInspection/ExcludedFiles/FilesAndFoldersToSkip2/=7020124F_002D9FFC_002D4AC3_002D8F3D_002DAAB8E0240759_002Ff_003ADirectoryInfo_002Ecs_002Fl_003A_002E_002E_003F_002E_002E_003F_002E_002E_003F_002E_002E_003F_002Econfig_003FJetBrains_003FRider2024_002E2_003Fresharper_002Dhost_003FSourcesCache_003Fe4ec446cfe0489bc3ef68a45c6766d183e999ebdc657e94fb1ad059de2bb9_003FDirectoryInfo_002Ecs/@EntryIndexedValue">ForceIncluded</s:String> <s:String x:Key="/Default/CodeInspection/ExcludedFiles/FilesAndFoldersToSkip2/=7020124F_002D9FFC_002D4AC3_002D8F3D_002DAAB8E0240759_002Ff_003ADirectoryInfo_002Ecs_002Fl_003A_002E_002E_003F_002E_002E_003F_002E_002E_003F_002E_002E_003F_002Econfig_003FJetBrains_003FRider2024_002E2_003Fresharper_002Dhost_003FSourcesCache_003Fe4ec446cfe0489bc3ef68a45c6766d183e999ebdc657e94fb1ad059de2bb9_003FDirectoryInfo_002Ecs/@EntryIndexedValue">ForceIncluded</s:String>
<s:String x:Key="/Default/CodeInspection/ExcludedFiles/FilesAndFoldersToSkip2/=7020124F_002D9FFC_002D4AC3_002D8F3D_002DAAB8E0240759_002Ff_003AHttpResponseMessage_002Ecs_002Fl_003A_002E_002E_003F_002E_002E_003F_002E_002E_003F_002E_002E_003F_002Econfig_003FJetBrains_003FRider2024_002E2_003Fresharper_002Dhost_003FSourcesCache_003F85e97f467d698c9e98eae9e3a1b39d58541173e57992d8f7111eabdd3db3526_003FHttpResponseMessage_002Ecs/@EntryIndexedValue">ForceIncluded</s:String> <s:String x:Key="/Default/CodeInspection/ExcludedFiles/FilesAndFoldersToSkip2/=7020124F_002D9FFC_002D4AC3_002D8F3D_002DAAB8E0240759_002Ff_003AHttpResponseMessage_002Ecs_002Fl_003A_002E_002E_003F_002E_002E_003F_002E_002E_003F_002E_002E_003F_002Econfig_003FJetBrains_003FRider2024_002E2_003Fresharper_002Dhost_003FSourcesCache_003F85e97f467d698c9e98eae9e3a1b39d58541173e57992d8f7111eabdd3db3526_003FHttpResponseMessage_002Ecs/@EntryIndexedValue">ForceIncluded</s:String>
<s:String x:Key="/Default/CodeInspection/ExcludedFiles/FilesAndFoldersToSkip2/=7020124F_002D9FFC_002D4AC3_002D8F3D_002DAAB8E0240759_002Ff_003ARateLimitRule_002Ecs_002Fl_003A_002E_002E_003F_002E_002E_003F_002E_002E_003F_002E_002E_003F_002Econfig_003FJetBrains_003FRider2024_002E2_003Fresharper_002Dhost_003FSourcesCache_003F8fbca8b1bca27d45830c443b2c773d979015ea216430366f285514a39fc0b9_003FRateLimitRule_002Ecs/@EntryIndexedValue">ForceIncluded</s:String> <s:String x:Key="/Default/CodeInspection/ExcludedFiles/FilesAndFoldersToSkip2/=7020124F_002D9FFC_002D4AC3_002D8F3D_002DAAB8E0240759_002Ff_003ARateLimitRule_002Ecs_002Fl_003A_002E_002E_003F_002E_002E_003F_002E_002E_003F_002E_002E_003F_002Econfig_003FJetBrains_003FRider2024_002E2_003Fresharper_002Dhost_003FSourcesCache_003F8fbca8b1bca27d45830c443b2c773d979015ea216430366f285514a39fc0b9_003FRateLimitRule_002Ecs/@EntryIndexedValue">ForceIncluded</s:String>
<s:String x:Key="/Default/CodeInspection/ExcludedFiles/FilesAndFoldersToSkip2/=7020124F_002D9FFC_002D4AC3_002D8F3D_002DAAB8E0240759_002Ff_003AStartupExtensions_002Ecs_002Fl_003A_002E_002E_003F_002E_002E_003F_002E_002E_003F_002E_002E_003F_002Econfig_003FJetBrains_003FRider2024_002E2_003Fresharper_002Dhost_003FSourcesCache_003F3ce5d581dd9cc0e4cdfd914e797ba2da05e894767d76b86f0515ef5226bac_003FStartupExtensions_002Ecs/@EntryIndexedValue">ForceIncluded</s:String> <s:String x:Key="/Default/CodeInspection/ExcludedFiles/FilesAndFoldersToSkip2/=7020124F_002D9FFC_002D4AC3_002D8F3D_002DAAB8E0240759_002Ff_003AStartupExtensions_002Ecs_002Fl_003A_002E_002E_003F_002E_002E_003F_002E_002E_003F_002E_002E_003F_002Econfig_003FJetBrains_003FRider2024_002E2_003Fresharper_002Dhost_003FSourcesCache_003F3ce5d581dd9cc0e4cdfd914e797ba2da05e894767d76b86f0515ef5226bac_003FStartupExtensions_002Ecs/@EntryIndexedValue">ForceIncluded</s:String>
<s:String x:Key="/Default/CodeInspection/ExcludedFiles/FilesAndFoldersToSkip2/=7020124F_002D9FFC_002D4AC3_002D8F3D_002DAAB8E0240759_002Ff_003AString_002ESearching_002Ecs_002Fl_003A_002E_002E_003F_002E_002E_003F_002E_002E_003F_002E_002E_003F_002Econfig_003FJetBrains_003FRider2024_002E2_003Fresharper_002Dhost_003FSourcesCache_003F49ee52518952e16b89adee3d6c9346ae6c74be268730f0497eb14b34b49d56c_003FString_002ESearching_002Ecs/@EntryIndexedValue">ForceIncluded</s:String>
<s:String x:Key="/Default/CodeInspection/ExcludedFiles/FilesAndFoldersToSkip2/=7020124F_002D9FFC_002D4AC3_002D8F3D_002DAAB8E0240759_002Ff_003AThread_002Ecs_002Fl_003A_002E_002E_003F_002E_002E_003F_002E_002E_003F_002E_002E_003F_002Econfig_003FJetBrains_003FRider2024_002E2_003Fresharper_002Dhost_003FSourcesCache_003F693e634d7742afaf486acd69d84fe2a9e1ee1b11ba84f29cd1d67668d20dd59_003FThread_002Ecs/@EntryIndexedValue">ForceIncluded</s:String></wpf:ResourceDictionary> <s:String x:Key="/Default/CodeInspection/ExcludedFiles/FilesAndFoldersToSkip2/=7020124F_002D9FFC_002D4AC3_002D8F3D_002DAAB8E0240759_002Ff_003AThread_002Ecs_002Fl_003A_002E_002E_003F_002E_002E_003F_002E_002E_003F_002E_002E_003F_002Econfig_003FJetBrains_003FRider2024_002E2_003Fresharper_002Dhost_003FSourcesCache_003F693e634d7742afaf486acd69d84fe2a9e1ee1b11ba84f29cd1d67668d20dd59_003FThread_002Ecs/@EntryIndexedValue">ForceIncluded</s:String></wpf:ResourceDictionary>

View File

@ -1,5 +0,0 @@
namespace Shared;
public class Class1
{
}

View File

@ -1,9 +0,0 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net8.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
</Project>