using System.Text.RegularExpressions;

namespace Backend.Helper;

/// <summary>
/// Best-effort HTTP probes against a site: reads the landing page's title and
/// meta description, and checks whether a <c>/robots.txt</c> is served.
/// </summary>
public static partial class HttpClientHelper
{
    // Reddit, for example, will block the GET request if you don't have a user agent.
    private const string UserAgentHeader = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";

    // NOTE(review): the literals below were not fully legible in the reviewed copy of
    // this file (markup-like text had been stripped). They are reconstructed from how
    // the code uses them; confirm them against the original before merging.
    private const string StartHeadTag = "<head>";
    private const string EndHeadTag = "</head>";

    // Lazy quantifiers keep each capture inside its own element/attribute.
    private const string TitlePattern = "<title>(.*?)</title>";
    private const string DescriptionPattern = @"<meta\s+name=""description""\s+content=""(.*?)""";

    private static readonly TimeSpan RequestTimeout = TimeSpan.FromSeconds(30);

    /// <summary>
    /// Downloads the root page of <paramref name="url"/> and extracts the contents of its
    /// title element and description meta tag from the head section.
    /// </summary>
    /// <param name="url">Host part of the address (no scheme), e.g. <c>example.com</c>.</param>
    /// <param name="port">
    /// Only selects the scheme: 80 means <c>http</c>, any other value means <c>https</c>.
    /// The value is not appended to the address.
    /// </param>
    /// <returns>
    /// A <c>(title, description)</c> tuple. Either element is an empty string when it cannot
    /// be found; both are empty when the request fails, the status code is not a success
    /// code, or the page has no head section.
    /// </returns>
    /// <exception cref="UriFormatException">
    /// <paramref name="url"/> does not form a valid absolute address.
    /// </exception>
    public static async Task<(string, string)> GetTitleAndDescription(string url, int port)
    {
        using HttpClient client = CreateClient(url, port);

        string html;
        try
        {
            using HttpResponseMessage response = await client.GetAsync("/");
            if (!response.IsSuccessStatusCode)
            {
                return ("", "");
            }

            html = await response.Content.ReadAsStringAsync();
        }
        catch (Exception)
        {
            // Deliberately best-effort: DNS failures, refused connections, timeouts and
            // broken response bodies all mean "no metadata available" for the caller.
            return ("", "");
        }

        return ExtractTitleAndDescription(html);
    }

    /// <summary>
    /// Sends a HEAD request for <c>/robots.txt</c> and reports whether it succeeded.
    /// </summary>
    /// <param name="url">Host part of the address (no scheme), e.g. <c>example.com</c>.</param>
    /// <param name="port">
    /// Only selects the scheme: 80 means <c>http</c>, any other value means <c>https</c>.
    /// </param>
    /// <returns>
    /// <see langword="true"/> when the server answers with a success status code;
    /// <see langword="false"/> on any other status or when the request fails.
    /// </returns>
    /// <exception cref="UriFormatException">
    /// <paramref name="url"/> does not form a valid absolute address.
    /// </exception>
    public static async Task<bool> HasRobotsTxt(string url, int port)
    {
        using HttpClient client = CreateClient(url, port);
        using HttpRequestMessage request = new(HttpMethod.Head, "/robots.txt");

        try
        {
            using HttpResponseMessage response = await client.SendAsync(request);
            return response.IsSuccessStatusCode;
        }
        catch (Exception)
        {
            // An unreachable host is reported the same way as a missing robots.txt.
            return false;
        }
    }

    /// <summary>
    /// Builds a client bound to the site with the browser user agent and the request timeout.
    /// The caller owns the returned instance and must dispose it.
    /// </summary>
    private static HttpClient CreateClient(string url, int port)
    {
        string scheme = port == 80 ? "http" : "https";

        // Build the address first so an invalid url throws before a client is allocated.
        Uri baseAddress = new($"{scheme}://{url}");

        HttpClient client = new()
        {
            BaseAddress = baseAddress,
            Timeout = RequestTimeout,
        };
        client.DefaultRequestHeaders.Accept.Clear();
        client.DefaultRequestHeaders.UserAgent.ParseAdd(UserAgentHeader);
        return client;
    }

    /// <summary>
    /// Cuts the head section out of <paramref name="html"/> and runs both regexes over it.
    /// </summary>
    private static (string, string) ExtractTitleAndDescription(string html)
    {
        int headStart = html.IndexOf(StartHeadTag, StringComparison.OrdinalIgnoreCase);
        if (headStart < 0)
        {
            return ("", "");
        }

        // Search from headStart so the closing tag can never precede the opening one.
        int headEnd = html.IndexOf(EndHeadTag, headStart, StringComparison.OrdinalIgnoreCase);
        if (headEnd < 0)
        {
            return ("", "");
        }

        // The range is [start, end): the previous code passed the end index as a length,
        // which over-read the document or threw when start + end exceeded its length.
        string head = html[headStart..headEnd];

        Match titleMatch = TitleRegEx().Match(head);
        string title = titleMatch.Success ? titleMatch.Groups[1].Value : "";

        Match descriptionMatch = DescriptionRegEx().Match(head);
        string description = descriptionMatch.Success ? descriptionMatch.Groups[1].Value : "";

        return (title, description);
    }

    // HTML element and attribute names are case-insensitive, so the regexes are too.
    [GeneratedRegex(TitlePattern, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant)]
    private static partial Regex TitleRegEx();

    [GeneratedRegex(DescriptionPattern, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant)]
    private static partial Regex DescriptionRegEx();
}