119 lines
3.5 KiB
C#
119 lines
3.5 KiB
C#
using System.Text.RegularExpressions;
|
|
using HtmlAgilityPack;
|
|
|
|
namespace Backend.Helper;
|
|
|
|
/// <summary>
/// Best-effort HTTP probes against a remote host: reads the landing page's title and meta
/// description, and checks whether a <c>robots.txt</c> is served.
/// </summary>
public static partial class HttpClientHelper
{
    // Reddit, for example, will block the GET request if you don't have a user agent.
    private const string UserAgentHeader = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";

    // Lazy capture: stop at the first closing tag instead of the last one on the line.
    private const string TitlePattern = "<title>(.*?)</title>";
    private const string DescriptionPattern = "<meta name=\"description\" content=\"(.*?)\"";
    private const string StartHeadTag = "<head>";
    private const string EndHeadTag = "</head>";

    // Port that selects plain HTTP; every other port is contacted over HTTPS.
    private const int PlainHttpPort = 80;
    private const int RequestTimeoutSeconds = 30;

    // One client for the whole process: creating an HttpClient per request leaks sockets
    // under load. HttpClient request methods are safe to call concurrently.
    private static readonly HttpClient SharedClient = CreateSharedClient();

    /// <summary>
    /// Downloads the root page of <paramref name="url"/> and extracts the document title and
    /// the <c>description</c> meta tag from its <c>&lt;head&gt;</c> section.
    /// </summary>
    /// <param name="url">Host (optionally with port/path) without a scheme, e.g. <c>example.com</c>.</param>
    /// <param name="port">80 selects <c>http://</c>; any other value selects <c>https://</c>.</param>
    /// <returns>
    /// A <c>(title, description)</c> tuple. Each element is an empty string when it cannot be
    /// determined: the request fails or times out, the status is not a success code, the page
    /// has no literal <c>&lt;head&gt;</c>…<c>&lt;/head&gt;</c> pair, or the tag is not present.
    /// </returns>
    /// <exception cref="UriFormatException"><paramref name="url"/> does not form a valid URI.</exception>
    public static async Task<(string, string)> GetTitleAndDescription(string url, int port)
    {
        Uri pageUri = BuildUri(url, port, "/");
        string html;

        try
        {
            using HttpResponseMessage response = await SharedClient.GetAsync(pageUri);

            if (!response.IsSuccessStatusCode)
            {
                return ("", "");
            }

            html = await response.Content.ReadAsStringAsync();
        }
        catch (Exception ex) when (ex is HttpRequestException or OperationCanceledException)
        {
            // Unreachable host, TLS/transport failure or timeout: this probe is best-effort.
            return ("", "");
        }

        string head = ExtractHead(html);

        string title = FirstGroupOrEmpty(TitleRegEx(), head);
        string description = FirstGroupOrEmpty(DescriptionRegEx(), head);

        return (title, description);
    }

    /// <summary>
    /// Issues a <c>HEAD /robots.txt</c> request against <paramref name="url"/>.
    /// </summary>
    /// <param name="url">Host (optionally with port/path) without a scheme, e.g. <c>example.com</c>.</param>
    /// <param name="port">80 selects <c>http://</c>; any other value selects <c>https://</c>.</param>
    /// <returns>
    /// <see langword="true"/> when the server answers with a success status code;
    /// <see langword="false"/> on any other status, on a transport failure, or on timeout.
    /// </returns>
    /// <exception cref="UriFormatException"><paramref name="url"/> does not form a valid URI.</exception>
    public static async Task<bool> HasRobotsTxt(string url, int port)
    {
        using HttpRequestMessage request = new(HttpMethod.Head, BuildUri(url, port, "/robots.txt"));

        try
        {
            using HttpResponseMessage response = await SharedClient.SendAsync(request);
            return response.IsSuccessStatusCode;
        }
        catch (Exception ex) when (ex is HttpRequestException or OperationCanceledException)
        {
            // Unreachable host, TLS/transport failure or timeout: report "no robots.txt".
            return false;
        }
    }

    // Builds the process-wide client with the user agent and timeout every probe uses.
    private static HttpClient CreateSharedClient()
    {
        SocketsHttpHandler handler = new()
        {
            // Recycle pooled connections periodically so DNS changes are picked up.
            PooledConnectionLifetime = TimeSpan.FromMinutes(2),
        };

        HttpClient client = new(handler)
        {
            Timeout = TimeSpan.FromSeconds(RequestTimeoutSeconds),
        };

        client.DefaultRequestHeaders.UserAgent.ParseAdd(UserAgentHeader);

        return client;
    }

    // Resolves a root-relative path against http://{url} (port 80) or https://{url} (any other port).
    private static Uri BuildUri(string url, int port, string relativePath)
    {
        string scheme = port == PlainHttpPort ? "http" : "https";

        return new Uri(new Uri($"{scheme}://{url}"), relativePath);
    }

    // Returns the markup from the opening <head> tag up to (not including) </head>,
    // or an empty string when the pair is not found. A <head> tag carrying attributes
    // is not recognised.
    private static string ExtractHead(string html)
    {
        int start = html.IndexOf(StartHeadTag, StringComparison.OrdinalIgnoreCase);

        if (start < 0)
        {
            return "";
        }

        // Search from the opening tag so the end index can never precede the start.
        int end = html.IndexOf(EndHeadTag, start, StringComparison.OrdinalIgnoreCase);

        return end < 0 ? "" : html[start..end];
    }

    // Returns capture group 1 of the first match, or an empty string when nothing matches.
    private static string FirstGroupOrEmpty(Regex pattern, string input)
    {
        Match match = pattern.Match(input);

        return match.Success ? match.Groups[1].Value : "";
    }

    [GeneratedRegex(TitlePattern, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant)]
    private static partial Regex TitleRegEx();

    [GeneratedRegex(DescriptionPattern, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant)]
    private static partial Regex DescriptionRegEx();
}