2023-02-26 09:40:04 -08:00
|
|
|
using HtmlAgilityPack;
|
|
|
|
using Newsbot.Collector.Domain.Exceptions;
|
|
|
|
|
|
|
|
namespace Newsbot.Collector.Services.HtmlParser;
|
|
|
|
|
|
|
|
/// <summary>
/// Reads the &lt;meta&gt; and &lt;link&gt; tags of an HTML document and exposes the
/// well-known values they carry (title, description, image, url, page type,
/// theme color, feed link and YouTube channel id).
/// </summary>
public class HeadParserClient
{
    private const string XPathHeadMetaTag = "//head/meta";
    private const string XPathBodyMetaTag = "//body/meta";
    private const string XPathLinkTag = "//head/link";

    private readonly string _htmlContent;

    // The document is parsed on first use and then reused by every lookup,
    // instead of re-parsing the same HTML for each Get* call.
    private readonly Lazy<HtmlDocument> _document;

    /// <summary>
    /// Creates a parser over the given HTML. Nothing is parsed until
    /// <see cref="Parse"/> or one of the Get* methods is called.
    /// </summary>
    /// <param name="htmlContent">The raw HTML of the page to inspect.</param>
    /// <param name="useBrowser">
    /// Not used by this class; kept so existing callers keep compiling.
    /// </param>
    public HeadParserClient(string htmlContent, bool useBrowser = false)
    {
        _htmlContent = htmlContent;
        _document = new Lazy<HtmlDocument>(LoadDocument);
        Data = new HeadParserModel();
    }

    /// <summary>
    /// The values collected by the most recent call to <see cref="Parse"/>.
    /// </summary>
    public HeadParserModel Data { get; set; }

    /// <summary>
    /// Runs every lookup and stores the results on <see cref="Data"/>.
    /// Values that are not present in the document are stored as empty strings.
    /// </summary>
    public void Parse()
    {
        Data.Title = GetMetaTitle();
        Data.Description = GetMetaDescription();
        Data.Image = GetMetaImage();
        Data.Url = GetMetaUrl();
        Data.PageType = GetMetaPageType();
        Data.ColorTheme = GetMetaColorTheme();

        Data.FeedUri = GetSiteFeed();
        Data.YoutubeChannelID = GetYouTubeChannelId();
    }

    /// <summary>
    /// Returns the first non-empty value among the twitter:title, og:title and title meta tags.
    /// </summary>
    /// <returns>The title, or an empty string when none of the tags is present.</returns>
    public string GetMetaTitle()
    {
        string[] tags = { "twitter:title", "og:title", "title" };
        return FindFirstResult(tags, CollectMetaTags());
    }

    /// <summary>
    /// Returns the first non-empty value among the twitter:description, og:description
    /// and description meta tags.
    /// </summary>
    /// <returns>The description, or an empty string when none of the tags is present.</returns>
    public string GetMetaDescription()
    {
        string[] tags = { "twitter:description", "og:description", "description" };
        return FindFirstResult(tags, CollectMetaTags());
    }

    /// <summary>
    /// Returns the first non-empty value among the twitter:image, og:image and image meta tags.
    /// </summary>
    /// <returns>The image value, or an empty string when none of the tags is present.</returns>
    public string GetMetaImage()
    {
        string[] tags = { "twitter:image", "og:image", "image" };
        return FindFirstResult(tags, CollectMetaTags());
    }

    /// <summary>
    /// Returns the first non-empty value among the twitter:url, og:url and url meta tags.
    /// </summary>
    /// <returns>The url value, or an empty string when none of the tags is present.</returns>
    public string GetMetaUrl()
    {
        string[] tags = { "twitter:url", "og:url", "url" };
        return FindFirstResult(tags, CollectMetaTags());
    }

    /// <summary>
    /// Returns the first non-empty value among the og:type and type meta tags.
    /// </summary>
    /// <returns>The page type, or an empty string when none of the tags is present.</returns>
    public string GetMetaPageType()
    {
        string[] tags = { "og:type", "type" };
        return FindFirstResult(tags, CollectMetaTags());
    }

    /// <summary>
    /// Returns the value of the theme-color meta tag.
    /// </summary>
    /// <returns>The theme color, or an empty string when the tag is not present.</returns>
    public string GetMetaColorTheme()
    {
        string[] tags = { "theme-color" };
        return FindFirstResult(tags, CollectMetaTags());
    }

    /// <summary>
    /// Looks up the og:url and then the channelId meta tag and removes the
    /// "https://www.youtube.com/channel/" prefix from the value that was found.
    /// </summary>
    /// <returns>
    /// The remaining text after the prefix has been removed, or an empty string
    /// when neither tag is present. A value without that prefix is returned unchanged.
    /// </returns>
    public string GetYouTubeChannelId()
    {
        string[] tags = { "og:url", "channelId" };
        var results = FindFirstResult(tags, CollectMetaTags());
        return results.Replace("https://www.youtube.com/channel/", "");
    }

    /// <summary>
    /// Searches the &lt;link&gt; tags in the head for the first one whose first attribute
    /// value contains "alternate" and returns its href.
    /// A protocol-relative href ("//host/path") is returned with "https:" prepended.
    /// </summary>
    /// <returns>
    /// The href of the matching link, or an empty string when there is no matching
    /// link or the matching link has no href attribute. This method does not throw
    /// when the feed is missing.
    /// </returns>
    public string GetSiteFeed()
    {
        var tags = new[] { "alternate" };

        HtmlAttributeCollection attributes;
        try
        {
            attributes = FindFirstAttribute(tags, CollectLinkTags());
        }
        catch (MissingHeaderValueException)
        {
            // not found
            return "";
        }

        foreach (var item in attributes)
        {
            if (item.Name != "href")
            {
                continue;
            }

            var uri = item.Value;
            if (uri.StartsWith("//", StringComparison.Ordinal))
            {
                // Only the leading "//" is protocol-relative; any later "//" in the
                // path or query must be left alone.
                uri = "https:" + uri;
            }

            return uri;
        }

        return "";
    }

    /// <summary>Parses the HTML handed to the constructor.</summary>
    private HtmlDocument LoadDocument()
    {
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(_htmlContent);
        return htmlDoc;
    }

    /// <summary>
    /// Runs an XPath query against the document and always returns a list.
    /// </summary>
    private List<HtmlNode> SelectNodesOrEmpty(string xpath)
    {
        // SelectNodes yields null, not an empty collection, when nothing matches.
        var nodes = _document.Value.DocumentNode.SelectNodes(xpath);
        return nodes == null ? new List<HtmlNode>() : nodes.ToList();
    }

    /// <summary>
    /// Collects the &lt;meta&gt; tags of the head followed by those of the body.
    /// </summary>
    private List<HtmlNode> CollectMetaTags()
    {
        var allTags = SelectNodesOrEmpty(XPathHeadMetaTag);

        // we check the body thanks to Youtube.
        // no tags found in the body and that's ok.
        allTags.AddRange(SelectNodesOrEmpty(XPathBodyMetaTag));

        return allTags;
    }

    /// <summary>Collects the &lt;link&gt; tags of the head.</summary>
    private List<HtmlNode> CollectLinkTags()
    {
        return SelectNodesOrEmpty(XPathLinkTag);
    }

    /// <summary>
    /// Returns the value of the second attribute of the first node whose first
    /// attribute value contains <paramref name="tag"/>, or an empty string.
    /// </summary>
    private static string GetTagValue(string tag, List<HtmlNode> html)
    {
        foreach (var meta in html)
        {
            // Both the key attribute (index 0) and the value attribute (index 1)
            // are read below, so nodes with fewer than two attributes cannot match.
            if (meta.Attributes.Count < 2)
            {
                continue;
            }

            if (!meta.Attributes[0].Value.Contains(tag))
            {
                continue;
            }

            return meta.Attributes[1].Value;
        }

        return "";
    }

    /// <summary>
    /// Tries each tag in order and returns the first non-empty value, or an empty string.
    /// </summary>
    private static string FindFirstResult(string[] tags, List<HtmlNode> htmlTags)
    {
        foreach (var tag in tags)
        {
            var res = GetTagValue(tag, htmlTags);
            if (res != "")
            {
                return res;
            }
        }

        return "";
    }

    /// <summary>
    /// Tries each tag in order and returns the attributes of the first matching node.
    /// </summary>
    /// <exception cref="MissingHeaderValueException">No node matched any of the tags.</exception>
    private static HtmlAttributeCollection FindFirstAttribute(string[] tags, List<HtmlNode> htmlTags)
    {
        foreach (var tag in tags)
        {
            try
            {
                return GetValidAttribute(tag, htmlTags);
            }
            catch (MissingHeaderValueException)
            {
                // Nothing was found in the given tag but we will keep looking till we finish all the entries.
            }
        }

        throw new MissingHeaderValueException("Unable to find the requested value");
    }

    /// <summary>
    /// Returns the attributes of the first node whose first attribute value
    /// contains <paramref name="tag"/>.
    /// </summary>
    /// <exception cref="MissingHeaderValueException">No node matched the tag.</exception>
    private static HtmlAttributeCollection GetValidAttribute(string tag, List<HtmlNode> html)
    {
        foreach (var meta in html)
        {
            // A node without attributes cannot match and must not abort the search.
            if (meta.Attributes.Count == 0)
            {
                continue;
            }

            if (!meta.Attributes[0].Value.Contains(tag))
            {
                continue;
            }

            return meta.Attributes;
        }

        throw new MissingHeaderValueException("Site does not expose requested tag.");
    }
}
|