Newsbot.Collector/Newsbot.Collector.Services/HtmlParser/HeadParserClient.cs
James Tombleson 521940ca4f
Features/more rss improvements (#6)
* exposing connectionStrings to controllers

* First controller added to start testing

* corrected param to be page not age

* new model to map connection strings to for the controllers

* HelloWorldJob uses options now to make hangfire happy

* improved the html reader to find some rss feeds and start to extract the body of the content

* moved html parser to its own namespace and made a sub client to process the header

* helpful vsc changes

* updated rss watcher to include the sourceId so it can be added to the db call

* updated tests to reflect changes

* updated gitignore to avoid trash and moved over my makefile

* More routes and added serilog

* adding more database calls for the controllers

* Updated interfaces for the tables

* Added Serilog to jobs

* removed default files

* Added more routes and added DTO

* Added DTO objects and SourceType Consts for easy usage

* updated discord model name to follow the pattern

* updated formatting

* new dto objects and Subscriptions repo interface

* added subscription db and api calls

* focusing on the twitter tags as most sites focus on them

* updated test to pull a html based feed
2023-02-26 09:40:04 -08:00

186 lines
5.1 KiB
C#

using HtmlAgilityPack;
using Newsbot.Collector.Domain.Exceptions;
namespace Newsbot.Collector.Services.HtmlParser;
/// <summary>
/// Reads the <c>&lt;head&gt;</c> section of an HTML document and extracts summary metadata
/// (title, description, image, url, page type, theme color) from its <c>meta</c> tags, plus the
/// feed link advertised by its <c>link</c> tags.
/// </summary>
/// <remarks>
/// Tags are matched positionally: the value of a tag's first attribute is compared against the
/// requested key and, for <c>meta</c> tags, the value of its second attribute is returned.
/// Every lookup returns an empty string when nothing matches.
/// </remarks>
public class HeadParserClient
{
    private const string XPathMetaTag = "//head/meta";
    private const string XPathLinkTag = "//head/link";

    // Lookup keys, listed in priority order (the first key that yields a non-empty value wins).
    private static readonly string[] TitleTags = { "twitter:title", "og:title", "title" };
    private static readonly string[] DescriptionTags = { "twitter:description", "og:description", "description" };
    private static readonly string[] ImageTags = { "twitter:image", "og:image", "image" };
    private static readonly string[] UrlTags = { "twitter:url", "og:url", "url" };
    private static readonly string[] PageTypeTags = { "og:type", "type" };
    private static readonly string[] ColorThemeTags = { "theme-color" };
    private static readonly string[] FeedLinkTags = { "alternate" };

    private readonly string _htmlContent;

    // The markup never changes after construction, so it is parsed once on first use
    // instead of once per lookup.
    private readonly Lazy<HtmlDocument> _document;

    /// <summary>The values collected by the most recent call to <see cref="Parse"/>.</summary>
    public HeadParserModel Data { get; set; }

    /// <summary>Creates a parser for the given HTML markup. Parsing is deferred until first use.</summary>
    /// <param name="htmlContent">The raw HTML of the page to inspect.</param>
    public HeadParserClient(string htmlContent)
    {
        _htmlContent = htmlContent;
        _document = new Lazy<HtmlDocument>(LoadDocument);
        Data = new HeadParserModel();
    }

    /// <summary>Runs every lookup and stores the results in <see cref="Data"/>.</summary>
    public void Parse()
    {
        Data.Title = GetMetaTitle();
        Data.Description = GetMetaDescription();
        Data.Image = GetMetaImage();
        Data.Url = GetMetaUrl();
        Data.PageType = GetMetaPageType();
        Data.ColorTheme = GetMetaColorTheme();
        Data.FeedUri = GetSiteFeed();
    }

    /// <summary>Returns the page title from the first matching meta tag, or an empty string.</summary>
    public string GetMetaTitle() => FindMetaValue(TitleTags);

    /// <summary>Returns the page description from the first matching meta tag, or an empty string.</summary>
    public string GetMetaDescription() => FindMetaValue(DescriptionTags);

    /// <summary>Returns the page image from the first matching meta tag, or an empty string.</summary>
    public string GetMetaImage() => FindMetaValue(ImageTags);

    /// <summary>Returns the page url from the first matching meta tag, or an empty string.</summary>
    public string GetMetaUrl() => FindMetaValue(UrlTags);

    /// <summary>Returns the page type from the first matching meta tag, or an empty string.</summary>
    public string GetMetaPageType() => FindMetaValue(PageTypeTags);

    /// <summary>Returns the theme color from the first matching meta tag, or an empty string.</summary>
    public string GetMetaColorTheme() => FindMetaValue(ColorThemeTags);

    /// <summary>
    /// Looks through the head's <c>link</c> tags for the first one whose leading attribute value
    /// contains a known feed key (currently <c>alternate</c>) and returns its <c>href</c>.
    /// </summary>
    /// <returns>
    /// The feed address, with a leading <c>//</c> (protocol-relative) rewritten to <c>https://</c>.
    /// An empty string is returned when no matching tag exists or the matching tag has no
    /// <c>href</c>; this method does not throw when the feed is missing.
    /// </returns>
    public string GetSiteFeed()
    {
        var links = CollectNodes(XPathLinkTag);
        foreach (var tag in FeedLinkTags)
        {
            foreach (var link in links)
            {
                if (!FirstAttributeContains(link, tag))
                {
                    continue;
                }

                // Only the first matching tag is inspected, even if it carries no href.
                foreach (var attribute in link.Attributes)
                {
                    if (attribute.Name == "href")
                    {
                        return NormalizeFeedUri(attribute.Value);
                    }
                }

                return "";
            }
        }

        return "";
    }

    /// <summary>Parses the stored markup into a document.</summary>
    private HtmlDocument LoadDocument()
    {
        var document = new HtmlDocument();
        document.LoadHtml(_htmlContent);
        return document;
    }

    /// <summary>Returns the nodes matching <paramref name="xpath"/>, or an empty list when there are none.</summary>
    private List<HtmlNode> CollectNodes(string xpath)
    {
        // SelectNodes yields null (not an empty collection) when nothing matches.
        var nodes = _document.Value.DocumentNode.SelectNodes(xpath);
        return nodes is null ? new List<HtmlNode>() : nodes.ToList();
    }

    /// <summary>Resolves the first non-empty value for the given keys among the head's meta tags.</summary>
    private string FindMetaValue(string[] tags) => FindFirstResult(tags, CollectNodes(XPathMetaTag));

    /// <summary>Tries each key in order and returns the first non-empty value found, or an empty string.</summary>
    private static string FindFirstResult(string[] tags, List<HtmlNode> htmlTags)
    {
        foreach (var tag in tags)
        {
            var value = GetTagValue(tag, htmlTags);
            if (value.Length > 0)
            {
                return value;
            }
        }

        return "";
    }

    /// <summary>
    /// Returns the second attribute's value of the first node whose first attribute value
    /// contains <paramref name="tag"/>, or an empty string when no node qualifies.
    /// </summary>
    private static string GetTagValue(string tag, List<HtmlNode> html)
    {
        foreach (var node in html)
        {
            // Tags such as <meta charset="utf-8"> carry a single attribute and have no value slot to read.
            if (node.Attributes.Count < 2 || !FirstAttributeContains(node, tag))
            {
                continue;
            }

            return node.Attributes[1].Value;
        }

        return "";
    }

    /// <summary>Checks whether the node's first attribute value contains <paramref name="tag"/>.</summary>
    private static bool FirstAttributeContains(HtmlNode node, string tag)
    {
        return node.Attributes.Count > 0
            && node.Attributes[0].Value.Contains(tag, StringComparison.Ordinal);
    }

    /// <summary>Turns a protocol-relative address (<c>//host/path</c>) into an https address.</summary>
    private static string NormalizeFeedUri(string uri)
    {
        // Only the leading "//" is rewritten; any later "//" in the address is left untouched.
        return uri.StartsWith("//", StringComparison.Ordinal) ? "https:" + uri : uri;
    }
}