Newsbot.Collector/Newsbot.Collector.Services/HtmlMeta.cs
James Tombleson 9f3a6323a6
Features/html meta extractor (#4)
* gave api access to the db project

* added db models

* working on rss extraction and meta extraction

* test project to debug rsswatcherjob

* added new configs for the project

* new interface to define collectors

* basic rss extraction and article details are now exposed

* tests updated for rss pull

* starting to get dapper working.  Query works but insert seems to have a value issue

* removed dapper from services

* added some basic tests for db calls
2023-02-16 22:19:05 -08:00

129 lines
3.1 KiB
C#

using System.Data;
using System.Runtime.Serialization;
using System.Xml;
using HtmlAgilityPack;
namespace Newsbot.Collector.Services;
/// <summary>
/// Root container for the data read from an HTML page.
/// </summary>
public class HtmlData
{
    /// <summary>
    /// Header section of the page. Initialised to an empty instance so it is never null by default.
    /// </summary>
    public HtmlHeaderData Header { get; set; } = new();
}
/// <summary>
/// Data that belongs to the header of an HTML page.
/// </summary>
public class HtmlHeaderData
{
    /// <summary>
    /// Values collected from the page's meta tags. Initialised to an empty instance by default.
    /// </summary>
    public HtmlMetaData Meta { get; set; } = new();
}
/// <summary>
/// Flat set of string values describing a page. Every value defaults to an empty string.
/// </summary>
public class HtmlMetaData
{
    /// <summary>Title of the page.</summary>
    public string Title { get; set; } = string.Empty;

    /// <summary>Description of the page.</summary>
    public string Description { get; set; } = string.Empty;

    /// <summary>Image associated with the page.</summary>
    public string Image { get; set; } = string.Empty;

    /// <summary>Url reported for the page.</summary>
    public string Url { get; set; } = string.Empty;

    /// <summary>Type reported for the page.</summary>
    public string PageType { get; set; } = string.Empty;

    //public string Color { get; set; }
}
/// <summary>
/// Downloads an HTML page and extracts descriptive values (title, description,
/// image, url and page type) from the <c>meta</c> tags inside its <c>head</c>.
/// </summary>
/// <remarks>
/// The page is downloaded synchronously inside the constructor, so creating an
/// instance blocks the calling thread until the HTTP request has completed.
/// </remarks>
public class HtmlPageReader
{
    /// <summary>
    /// The values extracted from the page when the reader was constructed.
    /// </summary>
    public HtmlData Data { get; set; }

    private const string XPathMetaTag = "//head/meta";

    // Name of the attribute that carries the value of a meta tag.
    private const string ContentAttribute = "content";

    // Attributes that may carry the key of a meta tag,
    // e.g. property="og:title" or name="description".
    private static readonly string[] KeyAttributes = { "property", "name", "itemprop" };

    private readonly string _siteContent;

    // Meta nodes are parsed once and reused by every lookup instead of
    // re-parsing the whole document for each requested tag.
    private readonly List<HtmlNode> _metaTags;

    /// <summary>
    /// Downloads <paramref name="pageUrl"/>, parses its meta tags and fills <see cref="Data"/>.
    /// </summary>
    /// <param name="pageUrl">Address of the page to download.</param>
    public HtmlPageReader(string pageUrl)
    {
        _siteContent = ReadSiteContent(pageUrl);
        _metaTags = CollectMetaTags();

        Data = new HtmlData();
        Data.Header.Meta.Title = GetMetaTitle();
        Data.Header.Meta.Description = GetDescription();
        Data.Header.Meta.Image = GetImage();
        Data.Header.Meta.Url = GetUrl();
        Data.Header.Meta.PageType = GetPageType();
    }

    /// <summary>
    /// Downloads the body of <paramref name="url"/> as a string, blocking until it is available.
    /// </summary>
    private string ReadSiteContent(string url)
    {
        // NOTE(review): a client per call and a blocking wait are kept on purpose so the
        // constructor signature and the exception type seen by callers (AggregateException
        // from the blocked task) stay unchanged. Consider an async factory with a shared client.
        using var client = new HttpClient();
        return client.GetStringAsync(url).Result;
    }

    /// <summary>
    /// Parses the downloaded content and returns every node matching <c>//head/meta</c>.
    /// </summary>
    /// <returns>The matching nodes, or an empty list when the page has none.</returns>
    private List<HtmlNode> CollectMetaTags()
    {
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(_siteContent);

        // SelectNodes returns null (not an empty collection) when nothing matches.
        var nodes = htmlDoc.DocumentNode.SelectNodes(XPathMetaTag);
        return nodes is null ? new List<HtmlNode>() : nodes.ToList();
    }

    /// <summary>
    /// Returns the <c>content</c> value of the first meta tag whose key contains <paramref name="Tag"/>.
    /// </summary>
    /// <param name="Tag">Text to look for in the key of a meta tag, e.g. <c>og:title</c>.</param>
    /// <returns>The content of the first matching tag that has one, or an empty string.</returns>
    public string GetTagValue(string Tag)
    {
        // An empty search text would match every tag, so treat it as "nothing requested".
        if (string.IsNullOrEmpty(Tag))
        {
            return "";
        }

        foreach (var meta in _metaTags)
        {
            if (!HasMatchingKey(meta, Tag))
            {
                continue;
            }

            // Read the value by attribute name: the order of attributes inside a tag is not
            // fixed, and tags such as <meta charset="..."> have only a single attribute.
            var content = meta.Attributes[ContentAttribute]?.Value;
            if (!string.IsNullOrEmpty(content))
            {
                return content;
            }
        }

        return "";
    }

    /// <summary>
    /// Checks whether any key attribute of <paramref name="meta"/> contains <paramref name="tag"/>.
    /// </summary>
    private static bool HasMatchingKey(HtmlNode meta, string tag)
    {
        foreach (var attributeName in KeyAttributes)
        {
            var key = meta.Attributes[attributeName]?.Value;

            // Substring match is kept so existing lookups such as "description"
            // still find keys like "twitter:description".
            if (key is not null && key.Contains(tag, StringComparison.Ordinal))
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// Tries each entry of <paramref name="tags"/> in order and returns the first non-empty value.
    /// </summary>
    private string FindFirstResult(string[] tags)
    {
        foreach (var tag in tags)
        {
            var value = GetTagValue(tag);
            if (value != "")
            {
                return value;
            }
        }

        return "";
    }

    /// <summary>Returns the page title from the meta tags, or an empty string.</summary>
    public string GetMetaTitle()
    {
        return FindFirstResult(new[] { "og:title", "twitter:title", "title" });
    }

    /// <summary>Returns the page description from the meta tags, or an empty string.</summary>
    public string GetDescription()
    {
        return FindFirstResult(new[] { "description", "og:description" });
    }

    /// <summary>Returns the page image from the meta tags, or an empty string.</summary>
    public string GetImage()
    {
        return FindFirstResult(new[] { "image", "og:image", "twitter:image" });
    }

    /// <summary>Returns the page url from the meta tags, or an empty string.</summary>
    public string GetUrl()
    {
        return FindFirstResult(new[] { "url", "og:url", "twitter:url" });
    }

    /// <summary>Returns the page type from the meta tags, or an empty string.</summary>
    public string GetPageType()
    {
        return FindFirstResult(new[] { "og:type", "type" });
    }
}