James Tombleson
9f3a6323a6
* gave api access to the db project * added db models * working on rss extraction and meta extraction * test project to debug rsswatcherjob * added new configs for the project * new interface to define collectors * basic rss extraction and article details are now exposed * tests updated for rss pull * starting to get dapper working. Query works but insert seems to have a value issue * removed dapper from services * added some basic tests for db calls
129 lines
3.1 KiB
C#
129 lines
3.1 KiB
C#
using System.Data;
|
|
using System.Runtime.Serialization;
|
|
using System.Xml;
|
|
using HtmlAgilityPack;
|
|
|
|
namespace Newsbot.Collector.Services;
|
|
|
|
/// <summary>
/// Top-level container for the data read from an HTML page.
/// </summary>
public class HtmlData
{
    /// <summary>
    /// Header portion of the page data. Initialised to an empty instance so
    /// nested members can be assigned without a null check.
    /// </summary>
    public HtmlHeaderData Header { get; set; } = new();
}
|
|
|
|
/// <summary>
/// Holds the header-level information of a page; currently only its meta values.
/// </summary>
public class HtmlHeaderData
{
    /// <summary>
    /// Meta tag values. Initialised to an empty instance whose fields are all
    /// empty strings.
    /// </summary>
    public HtmlMetaData Meta { get; set; } = new();
}
|
|
|
|
/// <summary>
/// Plain data holder for the meta values of a page. Every value defaults to
/// an empty string rather than null.
/// </summary>
public class HtmlMetaData
{
    /// <summary>Page title.</summary>
    public string Title { get; set; } = string.Empty;

    /// <summary>Page description.</summary>
    public string Description { get; set; } = string.Empty;

    /// <summary>Image value advertised by the page.</summary>
    public string Image { get; set; } = string.Empty;

    /// <summary>Url value advertised by the page.</summary>
    public string Url { get; set; } = string.Empty;

    /// <summary>Page type value.</summary>
    public string PageType { get; set; } = string.Empty;

    // Not enabled yet:
    //public string Color { get; set; }
}
|
|
|
|
/// <summary>
/// Downloads a web page and reads the meta tags found under its
/// <c>&lt;head&gt;</c> element. The page is fetched and parsed once, in the
/// constructor; the lookup methods afterwards work on that parsed snapshot.
/// </summary>
public class HtmlPageReader
{
    /// <summary>
    /// Title, description, image, url and page type resolved while the reader
    /// was constructed. A value that could not be found is an empty string.
    /// </summary>
    public HtmlData Data { get; set; }

    private const string XPathMetaTag = "//head/meta";

    // Attribute that carries a meta tag's value.
    private const string ContentAttributeName = "content";

    // Attributes that can carry a meta tag's key, in lookup order. "http-equiv"
    // is deliberately absent: those tags are pragma directives, and matching
    // them made a "type" lookup return the content-type header value.
    private static readonly string[] KeyAttributeNames = { "property", "name", "itemprop" };

    // One client for every reader: creating an HttpClient per page can exhaust
    // sockets under load. The pooled-connection lifetime keeps DNS changes
    // from being ignored forever by the long-lived client.
    private static readonly HttpClient SharedClient = new(new SocketsHttpHandler
    {
        PooledConnectionLifetime = TimeSpan.FromMinutes(5),
    });

    private readonly string _siteContent;

    // Meta nodes of the downloaded page, parsed once so that each lookup does
    // not re-parse the whole document.
    private readonly List<HtmlNode> _metaTags;

    /// <summary>
    /// Downloads <paramref name="pageUrl"/>, parses its meta tags and fills
    /// <see cref="Data"/>.
    /// </summary>
    /// <param name="pageUrl">Address of the page to read.</param>
    /// <exception cref="AggregateException">
    /// The download failed; the inner exception describes the cause.
    /// </exception>
    public HtmlPageReader(string pageUrl)
    {
        _siteContent = ReadSiteContent(pageUrl);
        _metaTags = CollectMetaTags();

        Data = new HtmlData();
        Data.Header.Meta.Title = GetMetaTitle();
        Data.Header.Meta.Description = GetDescription();
        Data.Header.Meta.Image = GetImage();
        Data.Header.Meta.Url = GetUrl();
        Data.Header.Meta.PageType = GetPageType();
    }

    /// <summary>Downloads the body of <paramref name="url"/> as a string.</summary>
    private static string ReadSiteContent(string url)
    {
        // The caller is a constructor, which cannot be async, so the download
        // is blocked on. Task.Result (not GetAwaiter().GetResult()) keeps
        // failures wrapped in AggregateException, as existing callers expect.
        return SharedClient.GetStringAsync(url).Result;
    }

    /// <summary>
    /// Parses the downloaded content and returns every node matching
    /// <c>//head/meta</c>, or an empty list when there is none.
    /// </summary>
    private List<HtmlNode> CollectMetaTags()
    {
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(_siteContent);

        // SelectNodes returns null, not an empty collection, when nothing matches.
        var nodes = htmlDoc.DocumentNode.SelectNodes(XPathMetaTag);
        return nodes?.ToList() ?? new List<HtmlNode>();
    }

    /// <summary>
    /// Returns the key of a meta node: the first non-empty value among its
    /// <c>property</c>, <c>name</c> and <c>itemprop</c> attributes, or an empty
    /// string when it has none of them.
    /// </summary>
    private static string GetMetaKey(HtmlNode meta)
    {
        foreach (var attributeName in KeyAttributeNames)
        {
            var key = meta.GetAttributeValue(attributeName, "");
            if (key.Length > 0)
            {
                return key;
            }
        }
        return "";
    }

    /// <summary>
    /// Looks up the <c>content</c> value of the meta tag identified by
    /// <paramref name="Tag"/>.
    /// </summary>
    /// <remarks>
    /// A tag whose key equals <paramref name="Tag"/> (ignoring case) wins.
    /// When no key matches exactly, the first tag whose key merely contains
    /// <paramref name="Tag"/> is used, so short names such as "title" still
    /// resolve "og:title". Attributes are read by name, so their order inside
    /// the tag does not matter.
    /// </remarks>
    /// <param name="Tag">Meta key to search for, e.g. "og:title".</param>
    /// <returns>The tag's content, or an empty string when nothing matches.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="Tag"/> is null.</exception>
    public string GetTagValue(string Tag)
    {
        ArgumentNullException.ThrowIfNull(Tag);

        var substringMatch = "";
        var hasSubstringMatch = false;

        foreach (var meta in _metaTags)
        {
            var key = GetMetaKey(meta);
            if (key.Length == 0)
            {
                // e.g. <meta charset="utf-8">: nothing to match against.
                continue;
            }

            if (key.Equals(Tag, StringComparison.OrdinalIgnoreCase))
            {
                return meta.GetAttributeValue(ContentAttributeName, "");
            }

            // Remember only the first partial match; an exact match further
            // down the document still takes precedence over it.
            if (!hasSubstringMatch && key.Contains(Tag, StringComparison.Ordinal))
            {
                substringMatch = meta.GetAttributeValue(ContentAttributeName, "");
                hasSubstringMatch = true;
            }
        }

        return substringMatch;
    }

    /// <summary>
    /// Tries each key in order and returns the first non-empty value, or an
    /// empty string when none of them yields one.
    /// </summary>
    private string FindFirstResult(string[] tags)
    {
        foreach (var tag in tags)
        {
            var value = GetTagValue(tag);
            if (value.Length > 0)
            {
                return value;
            }
        }
        return "";
    }

    /// <summary>Returns the page title from "og:title", "twitter:title" or "title".</summary>
    public string GetMetaTitle()
    {
        return FindFirstResult(new[] { "og:title", "twitter:title", "title" });
    }

    /// <summary>Returns the page description from "description" or "og:description".</summary>
    public string GetDescription()
    {
        return FindFirstResult(new[] { "description", "og:description" });
    }

    /// <summary>Returns the page image from "image", "og:image" or "twitter:image".</summary>
    public string GetImage()
    {
        return FindFirstResult(new[] { "image", "og:image", "twitter:image" });
    }

    /// <summary>Returns the page url from "url", "og:url" or "twitter:url".</summary>
    public string GetUrl()
    {
        return FindFirstResult(new[] { "url", "og:url", "twitter:url" });
    }

    /// <summary>Returns the page type from "og:type" or "type".</summary>
    public string GetPageType()
    {
        return FindFirstResult(new[] { "og:type", "type" });
    }
}