Newsbot.Collector/Newsbot.Collector.Services/HtmlMeta.cs

129 lines
3.1 KiB
C#
Raw Normal View History

using System.Data;
using System.Runtime.Serialization;
using System.Xml;
using HtmlAgilityPack;
namespace Newsbot.Collector.Services;
/// <summary>
/// Root container for the data read from an HTML page.
/// </summary>
public class HtmlData
{
    /// <summary>
    /// Header-level data of the page. Starts out as an empty <see cref="HtmlHeaderData"/>.
    /// </summary>
    public HtmlHeaderData Header { get; set; } = new();
}
/// <summary>
/// Data taken from the header section of an HTML page.
/// </summary>
public class HtmlHeaderData
{
    /// <summary>
    /// Values read from the page's meta tags. Starts out as an empty <see cref="HtmlMetaData"/>.
    /// </summary>
    public HtmlMetaData Meta { get; set; } = new();
}
/// <summary>
/// Plain holder for the meta values of a page. Every value defaults to an empty string.
/// </summary>
public class HtmlMetaData
{
    /// <summary>Title of the page.</summary>
    public string Title { get; set; } = string.Empty;

    /// <summary>Description of the page.</summary>
    public string Description { get; set; } = string.Empty;

    /// <summary>Image associated with the page.</summary>
    public string Image { get; set; } = string.Empty;

    /// <summary>Url associated with the page.</summary>
    public string Url { get; set; } = string.Empty;

    /// <summary>Type of the page.</summary>
    public string PageType { get; set; } = string.Empty;

    // Disabled property, kept for reference:
    // public string Color { get; set; }
}
/// <summary>
/// Downloads a page and reads the <c>meta</c> tags found under its <c>head</c> element.
/// The well-known values (title, description, image, url, type) are resolved once in the
/// constructor and exposed through <see cref="Data"/>; individual tags can also be queried
/// afterwards with <see cref="GetTagValue"/>.
/// </summary>
public class HtmlPageReader
{
    /// <summary>Meta values resolved from the page when the reader was constructed.</summary>
    public HtmlData Data { get; set; }

    private const string XPathMetaTag = "//head/meta";

    // Name of the attribute that carries the value of a meta tag.
    private const string ContentAttributeName = "content";

    // Attributes that can carry the key of a meta tag, in lookup order
    // (e.g. property="og:title", name="description").
    private static readonly string[] KeyAttributeNames = { "property", "name", "itemprop", "http-equiv" };

    // One shared client for all readers: creating and disposing an HttpClient per page
    // can exhaust sockets under load.
    private static readonly HttpClient Client = new HttpClient();

    private readonly string _siteContent;

    // Meta nodes parsed once from _siteContent so lookups do not re-parse the document.
    private readonly List<HtmlNode> _metaTags;

    /// <summary>
    /// Downloads <paramref name="pageUrl"/>, parses its meta tags and fills <see cref="Data"/>.
    /// </summary>
    /// <param name="pageUrl">Address of the page to download.</param>
    /// <exception cref="AggregateException">The download failed; the inner exception holds the cause.</exception>
    public HtmlPageReader(string pageUrl)
    {
        _siteContent = ReadSiteContent(pageUrl);
        _metaTags = CollectMetaTags();

        Data = new HtmlData();
        Data.Header.Meta.Title = GetMetaTitle();
        Data.Header.Meta.Description = GetDescription();
        Data.Header.Meta.Image = GetImage();
        Data.Header.Meta.Url = GetUrl();
        Data.Header.Meta.PageType = GetPageType();
    }

    /// <summary>Downloads the body of <paramref name="url"/> as a string.</summary>
    private static string ReadSiteContent(string url)
    {
        // Called from the constructor, which cannot await, so the task is blocked on.
        // Failures surface wrapped in an AggregateException.
        return Client.GetStringAsync(url).Result;
    }

    /// <summary>
    /// Parses the downloaded content and returns every node matching <c>//head/meta</c>,
    /// or an empty list when the page has none.
    /// </summary>
    private List<HtmlNode> CollectMetaTags()
    {
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(_siteContent);

        // SelectNodes returns null, not an empty collection, when nothing matches.
        var nodes = htmlDoc.DocumentNode.SelectNodes(XPathMetaTag);
        return nodes is null ? new List<HtmlNode>() : nodes.ToList();
    }

    /// <summary>
    /// Returns the first non-empty value among the key attributes of <paramref name="meta"/>,
    /// or an empty string when the tag has none of them.
    /// </summary>
    private static string GetMetaKey(HtmlNode meta)
    {
        foreach (var attributeName in KeyAttributeNames)
        {
            var key = meta.GetAttributeValue(attributeName, "");
            if (key != "")
            {
                return key;
            }
        }

        return "";
    }

    /// <summary>
    /// Returns the <c>content</c> of the first meta tag whose key contains <paramref name="Tag"/>.
    /// </summary>
    /// <param name="Tag">Text to look for inside the tag key (case-sensitive substring match).</param>
    /// <returns>The content value of the first matching tag, or an empty string when no tag matches.</returns>
    public string GetTagValue(string Tag)
    {
        foreach (var meta in _metaTags)
        {
            // Attributes are read by name rather than by position, so attribute order
            // and tags with fewer than two attributes no longer matter.
            var key = GetMetaKey(meta);
            if (key == "" || !key.Contains(Tag))
            {
                continue;
            }

            var content = meta.Attributes[ContentAttributeName];
            if (content is null)
            {
                // A key without a content attribute has nothing to return; keep looking.
                continue;
            }

            return content.Value;
        }

        return "";
    }

    /// <summary>
    /// Tries each entry of <paramref name="tags"/> in order and returns the first non-empty
    /// value found, or an empty string when none of them yields one.
    /// </summary>
    private string FindFirstResult(string[] tags)
    {
        foreach (var tag in tags)
        {
            var result = GetTagValue(tag);
            if (result != "")
            {
                return result;
            }
        }

        return "";
    }

    /// <summary>Looks up the title using "og:title", "twitter:title", then "title".</summary>
    public string GetMetaTitle()
    {
        return FindFirstResult(new[] { "og:title", "twitter:title", "title" });
    }

    /// <summary>Looks up the description using "description", then "og:description".</summary>
    public string GetDescription()
    {
        return FindFirstResult(new[] { "description", "og:description" });
    }

    /// <summary>Looks up the image using "image", "og:image", then "twitter:image".</summary>
    public string GetImage()
    {
        return FindFirstResult(new[] { "image", "og:image", "twitter:image" });
    }

    /// <summary>Looks up the url using "url", "og:url", then "twitter:url".</summary>
    public string GetUrl()
    {
        return FindFirstResult(new[] { "url", "og:url", "twitter:url" });
    }

    /// <summary>Looks up the page type using "og:type", then "type".</summary>
    public string GetPageType()
    {
        return FindFirstResult(new[] { "og:type", "type" });
    }
}