using System.Data;
using System.Runtime.Serialization;
using System.Xml;
using HtmlAgilityPack;

namespace Newsbot.Collector.Services;

/// <summary>
/// Root container for the data extracted from an HTML page.
/// </summary>
public class HtmlData
{
    /// <summary>Data collected from the page header.</summary>
    public HtmlHeaderData Header { get; set; } = new HtmlHeaderData();
}

/// <summary>
/// Data collected from the header of an HTML page.
/// </summary>
public class HtmlHeaderData
{
    /// <summary>Values read from the page's meta tags.</summary>
    public HtmlMetaData Meta { get; set; } = new HtmlMetaData();
}

/// <summary>
/// Values read from a page's meta tags. Each value defaults to an empty string.
/// </summary>
public class HtmlMetaData
{
    public string Title { get; set; } = "";
    public string Description { get; set; } = "";
    public string Image { get; set; } = "";
    public string Url { get; set; } = "";
    public string PageType { get; set; } = "";
    //public string Color { get; set; }
}

/// <summary>
/// Downloads a page and exposes the values of the meta tags found under its head element.
/// </summary>
public class HtmlPageReader
{
    /// <summary>Meta values collected from the page when the reader was constructed.</summary>
    public HtmlData Data { get; set; }

    private const string XPathMetaTag = "//head/meta";

    // Attribute that carries the value of a meta tag.
    private const string ContentAttributeName = "content";

    // Attributes that can carry the identifying key of a meta tag.
    private static readonly string[] KeyAttributeNames = { "property", "name", "itemprop" };

    // One client shared by all readers so sockets are reused instead of being
    // opened and torn down for every page.
    private static readonly HttpClient SharedClient = new HttpClient();

    private readonly string _siteContent;

    // Parsed once in the constructor; every lookup reuses these nodes.
    private readonly List<HtmlNode> _metaTags;

    /// <summary>
    /// Downloads <paramref name="pageUrl"/>, parses its meta tags and fills <see cref="Data"/>.
    /// </summary>
    /// <param name="pageUrl">Address of the page to read.</param>
    /// <exception cref="AggregateException">
    /// The download failed; the underlying error is available as the inner exception.
    /// </exception>
    public HtmlPageReader(string pageUrl)
    {
        _siteContent = ReadSiteContent(pageUrl);
        _metaTags = CollectMetaTags();

        Data = new HtmlData();
        Data.Header.Meta.Title = GetMetaTitle();
        Data.Header.Meta.Description = GetDescription();
        Data.Header.Meta.Image = GetImage();
        Data.Header.Meta.Url = GetUrl();
        Data.Header.Meta.PageType = GetPageType();
    }

    /// <summary>
    /// Downloads the body of <paramref name="url"/> as a string, blocking until it completes.
    /// </summary>
    private static string ReadSiteContent(string url)
    {
        // Blocking is deliberate: this runs from the constructor, which cannot await.
        // Wait() keeps failures wrapped in AggregateException, as callers may expect.
        var request = SharedClient.GetStringAsync(url);
        request.Wait();
        return request.Result;
    }

    /// <summary>
    /// Parses the downloaded content and returns every meta node under the head element.
    /// </summary>
    /// <returns>The matching nodes, or an empty list when the page has none.</returns>
    private List<HtmlNode> CollectMetaTags()
    {
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(_siteContent);

        // SelectNodes returns null (not an empty collection) when nothing matches.
        var nodes = htmlDoc.DocumentNode.SelectNodes(XPathMetaTag);
        return nodes?.ToList() ?? new List<HtmlNode>();
    }

    /// <summary>
    /// Returns the content of the first meta tag whose key contains <paramref name="Tag"/>.
    /// </summary>
    /// <param name="Tag">
    /// Text to look for in the tag's property, name or itemprop attribute,
    /// for example "og:title". Matching is a case-sensitive substring match.
    /// </param>
    /// <returns>
    /// The tag's content attribute value, or an empty string when no tag matches
    /// or <paramref name="Tag"/> is null or empty.
    /// </returns>
    public string GetTagValue(string Tag)
    {
        if (string.IsNullOrEmpty(Tag))
        {
            return "";
        }

        foreach (var meta in _metaTags)
        {
            if (!HasMatchingKey(meta, Tag))
            {
                continue;
            }

            // Look the value up by attribute name: attribute order is not
            // guaranteed and a meta tag is not required to carry a content attribute.
            var content = meta.Attributes[ContentAttributeName];
            if (content is null)
            {
                continue;
            }

            return content.Value ?? "";
        }

        return "";
    }

    /// <summary>
    /// Tells whether any key attribute of <paramref name="meta"/> contains <paramref name="tag"/>.
    /// </summary>
    private static bool HasMatchingKey(HtmlNode meta, string tag)
    {
        foreach (var attributeName in KeyAttributeNames)
        {
            var key = meta.GetAttributeValue(attributeName, "");
            if (key.Contains(tag, StringComparison.Ordinal))
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// Returns the first non-empty value found for <paramref name="tags"/>, tried in order.
    /// </summary>
    private string FindFirstResult(string[] tags)
    {
        foreach (var tag in tags)
        {
            var res = GetTagValue(tag);
            if (res != "")
            {
                return res;
            }
        }

        return "";
    }

    /// <summary>Returns the page title from the meta tags, or an empty string.</summary>
    public string GetMetaTitle()
    {
        return FindFirstResult(new[] { "og:title", "twitter:title", "title" });
    }

    /// <summary>Returns the page description from the meta tags, or an empty string.</summary>
    public string GetDescription()
    {
        return FindFirstResult(new[] { "description", "og:description" });
    }

    /// <summary>Returns the page image from the meta tags, or an empty string.</summary>
    public string GetImage()
    {
        return FindFirstResult(new[] { "image", "og:image", "twitter:image" });
    }

    /// <summary>Returns the page URL from the meta tags, or an empty string.</summary>
    public string GetUrl()
    {
        return FindFirstResult(new[] { "url", "og:url", "twitter:url" });
    }

    /// <summary>Returns the page type from the meta tags, or an empty string.</summary>
    public string GetPageType()
    {
        return FindFirstResult(new[] { "og:type", "type" });
    }
}