using HtmlAgilityPack;
using Newsbot.Collector.Domain.Exceptions;

namespace Newsbot.Collector.Services.HtmlParser;

/// <summary>
/// Reads the meta and link tags of an HTML document and exposes the values
/// found under well-known keys (twitter:*, og:*, theme-color, ...).
/// </summary>
public class HeadParserClient
{
    private const string XPathHeadMetaTag = "//head/meta";
    private const string XPathBodyMetaTag = "//body/meta";
    private const string XPathLinkTag = "//head/link";

    // Lookup keys, listed in priority order: the first key that yields a value wins.
    private static readonly string[] TitleKeys = { "twitter:title", "og:title", "title" };
    private static readonly string[] DescriptionKeys = { "twitter:description", "og:description", "description" };
    private static readonly string[] ImageKeys = { "twitter:image", "og:image", "image" };
    private static readonly string[] UrlKeys = { "twitter:url", "og:url", "url" };
    private static readonly string[] PageTypeKeys = { "og:type", "type" };
    private static readonly string[] ColorThemeKeys = { "theme-color" };
    private static readonly string[] YouTubeChannelKeys = { "og:url", "channelId" };
    private static readonly string[] FeedLinkKeys = { "alternate" };

    private readonly string _htmlContent;

    // The HTML never changes after construction, so it is parsed once on first use
    // instead of once per getter.
    private readonly Lazy<HtmlDocument> _document;

    /// <summary>
    /// Creates a parser over the given HTML. Nothing is parsed until a getter or
    /// <see cref="Parse"/> is called.
    /// </summary>
    /// <param name="htmlContent">The raw HTML of the page to inspect.</param>
    /// <param name="useBrowser">Not used by this class; kept so existing callers keep compiling.</param>
    public HeadParserClient(string htmlContent, bool useBrowser = false)
    {
        _htmlContent = htmlContent;
        _document = new Lazy<HtmlDocument>(LoadDocument);
        Data = new HeadParserModel();
    }

    /// <summary>The values collected by the most recent call to <see cref="Parse"/>.</summary>
    public HeadParserModel Data { get; set; }

    /// <summary>Runs every getter and stores the results in <see cref="Data"/>.</summary>
    public void Parse()
    {
        Data.Title = GetMetaTitle();
        Data.Description = GetMetaDescription();
        Data.Image = GetMetaImage();
        Data.Url = GetMetaUrl();
        Data.PageType = GetMetaPageType();
        Data.ColorTheme = GetMetaColorTheme();
        Data.FeedUri = GetSiteFeed();
        Data.YoutubeChannelID = GetYouTubeChannelId();
    }

    /// <summary>Returns the page title, or an empty string when none is found.</summary>
    public string GetMetaTitle()
    {
        return FindFirstResult(TitleKeys, CollectMetaTags());
    }

    /// <summary>Returns the page description, or an empty string when none is found.</summary>
    public string GetMetaDescription()
    {
        return FindFirstResult(DescriptionKeys, CollectMetaTags());
    }

    /// <summary>Returns the page image value, or an empty string when none is found.</summary>
    public string GetMetaImage()
    {
        return FindFirstResult(ImageKeys, CollectMetaTags());
    }

    /// <summary>Returns the page url value, or an empty string when none is found.</summary>
    public string GetMetaUrl()
    {
        return FindFirstResult(UrlKeys, CollectMetaTags());
    }

    /// <summary>Returns the page type value, or an empty string when none is found.</summary>
    public string GetMetaPageType()
    {
        return FindFirstResult(PageTypeKeys, CollectMetaTags());
    }

    /// <summary>Returns the theme-color value, or an empty string when none is found.</summary>
    public string GetMetaColorTheme()
    {
        return FindFirstResult(ColorThemeKeys, CollectMetaTags());
    }

    /// <summary>
    /// Returns the value found under "og:url" or "channelId" with the
    /// "https://www.youtube.com/channel/" prefix removed, or an empty string
    /// when neither key is present.
    /// </summary>
    public string GetYouTubeChannelId()
    {
        var results = FindFirstResult(YouTubeChannelKeys, CollectMetaTags());
        return results.Replace("https://www.youtube.com/channel/", "");
    }

    /// <summary>
    /// Looks through the head link tags for the first one whose leading attribute
    /// contains "alternate" and returns its href.
    /// A protocol-relative href ("//host/path") is returned with "https:" prepended.
    /// </summary>
    /// <returns>The href value, or an empty string when no matching link or href exists.</returns>
    public string GetSiteFeed()
    {
        var htmlTags = CollectLinkTags();

        HtmlAttributeCollection attributes;
        try
        {
            attributes = FindFirstAttribute(FeedLinkKeys, htmlTags);
        }
        catch (MissingHeaderValueException)
        {
            // Not found: callers get an empty string rather than an exception.
            return "";
        }

        foreach (var item in attributes)
        {
            if (item.Name != "href")
            {
                continue;
            }

            var uri = item.Value;
            if (uri.StartsWith("//", StringComparison.Ordinal))
            {
                // Only the leading "//" gets a scheme; any later "//" in the path is left alone.
                uri = "https:" + uri;
            }

            return uri;
        }

        return "";
    }

    /// <summary>Parses the HTML handed to the constructor.</summary>
    private HtmlDocument LoadDocument()
    {
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(_htmlContent);
        return htmlDoc;
    }

    /// <summary>
    /// Runs an XPath query against the document. SelectNodes yields null when nothing
    /// matches, so that case is mapped to an empty list.
    /// </summary>
    private List<HtmlNode> SelectNodesOrEmpty(string xpath)
    {
        var nodes = _document.Value.DocumentNode.SelectNodes(xpath);
        return nodes is null ? new List<HtmlNode>() : nodes.ToList();
    }

    /// <summary>Collects the meta tags from the head, followed by those from the body.</summary>
    private List<HtmlNode> CollectMetaTags()
    {
        var allTags = SelectNodesOrEmpty(XPathHeadMetaTag);

        // No meta tags in the body is fine; the body is checked thanks to YouTube.
        allTags.AddRange(SelectNodesOrEmpty(XPathBodyMetaTag));
        return allTags;
    }

    /// <summary>Collects the link tags from the head.</summary>
    private List<HtmlNode> CollectLinkTags()
    {
        return SelectNodesOrEmpty(XPathLinkTag);
    }

    /// <summary>
    /// Returns the second attribute's value of the first node whose first attribute's
    /// value contains <paramref name="tag"/>. The match is a substring match, so
    /// "title" also matches "og:title". Returns an empty string when nothing matches.
    /// </summary>
    private static string GetTagValue(string tag, List<HtmlNode> html)
    {
        foreach (var meta in html)
        {
            // Both the key attribute and the value attribute are needed.
            if (meta.Attributes.Count < 2)
            {
                continue;
            }

            if (!meta.Attributes[0].Value.Contains(tag))
            {
                continue;
            }

            return meta.Attributes[1].Value;
        }

        return "";
    }

    /// <summary>
    /// Tries each key in order and returns the first non-empty value,
    /// or an empty string when no key produces one.
    /// </summary>
    private static string FindFirstResult(string[] tags, List<HtmlNode> htmlTags)
    {
        foreach (var tag in tags)
        {
            var res = GetTagValue(tag, htmlTags);
            if (res != "")
            {
                return res;
            }
        }

        return "";
    }

    /// <summary>
    /// Tries each key in order and returns the attributes of the first matching node.
    /// </summary>
    /// <exception cref="MissingHeaderValueException">No key matched any node.</exception>
    private static HtmlAttributeCollection FindFirstAttribute(string[] tags, List<HtmlNode> htmlTags)
    {
        foreach (var tag in tags)
        {
            try
            {
                return GetValidAttribute(tag, htmlTags);
            }
            catch (MissingHeaderValueException)
            {
                // Nothing was found for this key; keep looking until all keys are tried.
            }
        }

        throw new MissingHeaderValueException("Unable to find the requested value");
    }

    /// <summary>
    /// Returns the attributes of the first node whose first attribute's value
    /// contains <paramref name="tag"/>. Nodes without attributes are skipped.
    /// </summary>
    /// <exception cref="MissingHeaderValueException">No node matched.</exception>
    private static HtmlAttributeCollection GetValidAttribute(string tag, List<HtmlNode> html)
    {
        foreach (var meta in html)
        {
            if (meta.Attributes.Count == 0)
            {
                continue;
            }

            if (meta.Attributes[0].Value.Contains(tag))
            {
                return meta.Attributes;
            }
        }

        throw new MissingHeaderValueException("Site does not expose requested tag.");
    }
}