// Newsbot.Collector/Newsbot.Collector.Services/HtmlParser/HeadParserClient.cs

using HtmlAgilityPack;
using Newsbot.Collector.Domain.Exceptions;

namespace Newsbot.Collector.Services.HtmlParser;

/// <summary>
/// Extracts page metadata (title, description, image, feed link, ...) from the
/// <c>&lt;head&gt;</c> section of an HTML document.
/// </summary>
/// <remarks>
/// Every getter returns an empty string when the requested value is not present;
/// none of them throw for missing tags.
/// </remarks>
public class HeadParserClient
{
    private const string XPathMetaTag = "//head/meta";
    private const string XPathLinkTag = "//head/link";

    private const string ContentAttribute = "content";
    private const string RelAttribute = "rel";
    private const string HrefAttribute = "href";
    private const string TypeAttribute = "type";

    // Attributes that can carry the "key" of a <meta> tag, in lookup order.
    // Looking them up by name makes the parser independent of attribute order.
    private static readonly string[] MetaKeyAttributes = { "name", "property", "itemprop" };

    // Fragments of the MIME types used for syndication feeds
    // (application/rss+xml, application/atom+xml).
    private static readonly string[] FeedMimeHints = { "rss", "atom" };

    private static readonly string[] TitleTags = { "twitter:title", "og:title", "title" };
    private static readonly string[] DescriptionTags = { "twitter:description", "og:description", "description" };
    private static readonly string[] ImageTags = { "twitter:image", "og:image", "image" };
    private static readonly string[] UrlTags = { "twitter:url", "og:url", "url" };
    private static readonly string[] PageTypeTags = { "og:type", "type" };
    private static readonly string[] ColorThemeTags = { "theme-color" };
    private static readonly string[] YouTubeChannelTags = { "channelId" };

    /// <summary>
    /// The values collected by <see cref="Parse"/>.
    /// </summary>
    public HeadParserModel Data { get; set; }

    private readonly string _htmlContent;

    // The document is parsed once, on first use, instead of once per getter call.
    private readonly Lazy<HtmlDocument> _document;

    /// <summary>
    /// Creates a parser for the given HTML.
    /// </summary>
    /// <param name="htmlContent">Raw HTML of the page. <c>null</c> is treated as an empty document.</param>
    /// <param name="useBrowser">Currently unused; kept so existing callers keep compiling.</param>
    public HeadParserClient(string htmlContent, bool useBrowser = false)
    {
        _htmlContent = htmlContent ?? string.Empty;
        _document = new Lazy<HtmlDocument>(LoadDocument);
        Data = new HeadParserModel();
    }

    /// <summary>
    /// Runs every extractor and stores the results in <see cref="Data"/>.
    /// </summary>
    public void Parse()
    {
        Data.Title = GetMetaTitle();
        Data.Description = GetMetaDescription();
        Data.Image = GetMetaImage();
        Data.Url = GetMetaUrl();
        Data.PageType = GetMetaPageType();
        Data.ColorTheme = GetMetaColorTheme();

        Data.FeedUri = GetSiteFeed();
        Data.YoutubeChannelID = GetYouTubeChannelId();
    }

    private HtmlDocument LoadDocument()
    {
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(_htmlContent);
        return htmlDoc;
    }

    private List<HtmlNode> CollectNodes(string xpath)
    {
        // SelectNodes returns null (not an empty collection) when nothing matches,
        // so guard before materializing the list.
        var nodes = _document.Value.DocumentNode.SelectNodes(xpath);
        return nodes == null ? new List<HtmlNode>() : nodes.ToList();
    }

    private List<HtmlNode> CollectMetaTags()
    {
        return CollectNodes(XPathMetaTag);
    }

    private List<HtmlNode> CollectLinkTags()
    {
        return CollectNodes(XPathLinkTag);
    }

    /// <summary>
    /// Returns the key of a meta tag (its <c>name</c>, <c>property</c> or <c>itemprop</c> value),
    /// or an empty string when the tag has none (for example <c>&lt;meta charset="utf-8"&gt;</c>).
    /// </summary>
    private static string GetMetaKey(HtmlNode meta)
    {
        foreach (var attributeName in MetaKeyAttributes)
        {
            var key = meta.GetAttributeValue(attributeName, string.Empty);
            if (!string.IsNullOrEmpty(key))
            {
                return key;
            }
        }
        return string.Empty;
    }

    /// <summary>
    /// Finds the <c>content</c> of the meta tag identified by <paramref name="Tag"/>.
    /// An exact (case-insensitive) key match wins; if there is none, the first tag whose
    /// key merely contains <paramref name="Tag"/> is used, which keeps the lenient
    /// matching older callers relied on.
    /// </summary>
    /// <returns>The content value, or an empty string when nothing matched.</returns>
    private static string GetTagValue(string Tag, List<HtmlNode> html)
    {
        var looseMatch = string.Empty;
        foreach (var meta in html)
        {
            var key = GetMetaKey(meta);
            if (key.Length == 0)
            {
                continue;
            }

            var content = meta.GetAttributeValue(ContentAttribute, string.Empty);
            if (string.IsNullOrEmpty(content))
            {
                continue;
            }

            if (key.Equals(Tag, StringComparison.OrdinalIgnoreCase))
            {
                return content;
            }

            // Remember the first partial hit, but keep scanning: an exact match later in
            // the document (e.g. "twitter:image" after "twitter:image:alt") is preferred.
            if (looseMatch.Length == 0 && key.Contains(Tag, StringComparison.OrdinalIgnoreCase))
            {
                looseMatch = content;
            }
        }
        return looseMatch;
    }

    /// <summary>
    /// Tries each tag name in order and returns the first non-empty value found.
    /// </summary>
    private static string FindFirstResult(string[] tags, List<HtmlNode> htmlTags)
    {
        foreach (var tag in tags)
        {
            var value = GetTagValue(tag, htmlTags);
            if (value.Length > 0)
            {
                return value;
            }
        }
        return string.Empty;
    }

    /// <summary>Returns the page title from Twitter/OpenGraph/plain meta tags, or an empty string.</summary>
    public string GetMetaTitle()
    {
        return FindFirstResult(TitleTags, CollectMetaTags());
    }

    /// <summary>Returns the page description from Twitter/OpenGraph/plain meta tags, or an empty string.</summary>
    public string GetMetaDescription()
    {
        return FindFirstResult(DescriptionTags, CollectMetaTags());
    }

    /// <summary>Returns the preview image from Twitter/OpenGraph/plain meta tags, or an empty string.</summary>
    public string GetMetaImage()
    {
        return FindFirstResult(ImageTags, CollectMetaTags());
    }

    /// <summary>Returns the canonical page url from Twitter/OpenGraph/plain meta tags, or an empty string.</summary>
    public string GetMetaUrl()
    {
        return FindFirstResult(UrlTags, CollectMetaTags());
    }

    /// <summary>Returns the page type (<c>og:type</c> or <c>type</c>), or an empty string.</summary>
    public string GetMetaPageType()
    {
        return FindFirstResult(PageTypeTags, CollectMetaTags());
    }

    /// <summary>Returns the <c>theme-color</c> meta value, or an empty string.</summary>
    public string GetMetaColorTheme()
    {
        return FindFirstResult(ColorThemeTags, CollectMetaTags());
    }

    /// <summary>Returns the <c>channelId</c> meta value, or an empty string.</summary>
    public string GetYouTubeChannelId()
    {
        return FindFirstResult(YouTubeChannelTags, CollectMetaTags());
    }

    /// <summary>
    /// Looks through the <c>&lt;link rel="alternate"&gt;</c> tags in the head for a feed link.
    /// A link whose <c>type</c> names an RSS or Atom MIME type is preferred; otherwise the
    /// first alternate link that has an <c>href</c> is used.
    /// </summary>
    /// <returns>
    /// The feed address (protocol-relative addresses are given an <c>https:</c> scheme),
    /// or an empty string when the page does not expose one. This method does not throw
    /// when the feed is missing.
    /// </returns>
    public string GetSiteFeed()
    {
        var fallback = string.Empty;
        foreach (var link in CollectLinkTags())
        {
            var rel = link.GetAttributeValue(RelAttribute, string.Empty);
            if (string.IsNullOrEmpty(rel) || !rel.Contains("alternate", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            var href = link.GetAttributeValue(HrefAttribute, string.Empty);
            if (string.IsNullOrEmpty(href))
            {
                continue;
            }

            if (IsFeedMimeType(link.GetAttributeValue(TypeAttribute, string.Empty)))
            {
                return NormalizeUri(href);
            }

            // rel="alternate" is also used for translations (hreflang) and similar, so an
            // untyped link is only used when no typed feed link exists.
            if (fallback.Length == 0)
            {
                fallback = href;
            }
        }
        return NormalizeUri(fallback);
    }

    private static bool IsFeedMimeType(string mimeType)
    {
        if (string.IsNullOrEmpty(mimeType))
        {
            return false;
        }

        foreach (var hint in FeedMimeHints)
        {
            if (mimeType.Contains(hint, StringComparison.OrdinalIgnoreCase))
            {
                return true;
            }
        }
        return false;
    }

    private static string NormalizeUri(string uri)
    {
        // Only the leading "//" marks a protocol-relative address; any later "//" in the
        // path or query must be left untouched.
        if (uri.StartsWith("//", StringComparison.Ordinal))
        {
            return "https:" + uri;
        }
        return uri;
    }
}