Newsbot.Collector/Newsbot.Collector.Services/HtmlParser/HeadParserClient.cs

using HtmlAgilityPack;
using Newsbot.Collector.Domain.Exceptions;

namespace Newsbot.Collector.Services.HtmlParser;

/// <summary>
///     Extracts page metadata (title, description, image, url, page type, theme color),
///     the advertised feed link and the YouTube channel id from a raw HTML string.
///     The HTML is parsed once, on first use, and the collected tags are reused by every getter.
/// </summary>
public class HeadParserClient
{
    private const string XPathHeadMetaTag = "//head/meta";
    private const string XPathBodyMetaTag = "//body/meta";
    private const string XPathLinkTag = "//head/link";
    private const string YouTubeChannelUrlPrefix = "https://www.youtube.com/channel/";

    private readonly string _htmlContent;

    // Parsing and tag collection are deferred and cached so that calling several
    // getters (or Parse) does not re-parse the same HTML over and over.
    private readonly Lazy<HtmlDocument> _document;
    private readonly Lazy<List<HtmlNode>> _metaTags;
    private readonly Lazy<List<HtmlNode>> _linkTags;

    /// <summary>
    ///     Creates a parser over the given HTML content. Nothing is parsed until a getter or
    ///     <see cref="Parse" /> is called.
    /// </summary>
    /// <param name="htmlContent">Raw HTML of the page. A null value is treated as empty content.</param>
    /// <param name="useBrowser">Currently unused; kept so existing callers keep compiling.</param>
    public HeadParserClient(string htmlContent, bool useBrowser = false)
    {
        _htmlContent = htmlContent ?? string.Empty;
        _document = new Lazy<HtmlDocument>(LoadDocument);
        _metaTags = new Lazy<List<HtmlNode>>(CollectMetaTags);
        _linkTags = new Lazy<List<HtmlNode>>(CollectLinkTags);
        Data = new HeadParserModel();
    }

    /// <summary>
    ///     Holds the values found by <see cref="Parse" />.
    /// </summary>
    public HeadParserModel Data { get; set; }

    /// <summary>
    ///     Runs every extractor and stores the results in <see cref="Data" />.
    ///     Values that cannot be found are stored as an empty string.
    /// </summary>
    public void Parse()
    {
        Data.Title = GetMetaTitle();
        Data.Description = GetMetaDescription();
        Data.Image = GetMetaImage();
        Data.Url = GetMetaUrl();
        Data.PageType = GetMetaPageType();
        Data.ColorTheme = GetMetaColorTheme();

        Data.FeedUri = GetSiteFeed();
        Data.YoutubeChannelID = GetYouTubeChannelId();
    }

    /// <summary>
    ///     Parses the HTML content into a document.
    /// </summary>
    private HtmlDocument LoadDocument()
    {
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(_htmlContent);
        return htmlDoc;
    }

    /// <summary>
    ///     Runs the XPath query against the parsed document and returns the matches as a list.
    ///     SelectNodes returns null (not an empty collection) when nothing matches, so that case
    ///     is mapped to an empty list here.
    /// </summary>
    private List<HtmlNode> SelectNodesOrEmpty(string xpath)
    {
        var nodes = _document.Value.DocumentNode.SelectNodes(xpath);
        return nodes is null ? new List<HtmlNode>() : nodes.ToList();
    }

    /// <summary>
    ///     Collects the meta tags from the head followed by the ones from the body.
    /// </summary>
    private List<HtmlNode> CollectMetaTags()
    {
        var allTags = SelectNodesOrEmpty(XPathHeadMetaTag);

        // No tags in the body is fine; we check the body thanks to Youtube.
        allTags.AddRange(SelectNodesOrEmpty(XPathBodyMetaTag));

        return allTags;
    }

    /// <summary>
    ///     Collects the link tags from the head.
    /// </summary>
    private List<HtmlNode> CollectLinkTags()
    {
        return SelectNodesOrEmpty(XPathLinkTag);
    }

    /// <summary>
    ///     Returns true when the node has at least one attribute and the value of its first
    ///     attribute contains <paramref name="tag" />.
    /// </summary>
    private static bool FirstAttributeContains(HtmlNode node, string tag)
    {
        if (node.Attributes.Count == 0)
        {
            return false;
        }

        var key = node.Attributes[0].Value;
        return key is not null && key.Contains(tag, StringComparison.Ordinal);
    }

    /// <summary>
    ///     Finds the first node whose first attribute value contains <paramref name="tag" /> and
    ///     returns the value of its second attribute. The lookup is positional: it relies on the
    ///     key attribute being written before the value attribute.
    /// </summary>
    /// <returns>The value found, or an empty string when no node matches.</returns>
    private static string GetTagValue(string tag, List<HtmlNode> html)
    {
        foreach (var meta in html)
        {
            // Two attributes are required: [0] is the key and [1] is the value.
            if (meta.Attributes.Count < 2)
            {
                continue;
            }

            if (!FirstAttributeContains(meta, tag))
            {
                continue;
            }

            return meta.Attributes[1].Value ?? string.Empty;
        }

        return "";
    }

    /// <summary>
    ///     Tries each tag in order and returns the first non-empty value.
    /// </summary>
    /// <returns>The first value found, or an empty string when none of the tags match.</returns>
    private static string FindFirstResult(string[] tags, List<HtmlNode> htmlTags)
    {
        foreach (var tag in tags)
        {
            var res = GetTagValue(tag, htmlTags);
            if (res != "")
            {
                return res;
            }
        }

        return "";
    }

    /// <summary>
    ///     Looks up the page title, preferring twitter:title, then og:title, then title.
    /// </summary>
    /// <returns>The value found, or an empty string.</returns>
    public string GetMetaTitle()
    {
        string[] tags = { "twitter:title", "og:title", "title" };
        return FindFirstResult(tags, _metaTags.Value);
    }

    /// <summary>
    ///     Looks up the page description, preferring twitter:description, then og:description,
    ///     then description.
    /// </summary>
    /// <returns>The value found, or an empty string.</returns>
    public string GetMetaDescription()
    {
        string[] tags = { "twitter:description", "og:description", "description" };
        return FindFirstResult(tags, _metaTags.Value);
    }

    /// <summary>
    ///     Looks up the page image, preferring twitter:image, then og:image, then image.
    /// </summary>
    /// <returns>The value found, or an empty string.</returns>
    public string GetMetaImage()
    {
        string[] tags = { "twitter:image", "og:image", "image" };
        return FindFirstResult(tags, _metaTags.Value);
    }

    /// <summary>
    ///     Looks up the page url, preferring twitter:url, then og:url, then url.
    /// </summary>
    /// <returns>The value found, or an empty string.</returns>
    public string GetMetaUrl()
    {
        string[] tags = { "twitter:url", "og:url", "url" };
        return FindFirstResult(tags, _metaTags.Value);
    }

    /// <summary>
    ///     Looks up the page type, preferring og:type, then type.
    /// </summary>
    /// <returns>The value found, or an empty string.</returns>
    public string GetMetaPageType()
    {
        string[] tags = { "og:type", "type" };
        return FindFirstResult(tags, _metaTags.Value);
    }

    /// <summary>
    ///     Looks up the theme-color meta tag.
    /// </summary>
    /// <returns>The value found, or an empty string.</returns>
    public string GetMetaColorTheme()
    {
        string[] tags = { "theme-color" };
        return FindFirstResult(tags, _metaTags.Value);
    }

    /// <summary>
    ///     Looks up og:url, then channelId, and strips the YouTube channel url prefix from the
    ///     value that was found.
    /// </summary>
    /// <returns>The value with the channel url prefix removed, or an empty string.</returns>
    public string GetYouTubeChannelId()
    {
        string[] tags = { "og:url", "channelId" };
        var results = FindFirstResult(tags, _metaTags.Value);
        return results.Replace(YouTubeChannelUrlPrefix, "");
    }

    /// <summary>
    ///     Scans the link tags in the head for one whose first attribute value contains
    ///     "alternate" and returns its href. A protocol-relative href ("//host/path") is
    ///     returned with "https:" prepended.
    /// </summary>
    /// <returns>The feed link, or an empty string when no matching link with an href exists.</returns>
    public string GetSiteFeed()
    {
        foreach (var link in _linkTags.Value)
        {
            if (!FirstAttributeContains(link, "alternate"))
            {
                continue;
            }

            foreach (var item in link.Attributes)
            {
                if (item.Name != "href")
                {
                    continue;
                }

                var uri = item.Value ?? string.Empty;

                // Only the leading "//" is protocol-relative; later "//" sequences must stay intact.
                if (uri.StartsWith("//", StringComparison.Ordinal))
                {
                    uri = "https:" + uri;
                }

                return uri;
            }
        }

        return "";
    }
}