Newsbot.Collector/Newsbot.Collector.Services/HtmlParser/HeadParserClient.cs

using HtmlAgilityPack;
using Newsbot.Collector.Domain.Exceptions;

namespace Newsbot.Collector.Services.HtmlParser;

/// <summary>
/// Extracts page metadata (title, description, image, url, page type, theme color and
/// feed link) from the <c>&lt;head&gt;</c> section of an HTML document.
/// </summary>
/// <remarks>
/// The HTML is parsed lazily on first use and the resulting tag lists are cached, so
/// calling several getters (or <see cref="Parse"/>) parses the document only once.
/// </remarks>
public class HeadParserClient
{
    private const string XPathMetaTag = "//head/meta";
    private const string XPathLinkTag = "//head/link";

    // Name of the <meta> attribute that carries the value for a key.
    private const string MetaValueAttribute = "content";

    // Name of the <link> attribute that carries the target address.
    private const string LinkTargetAttribute = "href";

    // Attributes that may hold the key of a <meta> tag, in lookup order.
    private static readonly string[] MetaKeyAttributes = { "name", "property", "itemprop" };

    // Attributes that may hold the key (relationship) of a <link> tag.
    private static readonly string[] LinkKeyAttributes = { "rel" };

    /// <summary>
    /// Holds the values collected by <see cref="Parse"/>.
    /// </summary>
    public HeadParserModel Data { get; set; }

    private readonly string _htmlContent;
    private readonly Lazy<List<HtmlNode>> _metaTags;
    private readonly Lazy<List<HtmlNode>> _linkTags;

    /// <summary>
    /// Creates a parser for the given HTML. A null value is treated as an empty document.
    /// </summary>
    /// <param name="htmlContent">Raw HTML of the page to inspect.</param>
    public HeadParserClient(string htmlContent)
    {
        _htmlContent = htmlContent ?? "";

        // One shared document for both tag lists so the HTML is only parsed once.
        var document = new Lazy<HtmlDocument>(LoadDocument);
        _metaTags = new Lazy<List<HtmlNode>>(() => SelectNodeList(document.Value, XPathMetaTag));
        _linkTags = new Lazy<List<HtmlNode>>(() => SelectNodeList(document.Value, XPathLinkTag));

        Data = new HeadParserModel();
    }

    /// <summary>
    /// Runs every getter and stores the results in <see cref="Data"/>.
    /// </summary>
    public void Parse()
    {
        Data.Title = GetMetaTitle();
        Data.Description = GetMetaDescription();
        Data.Image = GetMetaImage();
        Data.Url = GetMetaUrl();
        Data.PageType = GetMetaPageType();
        Data.ColorTheme = GetMetaColorTheme();

        Data.FeedUri = GetSiteFeed();
    }

    /// <summary>
    /// Loads the stored HTML into a new <see cref="HtmlDocument"/>.
    /// </summary>
    private HtmlDocument LoadDocument()
    {
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(_htmlContent);
        return htmlDoc;
    }

    /// <summary>
    /// Runs the XPath query and returns the matches as a list.
    /// SelectNodes returns null when nothing matches, which is mapped to an empty list here.
    /// </summary>
    private static List<HtmlNode> SelectNodeList(HtmlDocument document, string xpath)
    {
        var nodes = document.DocumentNode.SelectNodes(xpath);
        return nodes is null ? new List<HtmlNode>() : nodes.ToList();
    }

    /// <summary>
    /// Returns the cached <c>//head/meta</c> nodes (empty when the page has none).
    /// </summary>
    private List<HtmlNode> CollectMetaTags()
    {
        return _metaTags.Value;
    }

    /// <summary>
    /// Returns the cached <c>//head/link</c> nodes (empty when the page has none).
    /// </summary>
    private List<HtmlNode> CollectLinkTags()
    {
        return _linkTags.Value;
    }

    /// <summary>
    /// Returns the value that identifies a tag: the first of <paramref name="keyAttributes"/>
    /// present on the node, otherwise the node's first attribute (the legacy positional
    /// behavior), otherwise an empty string.
    /// </summary>
    private static string GetKeyValue(HtmlNode node, string[] keyAttributes)
    {
        foreach (var attributeName in keyAttributes)
        {
            var attribute = node.Attributes[attributeName];
            if (attribute is not null)
            {
                return attribute.Value ?? "";
            }
        }

        return node.Attributes.Count > 0 ? node.Attributes[0].Value ?? "" : "";
    }

    /// <summary>
    /// Finds the first meta node whose key contains <paramref name="tag"/> (case-insensitive)
    /// and returns its <c>content</c> value. When the node has no <c>content</c> attribute the
    /// second attribute is used, matching the previous positional behavior.
    /// </summary>
    /// <returns>The value, or an empty string when no node matches.</returns>
    private string GetTagValue(string tag, List<HtmlNode> html)
    {
        foreach (var meta in html)
        {
            var key = GetKeyValue(meta, MetaKeyAttributes);
            if (!key.Contains(tag, StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            var content = meta.Attributes[MetaValueAttribute];
            if (content is not null)
            {
                return content.Value ?? "";
            }

            return meta.Attributes.Count > 1 ? meta.Attributes[1].Value ?? "" : "";
        }
        return "";
    }

    /// <summary>
    /// Tries each entry of <paramref name="tags"/> in order and returns the first non-empty value.
    /// </summary>
    /// <returns>The first non-empty value, or an empty string when none of the tags yield one.</returns>
    private string FindFirstResult(string[] tags, List<HtmlNode> htmlTags)
    {
        foreach (var tag in tags)
        {
            var res = GetTagValue(tag, htmlTags);
            if (res.Length > 0)
            {
                return res;
            }
        }
        return "";
    }

    /// <summary>
    /// Returns the page title, preferring twitter:title, then og:title, then title.
    /// </summary>
    public string GetMetaTitle()
    {
        return FindFirstResult(new[] { "twitter:title", "og:title", "title" }, CollectMetaTags());
    }

    /// <summary>
    /// Returns the page description, preferring twitter:description, then og:description, then description.
    /// </summary>
    public string GetMetaDescription()
    {
        return FindFirstResult(new[] { "twitter:description", "og:description", "description" }, CollectMetaTags());
    }

    /// <summary>
    /// Returns the page image, preferring twitter:image, then og:image, then image.
    /// </summary>
    public string GetMetaImage()
    {
        return FindFirstResult(new[] { "twitter:image", "og:image", "image" }, CollectMetaTags());
    }

    /// <summary>
    /// Returns the page url, preferring twitter:url, then og:url, then url.
    /// </summary>
    public string GetMetaUrl()
    {
        return FindFirstResult(new[] { "twitter:url", "og:url", "url" }, CollectMetaTags());
    }

    /// <summary>
    /// Returns the page type, preferring og:type, then type.
    /// </summary>
    public string GetMetaPageType()
    {
        return FindFirstResult(new[] { "og:type", "type" }, CollectMetaTags());
    }

    /// <summary>
    /// Returns the value of the theme-color meta tag.
    /// </summary>
    public string GetMetaColorTheme()
    {
        return FindFirstResult(new[] { "theme-color" }, CollectMetaTags());
    }

    /// <summary>
    /// Looks through the head link tags for the first one whose key contains "alternate"
    /// and returns its href. A protocol-relative href ("//host/path") is returned with an
    /// "https:" scheme.
    /// </summary>
    /// <returns>The href of the matching link, or an empty string when no link matches or it has no href.</returns>
    public string GetSiteFeed()
    {
        var htmlTags = CollectLinkTags();
        var tags = new[] { "alternate" };

        HtmlAttributeCollection attributes;
        try
        {
            attributes = FindFirstAttribute(tags, htmlTags);
        }
        catch (MissingHeaderValueException)
        {
            // not found
            return "";
        }

        var href = attributes[LinkTargetAttribute];
        if (href is null)
        {
            return "";
        }

        var uri = href.Value ?? "";
        if (uri.StartsWith("//", StringComparison.Ordinal))
        {
            // Only the leading "//" is rewritten; later "//" sequences in the path are left alone.
            uri = "https:" + uri;
        }

        return uri;
    }

    /// <summary>
    /// Returns the attributes of the first link node matching any of <paramref name="tags"/>, tried in order.
    /// </summary>
    /// <exception cref="MissingHeaderValueException">None of the tags matched a node.</exception>
    private HtmlAttributeCollection FindFirstAttribute(string[] tags, List<HtmlNode> htmlTags)
    {
        foreach (var tag in tags)
        {
            try
            {
                return GetValidAttribute(tag, htmlTags);
            }
            catch (MissingHeaderValueException)
            {
                // Nothing was found in the given tag but we will keep looking till we finish all the entries.
            }
        }
        throw new MissingHeaderValueException("Unable to find the requested value");
    }

    /// <summary>
    /// Returns the attributes of the first node whose key (the <c>rel</c> attribute, or the
    /// first attribute when <c>rel</c> is absent) contains <paramref name="tag"/>, ignoring case.
    /// </summary>
    /// <exception cref="MissingHeaderValueException">No node matched.</exception>
    private HtmlAttributeCollection GetValidAttribute(string tag, List<HtmlNode> html)
    {
        foreach (var link in html)
        {
            var key = GetKeyValue(link, LinkKeyAttributes);
            if (key.Contains(tag, StringComparison.OrdinalIgnoreCase))
            {
                return link.Attributes;
            }
        }
        throw new MissingHeaderValueException("Site does not expose requested tag.");
    }
}
Features/more rss improvements (#6) * exposing connectionStrings to controllers * First controller added to start testing * corrected param to be page not age * new model to map connection strings to for the controllers * HelloWorldJob uses options now to make hangfire happy * improved the html reader to find some rss feeds and start to extract the body of the content * moved html parser to its own namespace and make a sub client to process the header * helpful vsc changes * updated rss watcher to include the sourceId so it can be added to the db call * updated tests to reflect changes * updated gitignore to avoid trash and moved over my makefile * More routes and added serilog * adding more database calls for the controllers * Updated interfaces for the tables * Added Serilog to jobs * removed default files * Added more routes and added DTO * Added DTO objects and SourceType Consts for easy usage * updated discord model name to follow the pattern * updated formatting * new dto objects and Subscriptions repo interface * added subscription db and api calls * focusing on the twitter tags as most sites focus on them * updated test to pull a html based feed 2023-02-26 09:40:04 -08:00			`using HtmlAgilityPack;`
			`using Newsbot.Collector.Domain.Exceptions;`

			`namespace Newsbot.Collector.Services.HtmlParser;`

			`public class HeadParserClient`
			`{`
			`private const string XPathMetaTag = "//head/meta";`
			`private const string XPathLinkTag = "//head/link";`

			`public HeadParserModel Data { get; set; }`

			`private string _htmlContent;`

			`public HeadParserClient(string htmlContent)`
			`{`
			`_htmlContent = htmlContent;`
			`Data = new HeadParserModel();`
			`}`

			`public void Parse()`
			`{`
			`Data.Title = GetMetaTitle();`
			`Data.Description = GetMetaDescription();`
			`Data.Image = GetMetaImage();`
			`Data.Url = GetMetaUrl();`
			`Data.PageType = GetMetaPageType();`
			`Data.ColorTheme = GetMetaColorTheme();`

			`Data.FeedUri = GetSiteFeed();`
			`}`

			`private List<HtmlNode> CollectMetaTags()`
			`{`
			`var htmlDoc = new HtmlDocument();`
			`htmlDoc.LoadHtml(_htmlContent);`

			`var tags = htmlDoc.DocumentNode.SelectNodes(XPathMetaTag).ToList();`

			`return tags;`
			`}`

			`private List<HtmlNode> CollectLinkTags()`
			`{`
			`var htmlDoc = new HtmlDocument();`
			`htmlDoc.LoadHtml(_htmlContent);`
			`var links = htmlDoc.DocumentNode.SelectNodes(XPathLinkTag).ToList();`
			`return links;`
			`}`

			`private string GetTagValue(string Tag, List<HtmlNode> html)`
			`{`
			`foreach (var meta in html)`
			`{`
			`//Console.WriteLine($"Name={meta.Attributes[0].Name} & Value={meta.Attributes[0].Value}");`
			`if (meta.Attributes[0].Value.Contains(Tag) == false)`
			`{`
			`continue;`
			`}`
			`return meta.Attributes[1].Value;`
			`}`
			`return "";`
			`}`

			`private string FindFirstResult(string[] tags, List<HtmlNode> htmlTags)`
			`{`
			`foreach (var tag in tags)`
			`{`
			`var res = GetTagValue(tag, htmlTags);`
			`if (res == "")`
			`{`
			`continue;`
			`}`
			`return res;`
			`}`
			`return "";`
			`}`

			`public string GetMetaTitle()`
			`{`
			`var htmlTags = CollectMetaTags();`
			`string[] tags = new string[] { "twitter:title", "og:title", "title" };`
			`return FindFirstResult(tags, htmlTags);`
			`}`

			`public string GetMetaDescription()`
			`{`
			`var htmlTags = CollectMetaTags();`
			`string[] tags = new string[] { "twitter:description", "og:description", "description" };`
			`return FindFirstResult(tags, htmlTags);`
			`}`

			`public string GetMetaImage()`
			`{`
			`var htmlTags = CollectMetaTags();`
			`string[] tags = new string[] { "twitter:image", "og:image", "image" };`
			`return FindFirstResult(tags, htmlTags);`
			`}`

			`public string GetMetaUrl()`
			`{`
			`var htmlTags = CollectMetaTags();`
			`string[] tags = new string[] { "twitter:url", "og:url", "url" };`
			`return FindFirstResult(tags, htmlTags);`
			`}`

			`public string GetMetaPageType()`
			`{`
			`var htmlTags = CollectMetaTags();`
			`string[] tags = new string[] { "og:type", "type" };`
			`return FindFirstResult(tags, htmlTags);`
			`}`

			`public string GetMetaColorTheme()`
			`{`
			`var htmlTags = CollectMetaTags();`
			`string[] tags = new string[] { "theme-color" };`
			`return FindFirstResult(tags, htmlTags);`
			`}`

			`/// <summary>`
			`/// This will parse the headers looking for known keys that will contain a RSS feed link.`
			`/// If the feed is not found, this will throw an exception (MissingHeaderValueException).`
			`/// </summary>`
			`/// <returns></returns>`
			`public string GetSiteFeed()`
			`{`
			`var htmlTags = CollectLinkTags();`
			`var tags = new string[] { "alternate" };`
			`try`
			`{`
			`var attr = FindFirstAttribute(tags, htmlTags);`
			`foreach (var item in attr)`
			`{`
			`if (item.Name != "href")`
			`{`
			`continue;`
			`}`

			`var uri = item.Value;`
			`if (uri.StartsWith("//"))`
			`{`
			`uri = uri.Replace("//", "https://");`
			`}`

			`return uri;`
			`}`
			`return "";`
			`}`
			`catch`
			`{`
			`// not found`
			`return "";`
			`}`
			`}`

			`private HtmlAttributeCollection FindFirstAttribute(string[] tags, List<HtmlNode> htmlTags)`
			`{`
			`foreach (var tag in tags)`
			`{`
			`try`
			`{`
			`var res = GetValidAttribute(tag, htmlTags);`
			`return res;`
			`}`
			`catch (MissingHeaderValueException)`
			`{`
			`// Nothing was found in the given tag but we will keep looking till we finish all the entries.`
			`}`
			`}`
			`throw new MissingHeaderValueException("Unable to find the requested value");`
			`}`

			`private HtmlAttributeCollection GetValidAttribute(string Tag, List<HtmlNode> html)`
			`{`
			`foreach (var meta in html)`
			`{`
			`if (meta.Attributes[0].Value.Contains(Tag) == false)`
			`{`
			`continue;`
			`}`
			`return meta.Attributes;`
			`}`
			`throw new MissingHeaderValueException("Site does not expose requested tag.");`
			`}`
			`}`