Newsbot.Collector/Newsbot.Collector.Services/HtmlParser/HtmlPageReader.cs

using HtmlAgilityPack;
using Newsbot.Collector.Domain.Exceptions;

namespace Newsbot.Collector.Services.HtmlParser;

/// <summary>
/// Downloads the HTML of a single page and exposes helpers for extracting
/// data from it (head metadata and the article body container).
/// </summary>
/// <remarks>
/// The page is downloaded synchronously inside the constructor, so creating
/// an instance performs network I/O and can throw if the request fails.
/// </remarks>
public class HtmlPageReader
{
    /// <summary>XPath used to locate the element that wraps the article body.</summary>
    private const string ArticleBodyXPath = "//div[contains(@class, 'article-text')]";

    private readonly HeadParserClient _headClient;

    private readonly string _siteContent;

    /// <summary>
    /// Parsed results. <see cref="HtmlData.Header"/> is filled in by <see cref="Parse"/>.
    /// </summary>
    public HtmlData Data { get; set; }

    /// <summary>
    /// Downloads <paramref name="pageUrl"/> and prepares the head parser over the returned HTML.
    /// </summary>
    /// <param name="pageUrl">Address of the page to download.</param>
    public HtmlPageReader(string pageUrl)
    {
        _siteContent = ReadSiteContent(pageUrl);
        _headClient = new HeadParserClient(_siteContent);

        Data = new HtmlData();
    }

    /// <summary>
    /// Runs the head parser and copies its result into <see cref="Data"/>.
    /// </summary>
    public void Parse()
    {
        _headClient.Parse();
        Data.Header = _headClient.Data;
    }

    /// <summary>
    /// Downloads the body of <paramref name="url"/> as a string, blocking until the request completes.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>The response body.</returns>
    private static string ReadSiteContent(string url)
    {
        using var client = new HttpClient();

        // Blocking on .Result keeps the existing synchronous contract: request
        // failures still surface wrapped in an AggregateException, exactly as
        // they did with the previous Wait()/Result pair.
        return client.GetStringAsync(url).Result;
    }

    /// <summary>
    /// Returns the raw HTML that was downloaded when this reader was constructed.
    /// </summary>
    public string GetSiteContent()
    {
        return _siteContent;
    }

    /// <summary>
    /// Locates the single <c>div</c> whose class contains <c>article-text</c>.
    /// </summary>
    /// <returns>A list holding exactly one node: the matched article container.</returns>
    /// <exception cref="Exception">
    /// Thrown when no matching element is found, or when more than one is found.
    /// </exception>
    public List<HtmlNode> CollectPostContent()
    {
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(_siteContent);

        // SelectNodes can return null (rather than an empty collection) when the
        // XPath matches nothing. Normalise that to an empty list so the intended
        // "unable to parse" error below is raised instead of an ArgumentNullException.
        var matches = htmlDoc.DocumentNode.SelectNodes(ArticleBodyXPath);
        var links = matches?.ToList() ?? new List<HtmlNode>();

        if (links.Count == 0)
        {
            throw new Exception("Unable to parse body.  Tag is unkown.");
        }

        if (links.Count >= 2)
        {
            throw new Exception("Too many results back for the body");
        }

        return links;
    }
}
Features/more rss improvements (#6) * exposing connectionStrings to controllers * First controller added to start testing * corrected param to be page not age * new model to map connection strings to for the controllers * HelloWorldJob uses options now to make hangfire happy * improved the html reader to find some rss feeds and start to extract the body of the content * moved html parser to its own namespace and make a sub client to process the header * helpful vsc changes * updated rss watcher to include the sourceId so it can be added to the db call * updated tests to reflect changes * updated gitignore to avoid trash and moved over my makefile * More routes and added serilog * adding more database calls for the controllers * Updated interfaces for the tables * Added Serilog to jobs * removed default files * Added more routes and added DTO * Added DTO objects and SourceType Consts for easy usage * updated discord model name to follow the pattern * updated formatting * new dto objects and Subscriptions repo interface * added subscription db and api calls * focusing on the twitter tags as most sites focus on them * updated test to pull a html based feed 2023-02-26 09:40:04 -08:00			`using HtmlAgilityPack;`
			`using Newsbot.Collector.Domain.Exceptions;`

			`namespace Newsbot.Collector.Services.HtmlParser;`

			`public class HtmlPageReader`
			`{`

			`public HtmlData Data { get; set; }`

			`private HeadParserClient _headClient;`

			`private string _siteContent;`

			`public HtmlPageReader(string pageUrl)`
			`{`
			`_siteContent = ReadSiteContent(pageUrl);`
			`_headClient = new HeadParserClient(_siteContent);`

			`Data = new HtmlData();`
			`}`

			`public void Parse()`
			`{`
			`_headClient.Parse();`
			`Data.Header = _headClient.Data;`
			`}`

			`private string ReadSiteContent(string url)`
			`{`
			`using var client = new HttpClient();`
			`var html = client.GetStringAsync(url);`
			`html.Wait();`

			`var content = html.Result.ToString();`
			`return content;`
			`}`

			`public string GetSiteContent()`
			`{`
			`return _siteContent;`
			`}`

			`public List<HtmlNode> CollectPostContent()`
			`{`
			`var htmlDoc = new HtmlDocument();`
			`htmlDoc.LoadHtml(_siteContent);`
			`var links = htmlDoc.DocumentNode.SelectNodes("//div[contains(@class, 'article-text')]").ToList();`

			`if (links.Count == 0)`
			`{`
			`throw new Exception("Unable to parse body. Tag is unkown.");`
			`}`

			`if (links.Count >= 2)`
			`{`
			`throw new Exception("Too many results back for the body");`
			`}`

			`var content = new List<string>();`
			`foreach (var item in links[0].ChildNodes)`
			`{`
			`if (item.Name == "p")`
			`{`
			`content.Add(item.InnerText);`
			`}`
			`}`

			`return links;`
			`}`
			`}`