// Newsbot.Collector/Newsbot.Collector.Services/HtmlParser/HtmlPageReader.cs

using HtmlAgilityPack;

namespace Newsbot.Collector.Services.HtmlParser;

/// <summary>
/// Input settings for <see cref="HtmlPageReader"/>. Supply either a URL to download
/// or the HTML text itself; the reader throws if neither is set.
/// </summary>
public class HtmlPageReaderOptions
{
    /// <summary>
    /// Address of the page to download. When set, the reader fetches the page from
    /// this URL and uses the response instead of <see cref="SourceCode"/>.
    /// </summary>
    public string? Url { get; init; }

    /// <summary>
    /// HTML text to use as the page content when no <see cref="Url"/> is given.
    /// </summary>
    public string? SourceCode { get; init; }
}

/// <summary>
/// Holds the HTML of a single page, either supplied directly or downloaded from a URL,
/// and exposes helpers to parse its head section and to locate the article body node.
/// </summary>
public class HtmlPageReader
{
    // One client shared by every reader: creating and disposing an HttpClient per
    // download can exhaust sockets under load.
    private static readonly HttpClient SharedHttpClient = new();

    private readonly HeadParserClient _headClient;
    private readonly string _siteContent;

    /// <summary>
    /// Creates a reader over the page described by <paramref name="options"/>.
    /// </summary>
    /// <param name="options">
    /// Source of the page. If <see cref="HtmlPageReaderOptions.Url"/> is set the page is
    /// downloaded (blocking) and that content is used; otherwise
    /// <see cref="HtmlPageReaderOptions.SourceCode"/> is used.
    /// </param>
    /// <exception cref="ArgumentNullException"><paramref name="options"/> is null.</exception>
    /// <exception cref="AggregateException">The download of the URL failed.</exception>
    /// <exception cref="Exception">Neither a URL nor source code was supplied.</exception>
    public HtmlPageReader(HtmlPageReaderOptions options)
    {
        ArgumentNullException.ThrowIfNull(options);

        // A URL, when present, takes precedence over inline source code.
        var content = options.Url is not null
            ? ReadSiteContent(options.Url)
            : options.SourceCode;

        _siteContent = content ?? throw new Exception("SiteContent was not filled and expected.");
        _headClient = new HeadParserClient(_siteContent);
        Data = new HtmlData();
    }

    /// <summary>
    /// Parsed results for the page. <see cref="HtmlData.Header"/> is filled by <see cref="Parse"/>.
    /// </summary>
    public HtmlData Data { get; set; }

    /// <summary>
    /// Runs the head parser over the page content and stores its result in
    /// <see cref="Data"/>.
    /// </summary>
    public void Parse()
    {
        _headClient.Parse();
        Data.Header = _headClient.Data;
    }

    /// <summary>
    /// Downloads the body of <paramref name="url"/> as a string, blocking until done.
    /// </summary>
    /// <param name="url">Address of the page to fetch.</param>
    /// <returns>The response body as text.</returns>
    private static string ReadSiteContent(string url)
    {
        var request = SharedHttpClient.GetStringAsync(url);

        // Wait()/Result are kept (rather than GetAwaiter().GetResult()) so request
        // failures keep surfacing as AggregateException, as existing callers expect.
        request.Wait();
        return request.Result;
    }

    /// <summary>
    /// Returns the raw HTML this reader was constructed with.
    /// </summary>
    public string GetSiteContent()
    {
        return _siteContent;
    }

    /// <summary>
    /// Finds the single <c>div</c> whose <c>class</c> attribute contains
    /// <c>article-text</c>.
    /// </summary>
    /// <returns>A list holding exactly one node: the matching <c>div</c>.</returns>
    /// <exception cref="Exception">
    /// No matching <c>div</c> was found, or more than one was found.
    /// </exception>
    public List<HtmlNode> CollectPostContent()
    {
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(_siteContent);

        // SelectNodes returns null, not an empty collection, when nothing matches;
        // normalise to an empty list so the "unknown tag" error below is reachable.
        var matches = htmlDoc.DocumentNode.SelectNodes("//div[contains(@class, 'article-text')]");
        var links = matches is null ? new List<HtmlNode>() : matches.ToList();

        if (links.Count == 0) throw new Exception("Unable to parse body.  Tag is unknown.");

        if (links.Count >= 2) throw new Exception("Too many results back for the body");

        // The node is returned as-is; extracting paragraph text from its child
        // nodes is left to the caller.
        return links;
    }
}