using HtmlAgilityPack;

namespace Newsbot.Collector.Services.HtmlParser;

/// <summary>
/// Input for <see cref="HtmlPageReader"/>. Supply either a URL to download or
/// the raw HTML source directly.
/// </summary>
public class HtmlPageReaderOptions
{
    /// <summary>Address of the page to download. When set, it takes precedence over <see cref="SourceCode"/>.</summary>
    public string? Url { get; init; }

    /// <summary>Raw HTML to parse without performing any download.</summary>
    public string? SourceCode { get; init; }
}

/// <summary>
/// Holds the HTML of a single page and exposes helpers to extract its header
/// data and its article body node.
/// </summary>
public class HtmlPageReader
{
    // A single shared client avoids the socket exhaustion caused by creating
    // and disposing an HttpClient for every page that is read.
    private static readonly HttpClient SharedClient = new();

    private readonly HeadParserClient _headClient;
    private readonly string _siteContent;

    /// <summary>
    /// Creates a reader from the given options. If <see cref="HtmlPageReaderOptions.Url"/>
    /// is set the page is downloaded (blocking) and used as the content; otherwise
    /// <see cref="HtmlPageReaderOptions.SourceCode"/> is used.
    /// </summary>
    /// <param name="options">Where to obtain the HTML from.</param>
    /// <exception cref="ArgumentNullException"><paramref name="options"/> is null.</exception>
    /// <exception cref="Exception">Neither a URL nor source code was supplied.</exception>
    public HtmlPageReader(HtmlPageReaderOptions options)
    {
        ArgumentNullException.ThrowIfNull(options);

        var content = options.SourceCode;
        if (options.Url is not null)
        {
            // The URL wins when both values are supplied.
            content = ReadSiteContent(options.Url);
        }

        _siteContent = content ?? throw new Exception("SiteContent was not filled and expected.");
        _headClient = new HeadParserClient(_siteContent);
        Data = new HtmlData();
    }

    /// <summary>Parsed page data; <c>Header</c> is filled in by <see cref="Parse"/>.</summary>
    public HtmlData Data { get; set; }

    /// <summary>
    /// Runs the head parser over the page content and stores its result in
    /// <see cref="Data"/>'s <c>Header</c>.
    /// </summary>
    public void Parse()
    {
        _headClient.Parse();
        Data.Header = _headClient.Data;
    }

    /// <summary>Downloads the page at <paramref name="url"/> and returns its body as a string.</summary>
    /// <remarks>
    /// Blocks the calling thread because it is invoked from the constructor.
    /// Failures surface wrapped in an <see cref="AggregateException"/>.
    /// </remarks>
    private static string ReadSiteContent(string url)
    {
        var download = SharedClient.GetStringAsync(url);
        download.Wait();
        return download.Result;
    }

    /// <summary>Returns the raw HTML this reader was created with.</summary>
    public string GetSiteContent()
    {
        return _siteContent;
    }

    /// <summary>
    /// Locates the article body: the single <c>div</c> whose class attribute
    /// contains <c>article-text</c>.
    /// </summary>
    /// <returns>
    /// A list holding exactly one node, the article body container. Callers that
    /// only need the text can read the <c>InnerText</c> of its <c>p</c> child nodes.
    /// </returns>
    /// <exception cref="Exception">No matching element, or more than one, was found.</exception>
    public List<HtmlNode> CollectPostContent()
    {
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(_siteContent);

        // SelectNodes returns null (not an empty collection) when nothing matches,
        // so normalise to an empty list before counting.
        var matches = htmlDoc.DocumentNode
            .SelectNodes("//div[contains(@class, 'article-text')]")
            ?.ToList() ?? new List<HtmlNode>();

        if (matches.Count == 0)
        {
            throw new Exception("Unable to parse body. Tag is unknown.");
        }

        if (matches.Count >= 2)
        {
            throw new Exception("Too many results back for the body");
        }

        return matches;
    }
}