using HtmlAgilityPack;
using Newsbot.Collector.Domain.Exceptions;

namespace Newsbot.Collector.Services.HtmlParser;

/// <summary>
/// Downloads the HTML of a single page and exposes helpers to extract data from it.
/// The page is fetched once, in the constructor, and kept in memory as a string.
/// </summary>
public class HtmlPageReader
{
    /// <summary>
    /// Parsed page data. <see cref="Parse"/> fills in its <c>Header</c> member.
    /// </summary>
    public HtmlData Data { get; set; }

    private readonly HeadParserClient _headClient;
    private readonly string _siteContent;

    /// <summary>
    /// Downloads <paramref name="pageUrl"/> and prepares the head parser over the downloaded HTML.
    /// </summary>
    /// <param name="pageUrl">Address of the page to download.</param>
    /// <remarks>
    /// The download blocks the calling thread; a failed request surfaces as an
    /// <see cref="AggregateException"/> wrapping the underlying error.
    /// </remarks>
    public HtmlPageReader(string pageUrl)
    {
        _siteContent = ReadSiteContent(pageUrl);
        _headClient = new HeadParserClient(_siteContent);
        Data = new HtmlData();
    }

    /// <summary>
    /// Runs the head parser and stores its result in <c>Data.Header</c>.
    /// </summary>
    public void Parse()
    {
        _headClient.Parse();
        Data.Header = _headClient.Data;
    }

    /// <summary>
    /// Synchronously downloads the body of <paramref name="url"/> as a string.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <returns>The response body.</returns>
    private static string ReadSiteContent(string url)
    {
        using var client = new HttpClient();

        // Blocking on .Result is kept on purpose: the constructor is synchronous and
        // callers may already depend on failures arriving as AggregateException.
        return client.GetStringAsync(url).Result;
    }

    /// <summary>
    /// Returns the raw HTML that was downloaded when this reader was constructed.
    /// </summary>
    public string GetSiteContent()
    {
        return _siteContent;
    }

    /// <summary>
    /// Collects the text of every <c>p</c> element that is a direct child of the page's
    /// single <c>div</c> whose class attribute contains <c>article-text</c>.
    /// </summary>
    /// <returns>The inner text of each matching paragraph, in document order.</returns>
    /// <exception cref="Exception">
    /// Thrown when no matching <c>div</c> is found, or when more than one is found.
    /// </exception>
    public List<string> CollectPostContent()
    {
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(_siteContent);

        // SelectNodes returns null (not an empty collection) when nothing matches,
        // so the null check is what actually guards the "no body found" case.
        var bodies = htmlDoc.DocumentNode.SelectNodes("//div[contains(@class, 'article-text')]");
        if (bodies is null || bodies.Count == 0)
        {
            throw new Exception("Unable to parse body. Tag is unkown.");
        }

        if (bodies.Count >= 2)
        {
            throw new Exception("Too many results back for the body");
        }

        // Only direct <p> children are collected; nested paragraphs are ignored.
        var content = new List<string>();
        foreach (var child in bodies[0].ChildNodes)
        {
            if (child.Name == "p")
            {
                content.Add(child.InnerText);
            }
        }

        return content;
    }
}