71 lines
1.6 KiB
C#
71 lines
1.6 KiB
C#
|
using HtmlAgilityPack;
|
||
|
using Newsbot.Collector.Domain.Exceptions;
|
||
|
|
||
|
namespace Newsbot.Collector.Services.HtmlParser;
|
||
|
|
||
|
/// <summary>
/// Downloads the HTML of a page and exposes its parsed head data and the
/// node that holds the article body.
/// </summary>
public class HtmlPageReader
{
    /// <summary>
    /// Container for the parsed page data. Its <c>Header</c> is filled in by <see cref="Parse"/>.
    /// </summary>
    public HtmlData Data { get; set; }

    // Both fields are assigned once in the constructor and never replaced.
    private readonly HeadParserClient _headClient;
    private readonly string _siteContent;

    /// <summary>
    /// Fetches the page at <paramref name="pageUrl"/> and prepares the head parser.
    /// The download happens synchronously inside the constructor.
    /// </summary>
    /// <param name="pageUrl">Address of the page to download.</param>
    public HtmlPageReader(string pageUrl)
    {
        _siteContent = ReadSiteContent(pageUrl);
        _headClient = new HeadParserClient(_siteContent);

        Data = new HtmlData();
    }

    /// <summary>
    /// Runs the head parser and copies its result into <c>Data.Header</c>.
    /// </summary>
    public void Parse()
    {
        _headClient.Parse();
        Data.Header = _headClient.Data;
    }

    /// <summary>
    /// Downloads the body of <paramref name="url"/> as a string, blocking until it completes.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <returns>The response body text.</returns>
    private string ReadSiteContent(string url)
    {
        // NOTE(review): a new HttpClient per call and a blocking wait are both
        // discouraged, but are kept so the constructor stays synchronous and
        // failures keep surfacing as AggregateException for existing callers.
        using var client = new HttpClient();
        var request = client.GetStringAsync(url);
        request.Wait();

        return request.Result;
    }

    /// <summary>
    /// Returns the raw HTML that was downloaded by the constructor.
    /// </summary>
    public string GetSiteContent()
    {
        return _siteContent;
    }

    /// <summary>
    /// Locates the single <c>div</c> whose class attribute contains <c>article-text</c>.
    /// </summary>
    /// <returns>A list holding exactly one matching node.</returns>
    /// <exception cref="Exception">
    /// Thrown when no matching <c>div</c> is found, or when more than one is found.
    /// </exception>
    public List<HtmlNode> CollectPostContent()
    {
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(_siteContent);

        // SelectNodes returns null (not an empty collection) when nothing matches,
        // so the null check is required for the "not found" error to be reachable.
        var matches = htmlDoc.DocumentNode.SelectNodes("//div[contains(@class, 'article-text')]");

        if (matches is null || matches.Count == 0)
        {
            throw new Exception("Unable to parse body. Tag is unkown.");
        }

        if (matches.Count >= 2)
        {
            throw new Exception("Too many results back for the body");
        }

        return matches.ToList();
    }
}
|