2023-02-26 09:40:04 -08:00
|
|
|
using HtmlAgilityPack;
|
|
|
|
|
|
|
|
namespace Newsbot.Collector.Services.HtmlParser;
|
|
|
|
|
2023-03-31 22:49:39 -07:00
|
|
|
/// <summary>
/// Input for <c>HtmlPageReader</c>: either the address of a page to download
/// or HTML markup that has already been obtained.
/// </summary>
public class HtmlPageReaderOptions
{
    /// <summary>Address of the page to download; <c>null</c> when not supplied.</summary>
    public string? Url { get; init; }

    /// <summary>Raw HTML markup to use as the page content; <c>null</c> when not supplied.</summary>
    public string? SourceCode { get; init; }
}
|
|
|
|
|
2023-02-26 09:40:04 -08:00
|
|
|
/// <summary>
/// Holds the HTML of a single page (supplied directly or downloaded from a URL)
/// and exposes helpers that parse it.
/// </summary>
public class HtmlPageReader
{
    // Shared by all instances: creating and disposing an HttpClient per request
    // can exhaust sockets under load.
    private static readonly HttpClient SharedHttpClient = new();

    private readonly HeadParserClient _headClient;
    private readonly string _siteContent;

    /// <summary>
    /// Creates a reader from the given options. When <see cref="HtmlPageReaderOptions.Url"/> is set
    /// the page is downloaded (blocking) and that content is used, even if
    /// <see cref="HtmlPageReaderOptions.SourceCode"/> was also supplied; otherwise
    /// <see cref="HtmlPageReaderOptions.SourceCode"/> is used.
    /// </summary>
    /// <param name="options">Where to get the page content from.</param>
    /// <exception cref="ArgumentNullException"><paramref name="options"/> is <c>null</c>.</exception>
    /// <exception cref="Exception">Neither a URL nor source code was supplied.</exception>
    public HtmlPageReader(HtmlPageReaderOptions options)
    {
        ArgumentNullException.ThrowIfNull(options);

        // Url wins over SourceCode when both are present.
        string? content = options.SourceCode;
        if (options.Url is not null)
        {
            content = ReadSiteContent(options.Url);
        }

        _siteContent = content ?? throw new Exception("SiteContent was not filled and expected.");

        _headClient = new HeadParserClient(_siteContent);
        Data = new HtmlData();
    }

    /// <summary>Parsed page data; <c>Header</c> is filled in by <see cref="Parse"/>.</summary>
    public HtmlData Data { get; set; }

    /// <summary>
    /// Runs the head parser over the page content and stores its result in <c>Data.Header</c>.
    /// </summary>
    public void Parse()
    {
        _headClient.Parse();
        Data.Header = _headClient.Data;
    }

    /// <summary>
    /// Downloads the body of <paramref name="url"/> as a string, blocking until the request completes.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>The response body.</returns>
    /// <exception cref="AggregateException">The request failed; the inner exception has the cause.</exception>
    private static string ReadSiteContent(string url)
    {
        // Blocking on purpose: this is called from the constructor, which cannot be async.
        // Reading Result waits for completion and wraps failures in AggregateException.
        return SharedHttpClient.GetStringAsync(url).Result;
    }

    /// <summary>Returns the raw HTML this reader was created with.</summary>
    public string GetSiteContent()
    {
        return _siteContent;
    }

    /// <summary>
    /// Finds the <c>div</c> whose <c>class</c> attribute contains <c>article-text</c>.
    /// </summary>
    /// <returns>A list holding exactly one node: the matching <c>div</c>.</returns>
    /// <exception cref="Exception">No matching <c>div</c> was found, or more than one was found.</exception>
    public List<HtmlNode> CollectPostContent()
    {
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(_siteContent);

        // SelectNodes returns null (not an empty collection) when nothing matches,
        // so normalize to an empty list before counting.
        var matches = htmlDoc.DocumentNode.SelectNodes("//div[contains(@class, 'article-text')]");
        var articleNodes = matches?.ToList() ?? new List<HtmlNode>();

        if (articleNodes.Count == 0)
        {
            throw new Exception("Unable to parse body. Tag is unknown.");
        }

        if (articleNodes.Count >= 2)
        {
            throw new Exception("Too many results back for the body");
        }

        return articleNodes;
    }
}
|