Newsbot.Collector/Newsbot.Collector.Services/HtmlParser/HtmlPageReader.cs
James Tombleson 521940ca4f
Features/more rss improvements (#6)
* exposing connectionStrings to controllers

* First controller added to start testing

* corrected param to be page not age

* new model to map connection strings to for the controllers

* HelloWorldJob uses options now to make hangfire happy

* improved the html reader to find some rss feeds and start to extract the body of the content

* moved html parser to its own namespace and made a sub client to process the header

* helpful vsc changes

* updated rss watcher to include the sourceId so it can be added to the db call

* updated tests to reflect changes

* updated gitignore to avoid trash and moved over my makefile

* More routes and added serilog

* adding more database calls for the controllers

* Updated interfaces for the tables

* Added Serilog to jobs

* removed default files

* Added more routes and added DTO

* Added DTO objects and SourceType Consts for easy usage

* updated discord model name to follow the pattern

* updated formatting

* new dto objects and Subscriptions repo interface

* added subscription db and api calls

* focusing on the twitter tags as most sites focus on them

* updated test to pull a html based feed
2023-02-26 09:40:04 -08:00

71 lines
1.6 KiB
C#

using HtmlAgilityPack;
using Newsbot.Collector.Domain.Exceptions;
namespace Newsbot.Collector.Services.HtmlParser;
/// <summary>
/// Downloads the HTML of a page once at construction time and exposes helpers
/// to parse its head section and to locate the article body node.
/// </summary>
public class HtmlPageReader
{
    // A single shared client avoids the socket exhaustion that comes from
    // creating and disposing a new HttpClient for every page that is read.
    private static readonly HttpClient s_httpClient = new HttpClient();

    /// <summary>
    /// Parsed page data. <see cref="Parse"/> fills in its Header member.
    /// </summary>
    public HtmlData Data { get; set; }

    private readonly HeadParserClient _headClient;
    private readonly string _siteContent;

    /// <summary>
    /// Fetches the page at <paramref name="pageUrl"/> (blocking) and prepares the
    /// head parser over the downloaded markup.
    /// </summary>
    /// <param name="pageUrl">Address of the page to download.</param>
    public HtmlPageReader(string pageUrl)
    {
        _siteContent = ReadSiteContent(pageUrl);
        _headClient = new HeadParserClient(_siteContent);
        Data = new HtmlData();
    }

    /// <summary>
    /// Runs the head parser and copies its result into <c>Data.Header</c>.
    /// </summary>
    public void Parse()
    {
        _headClient.Parse();
        Data.Header = _headClient.Data;
    }

    /// <summary>
    /// Downloads the body of <paramref name="url"/> as a string, blocking the
    /// calling thread until the request completes.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>The response body as text.</returns>
    private string ReadSiteContent(string url)
    {
        var request = s_httpClient.GetStringAsync(url);

        // Kept as a blocking Wait() so request failures still surface to callers
        // wrapped in an AggregateException, exactly as before.
        request.Wait();
        return request.Result;
    }

    /// <summary>
    /// Returns the raw HTML that was downloaded by the constructor.
    /// </summary>
    public string GetSiteContent()
    {
        return _siteContent;
    }

    /// <summary>
    /// Locates the element that holds the article body: a <c>div</c> whose class
    /// attribute contains <c>article-text</c>.
    /// </summary>
    /// <returns>A list containing exactly one matching node.</returns>
    /// <exception cref="Exception">
    /// Thrown when no matching node is found, or when two or more are found.
    /// </exception>
    public List<HtmlNode> CollectPostContent()
    {
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(_siteContent);

        // SelectNodes returns null (not an empty collection) when nothing matches,
        // so normalize to an empty list before checking the count.
        var matches = htmlDoc.DocumentNode.SelectNodes("//div[contains(@class, 'article-text')]");
        var links = matches?.ToList() ?? new List<HtmlNode>();

        if (links.Count == 0)
        {
            throw new Exception("Unable to parse body. Tag is unkown.");
        }

        if (links.Count >= 2)
        {
            throw new Exception("Too many results back for the body");
        }

        return links;
    }
}