James Tombleson
521940ca4f
* exposing connectionStrings to controllers * First controller added to start testing * corrected param to be page not age * new model to map connection strings to for the controllers * HelloWorldJob uses options now to make hangfire happy * improved the html reader to find some rss feeds and start to extract the body of the content * moved html parser to its own namespace and make a sub client to process the header * helpful vsc changes * updated rss watcher to include the sourceId so it can be added to the db call * updated tests to reflect changes * updated gitignore to avoid trash and moved over my makefile * More routes and added serilog * adding more database calls for the controllers * Updated interfaces for the tables * Added Serilog to jobs * removed default files * Added more routes and added DTO * Added DTO objects and SourceType Consts for easy usage * updated discord model name to follow the pattern * updated formatting * new dto objects and Subscriptions repo interface * added subscription db and api calls * focusing on the twitter tags as most sites focus on them * updated test to pull a html based feed
71 lines
1.6 KiB
C#
71 lines
1.6 KiB
C#
using HtmlAgilityPack;
|
|
using Newsbot.Collector.Domain.Exceptions;
|
|
|
|
namespace Newsbot.Collector.Services.HtmlParser;
|
|
|
|
/// <summary>
/// Downloads the HTML of a single page and exposes helpers to parse it.
/// </summary>
/// <remarks>
/// The page is fetched inside the constructor with a blocking HTTP GET, so
/// creating an instance performs network I/O on the calling thread.
/// </remarks>
public class HtmlPageReader
{
    /// <summary>XPath that locates the element expected to wrap the article body.</summary>
    private const string ArticleBodyXPath = "//div[contains(@class, 'article-text')]";

    /// <summary>
    /// Data extracted from the page. <see cref="Parse"/> assigns its <c>Header</c>.
    /// </summary>
    public HtmlData Data { get; set; }

    // Parser for the page's head section; its exact behavior lives in HeadParserClient.
    private readonly HeadParserClient _headClient;

    // Raw HTML downloaded from the URL given to the constructor.
    private readonly string _siteContent;

    /// <summary>
    /// Downloads <paramref name="pageUrl"/> and prepares the head parser for it.
    /// </summary>
    /// <param name="pageUrl">Address of the page to download.</param>
    /// <exception cref="AggregateException">
    /// Thrown when the download fails; the inner exception carries the cause.
    /// </exception>
    public HtmlPageReader(string pageUrl)
    {
        _siteContent = ReadSiteContent(pageUrl);
        _headClient = new HeadParserClient(_siteContent);

        Data = new HtmlData();
    }

    /// <summary>
    /// Runs the head parser and copies its result into <see cref="Data"/>.
    /// </summary>
    public void Parse()
    {
        _headClient.Parse();
        Data.Header = _headClient.Data;
    }

    /// <summary>
    /// Fetches the body of <paramref name="url"/> as a string, blocking until done.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>The response body as text.</returns>
    private string ReadSiteContent(string url)
    {
        using var client = new HttpClient();

        // The constructor cannot await, so the call blocks here. Result waits for
        // completion and surfaces failures as AggregateException, exactly like the
        // previous Wait()/Result pair did.
        // NOTE(review): consider an async factory method and a shared HttpClient.
        return client.GetStringAsync(url).Result;
    }

    /// <summary>
    /// Returns the raw HTML that was downloaded by the constructor.
    /// </summary>
    /// <returns>The unmodified page content.</returns>
    public string GetSiteContent()
    {
        return _siteContent;
    }

    /// <summary>
    /// Finds the single <c>div</c> whose class contains <c>article-text</c>.
    /// </summary>
    /// <returns>A list holding exactly one node: the matching <c>div</c>.</returns>
    /// <exception cref="Exception">
    /// Thrown when no element matches, or when more than one element matches.
    /// </exception>
    public List<HtmlNode> CollectPostContent()
    {
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(_siteContent);

        // SelectNodes returns null (not an empty collection) when nothing matches,
        // so normalize to an empty list before counting.
        var matches = htmlDoc.DocumentNode.SelectNodes(ArticleBodyXPath);
        var links = matches?.ToList() ?? new List<HtmlNode>();

        if (links.Count == 0)
        {
            throw new Exception("Unable to parse body. Tag is unkown.");
        }

        if (links.Count >= 2)
        {
            throw new Exception("Too many results back for the body");
        }

        return links;
    }
}