Newsbot.Collector/Newsbot.Collector.Services/HtmlParser/HtmlPageReader.cs
James Tombleson 9be985da0a
Features/adding youtube (#13)
* Found the meta tags on youtube... in the body and updated the client to pull them out.

* Updated namespace on test

* I think formatting cleaned this up

* Seed migrations have been cleaned up to get my configs out and moving them to a script.

* Updates to the ISourcesRepository.cs to allow for new calls to the db.

* formatter

* Db models updated. Icon now can track sourceID and source can have a youtube id.

* Updated api logger to ignore otel if no connection string given.

* updated docker init so I can run migrations from the image

* seed was updated to reflect the new api changes

* Updated the SourcesController.cs to grab icon data.

* Added reddit const values

* Minor changes to HtmlPageReader.cs

* Jobs are now pulling in the config section to bundle values.

* Removed youtube api, not needed anymore.

* test updates
2023-03-31 22:49:39 -07:00

68 lines
1.8 KiB
C#

using HtmlAgilityPack;
namespace Newsbot.Collector.Services.HtmlParser;
/// <summary>
/// Input for <see cref="HtmlPageReader"/>: describes where the page HTML comes from.
/// At least one of the two properties has to be set, otherwise the reader's
/// constructor throws.
/// </summary>
public class HtmlPageReaderOptions
{
    /// <summary>
    /// Address of the page to download. When set, the reader fetches the HTML from
    /// here, and the result takes precedence over <see cref="SourceCode"/>.
    /// </summary>
    public string? Url { get; init; }

    /// <summary>
    /// Raw HTML to use as the page content when no <see cref="Url"/> is given.
    /// </summary>
    public string? SourceCode { get; init; }
}
/// <summary>
/// Holds the HTML of a single page, either supplied directly or downloaded from a URL,
/// and exposes helpers to parse its header data and to locate the article body node.
/// </summary>
public class HtmlPageReader
{
    private readonly HeadParserClient _headClient;
    private readonly string _siteContent;

    /// <summary>
    /// Creates a reader for the page described by <paramref name="options"/>.
    /// If <see cref="HtmlPageReaderOptions.Url"/> is set the page is downloaded
    /// (synchronously) from it; otherwise <see cref="HtmlPageReaderOptions.SourceCode"/>
    /// is used as the page content.
    /// </summary>
    /// <param name="options">Where to obtain the page HTML from.</param>
    /// <exception cref="ArgumentNullException"><paramref name="options"/> is null.</exception>
    /// <exception cref="Exception">Neither a URL nor source code was supplied.</exception>
    /// <exception cref="AggregateException">The download of the URL failed.</exception>
    public HtmlPageReader(HtmlPageReaderOptions options)
    {
        if (options is null)
        {
            throw new ArgumentNullException(nameof(options));
        }

        // A URL wins over inline source when both are supplied; this mirrors the
        // original assignment order, where the download overwrote SourceCode.
        var content = options.Url is not null
            ? ReadSiteContent(options.Url)
            : options.SourceCode;

        _siteContent = content ?? throw new Exception("SiteContent was not filled and expected.");
        _headClient = new HeadParserClient(_siteContent);
        Data = new HtmlData();
    }

    /// <summary>
    /// Parsed page data. <see cref="HtmlData.Header"/> is filled in by <see cref="Parse"/>.
    /// </summary>
    public HtmlData Data { get; set; }

    /// <summary>
    /// Runs the head parser over the page content and stores its result in
    /// <see cref="Data"/>.
    /// </summary>
    public void Parse()
    {
        _headClient.Parse();
        Data.Header = _headClient.Data;
    }

    /// <summary>
    /// Downloads the body of <paramref name="url"/> as a string, blocking until done.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>The response body.</returns>
    private static string ReadSiteContent(string url)
    {
        // NOTE(review): a new HttpClient per call is kept on purpose so that no cookies
        // or connections are shared between pages; consider IHttpClientFactory if this
        // is ever called at high volume.
        using var client = new HttpClient();

        // Blocking on Result keeps the constructor synchronous. Failures surface
        // wrapped in AggregateException, exactly as with the previous Wait()/Result pair.
        return client.GetStringAsync(url).Result;
    }

    /// <summary>
    /// Returns the raw HTML this reader was created with.
    /// </summary>
    public string GetSiteContent()
    {
        return _siteContent;
    }

    /// <summary>
    /// Finds the element holding the article body: the single <c>div</c> whose class
    /// attribute contains <c>article-text</c>.
    /// </summary>
    /// <returns>A list containing exactly one node, the matching <c>div</c>.</returns>
    /// <exception cref="Exception">
    /// No matching <c>div</c> exists, or more than one was found.
    /// </exception>
    public List<HtmlNode> CollectPostContent()
    {
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(_siteContent);

        var matches = htmlDoc.DocumentNode.SelectNodes("//div[contains(@class, 'article-text')]");

        // SelectNodes returns null (not an empty collection) when nothing matches, so the
        // null check is required for the intended error to be raised instead of an
        // ArgumentNullException from ToList().
        if (matches is null || matches.Count == 0)
        {
            throw new Exception("Unable to parse body. Tag is unknown.");
        }

        if (matches.Count >= 2)
        {
            throw new Exception("Too many results back for the body");
        }

        return matches.ToList();
    }
}