James Tombleson
9be985da0a
* Found the meta tags on youtube... in the body and updated the client to pull them out. * Updated namespace on test * I think formatting cleaned this up * Seed migrations have been cleaned up to get my configs out and moving them to a script. * Updates to the ISourcesRepository.cs to allow for new calls to the db. * formatter * Db models updated. Icon now can track sourceID and source can have a youtube id. * Updated api logger to ignore otel if no connection string given. * updated docker init so I can run migrations from the image * seed was updated to reflect the new api changes * Updated the SourcesController.cs to grab icon data. * Added reddit const values * Minor changes to HtmlPageReader.cs * Jobs are now pulling in the config section to bundle values. * Removed youtube api, not needed anymore. * test updates
68 lines
1.8 KiB
C#
68 lines
1.8 KiB
C#
using HtmlAgilityPack;
|
|
|
|
namespace Newsbot.Collector.Services.HtmlParser;
|
|
|
|
/// <summary>
/// Describes where <see cref="HtmlPageReader"/> should get its HTML from.
/// Supply either a URL to download or the page source directly.
/// </summary>
public class HtmlPageReaderOptions
{
    /// <summary>
    /// Address of the page to download. When this is set, the reader fetches the page
    /// and uses the downloaded content, even if <see cref="SourceCode"/> is also set.
    /// </summary>
    public string? Url { get; init; }

    /// <summary>
    /// Raw HTML to use as the page content when no <see cref="Url"/> is given.
    /// </summary>
    public string? SourceCode { get; init; }
}
|
|
|
|
/// <summary>
/// Holds the HTML of a single page, taken either from caller-supplied source text or
/// downloaded from a URL, and exposes its parsed header data and article body nodes.
/// </summary>
public class HtmlPageReader
{
    // One client for the whole process: creating and disposing an HttpClient per request
    // can exhaust sockets under load.
    private static readonly HttpClient SharedHttpClient = new();

    private readonly HeadParserClient _headClient;
    private readonly string _siteContent;

    /// <summary>
    /// Creates a reader for the page described by <paramref name="options"/>.
    /// </summary>
    /// <param name="options">
    /// Source of the HTML. If <see cref="HtmlPageReaderOptions.Url"/> is set the page is downloaded
    /// (blocking) and that content wins over <see cref="HtmlPageReaderOptions.SourceCode"/>.
    /// </param>
    /// <exception cref="ArgumentNullException"><paramref name="options"/> is null.</exception>
    /// <exception cref="Exception">Neither a URL nor source code was supplied.</exception>
    public HtmlPageReader(HtmlPageReaderOptions options)
    {
        ArgumentNullException.ThrowIfNull(options);

        if (options.SourceCode is not null)
        {
            _siteContent = options.SourceCode;
        }

        // Evaluated second on purpose: a URL overrides any supplied source code.
        if (options.Url is not null)
        {
            _siteContent = ReadSiteContent(options.Url);
        }

        if (_siteContent is null)
        {
            throw new Exception("SiteContent was not filled and expected.");
        }

        _headClient = new HeadParserClient(_siteContent);
        Data = new HtmlData();
    }

    /// <summary>
    /// Parsed page data. <see cref="HtmlData.Header"/> is filled in by <see cref="Parse"/>.
    /// </summary>
    public HtmlData Data { get; set; }

    /// <summary>
    /// Runs the head parser over the page content and stores its result in <see cref="Data"/>.
    /// </summary>
    public void Parse()
    {
        _headClient.Parse();
        Data.Header = _headClient.Data;
    }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns its body as a string.
    /// </summary>
    /// <remarks>
    /// Blocks the calling thread until the download finishes. Failures surface wrapped in an
    /// <see cref="AggregateException"/>, because the task is observed through <c>Result</c>.
    /// </remarks>
    private string ReadSiteContent(string url)
    {
        return SharedHttpClient.GetStringAsync(url).Result;
    }

    /// <summary>
    /// Returns the raw HTML this reader was constructed with or downloaded.
    /// </summary>
    public string GetSiteContent()
    {
        return _siteContent;
    }

    /// <summary>
    /// Finds the single <c>div</c> whose class contains <c>article-text</c>.
    /// </summary>
    /// <returns>A list containing exactly one node: the article body container.</returns>
    /// <exception cref="Exception">
    /// No matching node was found, or more than one node matched.
    /// </exception>
    public List<HtmlNode> CollectPostContent()
    {
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(_siteContent);

        // SelectNodes yields null (not an empty collection) when the XPath matches nothing,
        // so the null check must come before anything is materialized from it.
        var matches = htmlDoc.DocumentNode.SelectNodes("//div[contains(@class, 'article-text')]");

        if (matches is null || matches.Count == 0)
        {
            throw new Exception("Unable to parse body. Tag is unknown.");
        }

        if (matches.Count >= 2)
        {
            throw new Exception("Too many results back for the body");
        }

        return matches.ToList();
    }
}