Newsbot.Collector/Newsbot.Collector.Services/Jobs/RssWatcherJob.cs
James Tombleson 521940ca4f
Features/more rss improvements (#6)
* exposing connectionStrings to controllers

* First controller added to start testing

* corrected param to be page not age

* new model to map connection strings to for the controllers

* HelloWorldJob uses options now to make hangfire happy

* improved the html reader to find some rss feeds and start to extract the body of the content

* moved html parser to its own namespace and made a sub client to process the header

* helpful vsc changes

* updated rss watcher to include the sourceId so it can be added to the db call

* updated tests to reflect changes

* updated gitignore to avoid trash and moved over my makefile

* More routes and added serilog

* adding more database calls for the controllers

* Updated interfaces for the tables

* Added Serilog to jobs

* removed default files

* Added more routes and added DTO

* Added DTO objects and SourceType Consts for easy usage

* updated discord model name to follow the pattern

* updated formatting

* new dto objects and Subscriptions repo interface

* added subscription db and api calls

* focusing on the twitter tags as most sites focus on them

* updated test to pull a html based feed
2023-02-26 09:40:04 -08:00

166 lines
4.7 KiB
C#

using System.ServiceModel.Syndication;
using System.Xml;
using Microsoft.Extensions.Configuration;
using Newsbot.Collector.Database.Repositories;
using Newsbot.Collector.Domain.Consts;
using Newsbot.Collector.Domain.Interfaces;
using Newsbot.Collector.Domain.Models;
using Newsbot.Collector.Services.HtmlParser;
using Serilog;
namespace Newsbot.Collector.Services.Jobs;
public class RssWatcherJobOptions
{
    /// <summary>
    /// Connection string for the database the RSS watcher job should use.
    /// Defaults to empty, which leaves the repositories unusable until set.
    /// </summary>
    public string ConnectionString { get; set; } = string.Empty;
}
// This class was made to work with Hangfire and it does not support constructors.
public class RssWatcherJob : IHangfireJob
{
    private IArticlesRepository _articles;
    private IDiscordQueueRepository _queue;
    private ISourcesRepository _source;

    // Hangfire needs a parameterless constructor; the placeholder repositories
    // are replaced with real ones once Init is called with a connection string.
    public RssWatcherJob()
    {
        _articles = new ArticlesTable("");
        _queue = new DiscordQueueTable("");
        _source = new SourcesTable("");
    }

    /// <summary>
    /// Hangfire entry point: binds the repositories to the configured database,
    /// collects new posts from every RSS source, and stores/queues them.
    /// </summary>
    /// <param name="options">Carries the database connection string.</param>
    public void InitAndExecute(RssWatcherJobOptions options)
    {
        Log.Information("RssWatcherJob - Job was triggered");
        Log.Information("RssWatcherJob - Setting up the job");
        Init(options.ConnectionString);

        var articles = new List<ArticlesModel>();

        Log.Information("RssWatcherJob - Requesting sources");
        // Materialize once so Count and the loop below don't re-enumerate
        // (ListByType may hit the database per enumeration).
        var sources = _source.ListByType(SourceTypes.Rss).ToList();
        // Serilog message templates (not interpolation) keep the values
        // available as structured properties.
        Log.Information("RssWatcherJob - Got {SourceCount} back", sources.Count);

        foreach (var source in sources)
        {
            Log.Information("RssWatcherJob - Starting to process '{SourceName}'", source.Name);
            Log.Information("RssWatcherJob - Starting to request feed to be processed");
            var results = Collect(source.Url, source.ID);
            Log.Information("RssWatcherJob - Collected {PostCount} posts", results.Count);
            articles.AddRange(results);
        }

        Log.Information("RssWatcherJob - Sending posts over to the database");
        UpdateDatabase(articles);
        Log.Information("RssWatcherJob - Done!");
    }

    /// <summary>
    /// Overload for callers that only have an IConfiguration. Reads the
    /// "database" connection string and delegates to the options overload so
    /// both entry points share a single code path.
    /// </summary>
    public void InitAndExecute(IConfiguration config)
    {
        var connectionString = config.GetConnectionString("database") ?? "";
        InitAndExecute(new RssWatcherJobOptions { ConnectionString = connectionString });
    }

    /// <summary>
    /// Rebinds the repositories to the given connection string.
    /// </summary>
    public void Init(string connectionString)
    {
        _articles = new ArticlesTable(connectionString);
        _queue = new DiscordQueueTable(connectionString);
        _source = new SourcesTable(connectionString);
    }

    /// <summary>
    /// Downloads and parses one RSS feed, returning the posts not already
    /// present in the database.
    /// </summary>
    /// <param name="url">Feed URL to read.</param>
    /// <param name="SourceID">ID of the source the posts belong to.</param>
    /// <param name="sleep">Delay in milliseconds between article page fetches,
    /// to avoid hammering the remote site.</param>
    public List<ArticlesModel> Collect(string url, Guid SourceID, int sleep = 3000)
    {
        var collectedPosts = new List<ArticlesModel>();
        using var reader = XmlReader.Create(url);
        var feed = SyndicationFeed.Load(reader);
        foreach (var post in feed.Items)
        {
            // Some feeds can publish items without links; skip them rather
            // than throw on the indexer.
            if (post.Links.Count == 0)
            {
                continue;
            }
            var articleUrl = post.Links[0].Uri.AbsoluteUri;

            // Check if we have seen the url before.
            // If we have, skip and save the site bandwidth.
            if (IsThisUrlKnown(articleUrl))
            {
                continue;
            }

            // Fetch the article page itself for thumbnail/description metadata.
            var meta = new HtmlPageReader(articleUrl);
            meta.Parse();

            collectedPosts.Add(new ArticlesModel
            {
                Title = post.Title.Text,
                Tags = FetchTags(post),
                URL = articleUrl,
                PubDate = post.PublishDate.DateTime,
                Thumbnail = meta.Data.Header.Image,
                Description = meta.Data.Header.Description,
                SourceID = SourceID
            });

            // try to not be too greedy
            Thread.Sleep(sleep);
        }
        return collectedPosts;
    }

    /// <summary>
    /// Inserts the collected posts and queues each new one for the Discord
    /// notifier. Re-checks the URL so items collected twice in one run are
    /// not inserted twice.
    /// </summary>
    public void UpdateDatabase(List<ArticlesModel> items)
    {
        foreach (var item in items)
        {
            if (IsThisUrlKnown(item.URL))
            {
                continue;
            }
            var record = _articles.New(item);
            _queue.New(new DiscordQueueModel
            {
                ArticleID = record.ID
            });
        }
    }

    // True when the articles table already holds a row for this URL.
    // NOTE(review): assumes GetByUrl returns a non-null model (with a blank
    // URL on miss) — confirm against the repository implementation.
    private bool IsThisUrlKnown(string url)
    {
        return _articles.GetByUrl(url).URL == url;
    }

    // Flattens the post's categories into a comma-separated tag string.
    // Keeps the original trailing comma ("a,b,") for backward compatibility
    // with rows already stored in that shape.
    private string FetchTags(SyndicationItem post)
    {
        return string.Concat(post.Categories.Select(c => $"{c.Name},"));
    }
}