using System.ServiceModel.Syndication;
using System.Xml;
using Microsoft.Extensions.Configuration;
using Newsbot.Collector.Database.Repositories;
using Newsbot.Collector.Domain.Consts;
using Newsbot.Collector.Domain.Interfaces;
using Newsbot.Collector.Domain.Models;
using Newsbot.Collector.Services.HtmlParser;
using Serilog;

namespace Newsbot.Collector.Services.Jobs;

/// <summary>
/// Options consumed by <see cref="RssWatcherJob.InitAndExecute(RssWatcherJobOptions)"/>.
/// </summary>
public class RssWatcherJobOptions
{
    /// <summary>Database connection string used to initialize the repositories.</summary>
    public string ConnectionString { get; set; } = "";
}

// This class was made to work with Hangfire and it does not support constructors.
public class RssWatcherJob : IHangfireJob
{
    private IArticlesRepository _articles;
    private IDiscordQueueRepository _queue;
    private ISourcesRepository _source;

    public RssWatcherJob()
    {
        // Hangfire needs a parameterless constructor; the real connection
        // string is injected later via Init/InitAndExecute.
        _articles = new ArticlesTable("");
        _queue = new DiscordQueueTable("");
        _source = new SourcesTable("");
    }

    /// <summary>
    /// Entry point used when the caller already has job options in hand.
    /// Collects new posts from all enabled RSS sources and stores them.
    /// </summary>
    public void InitAndExecute(RssWatcherJobOptions options)
    {
        Log.Information("RssWatcherJob - Job was triggered");
        Log.Information("RssWatcherJob - Setting up the job");
        Init(options.ConnectionString);

        Log.Information("RssWatcherJob - Requesting sources");
        var articles = CollectFromEnabledSources();

        Log.Information("RssWatcherJob - Sending posts over to the database");
        UpdateDatabase(articles);
        Log.Information("RssWatcherJob - Done!");
    }

    /// <summary>
    /// Entry point used when only an <see cref="IConfiguration"/> is available.
    /// Reads the "database" connection string, then collects and stores posts.
    /// </summary>
    public void InitAndExecute(IConfiguration config)
    {
        // Reach out to the config for the db connection; fall back to empty
        // (matches the constructor's placeholder behavior).
        var connectionString = config.GetConnectionString("database") ?? "";
        Init(connectionString);

        UpdateDatabase(CollectFromEnabledSources());
    }

    /// <summary>Re-points the repositories at the given database.</summary>
    public void Init(string connectionString)
    {
        _articles = new ArticlesTable(connectionString);
        _queue = new DiscordQueueTable(connectionString);
        _source = new SourcesTable(connectionString);
    }

    /// <summary>
    /// Downloads and parses the RSS feed at <paramref name="url"/>, returning
    /// article models for posts we have not seen before.
    /// </summary>
    /// <param name="url">Feed URL to read.</param>
    /// <param name="SourceID">ID of the source row the posts belong to.</param>
    /// <param name="sleep">Delay in ms between page fetches, to avoid hammering the site.</param>
    public List<ArticlesModel> Collect(string url, Guid SourceID, int sleep = 3000)
    {
        var collectedPosts = new List<ArticlesModel>();

        using var reader = XmlReader.Create(url);
        var feed = SyndicationFeed.Load(reader);

        foreach (var post in feed.Items)
        {
            // A feed item without links cannot be stored or deduplicated.
            if (post.Links.Count == 0)
            {
                continue;
            }

            var articleUrl = post.Links[0].Uri.AbsoluteUri;

            // Check if we have seen the url before.
            // If we have, skip and save the site bandwidth.
            if (IsThisUrlKnown(articleUrl))
            {
                continue;
            }

            var meta = new HtmlPageReader(articleUrl);
            meta.Parse();

            var article = new ArticlesModel
            {
                Title = post.Title.Text,
                Tags = FetchTags(post),
                URL = articleUrl,
                PubDate = post.PublishDate.DateTime,
                Thumbnail = meta.Data.Header.Image,
                Description = meta.Data.Header.Description,
                SourceID = SourceID
            };
            collectedPosts.Add(article);

            // Try to not be too greedy.
            Thread.Sleep(sleep);
        }

        return collectedPosts;
    }

    /// <summary>
    /// Stores each collected article and queues a Discord notification for it.
    /// Items without a URL are logged and skipped.
    /// </summary>
    public void UpdateDatabase(List<ArticlesModel> items)
    {
        foreach (var item in items)
        {
            if (item.URL is null)
            {
                Log.Warning("RSS Watcher collected a blank url and was skipped.");
                continue;
            }

            var saved = _articles.New(item);
            _queue.New(new DiscordQueueModel
            {
                ArticleID = saved.ID
            });
        }
    }

    // Shared collection loop for both InitAndExecute overloads: walks every
    // RSS source, skipping disabled ones (previously only the IConfiguration
    // overload honored the Enabled flag).
    private List<ArticlesModel> CollectFromEnabledSources()
    {
        var articles = new List<ArticlesModel>();

        var sources = _source.ListByType(SourceTypes.Rss);
        Log.Information("RssWatcherJob - Got {SourceCount} back", sources.Count());

        foreach (var source in sources)
        {
            if (!source.Enabled)
            {
                continue;
            }

            Log.Information("RssWatcherJob - Starting to process '{SourceName}'", source.Name);
            Log.Information("RssWatcherJob - Starting to request feed to be processed");
            var results = Collect(source.Url, source.ID);
            Log.Information("RssWatcherJob - Collected {PostCount} posts", results.Count);
            articles.AddRange(results);
        }

        return articles;
    }

    private bool IsThisUrlKnown(string url)
    {
        // Repository is expected to return a record whose URL matches when the
        // article is already stored; null-conditional guards an empty result.
        var known = _articles.GetByUrl(url);
        return known?.URL == url;
    }

    private static string FetchTags(SyndicationItem post)
    {
        // Preserves the original trailing-comma format, e.g. "news,tech,".
        return string.Concat(post.Categories.Select(tag => $"{tag.Name},"));
    }
}