using System.ServiceModel.Syndication;
using System.Xml;
using Newsbot.Collector.Database.Repositories;
using Newsbot.Collector.Domain.Consts;
using Newsbot.Collector.Domain.Entities;
using Newsbot.Collector.Domain.Interfaces;
using Newsbot.Collector.Domain.Models;
using Newsbot.Collector.Domain.Models.Config;
using Newsbot.Collector.Services.HtmlParser;
using Serilog;

namespace Newsbot.Collector.Services.Jobs;

public class RssWatcherJobOptions
{
    public string? ConnectionString { get; init; }
    public string? OpenTelemetry { get; init; }
    public bool IsEnabled { get; init; }
}

// This class was made to work with Hangfire, which does not support constructor parameters.
public class RssWatcherJob
{
    private const string JobName = "RssWatcherJob";

    private IArticlesRepository _articles;
    private ILogger _logger;
    private IDiscordQueueRepository _queue;
    private ISourcesRepository _source;

    public RssWatcherJob()
    {
        _articles = new ArticlesTable("");
        _queue = new DiscordQueueTable("");
        _source = new SourcesTable("");
        _logger = JobLogger.GetLogger("", JobName);
    }

    public void InitAndExecute(RssWatcherJobOptions options)
    {
        _articles = new ArticlesTable(options.ConnectionString ?? "");
        _queue = new DiscordQueueTable(options.ConnectionString ?? "");
        _source = new SourcesTable(options.ConnectionString ?? "");
        _logger = JobLogger.GetLogger(options.OpenTelemetry ?? "", JobName);

        _logger.Information($"{JobName} - Job was triggered.");

        if (!options.IsEnabled)
        {
            _logger.Information($"{JobName} - Going to exit because the feature flag is off.");
            return;
        }

        _logger.Information($"{JobName} - Setting up the job.");
        Execute();
    }

    public void Execute()
    {
        var articles = new List<ArticlesEntity>();

        _logger.Information($"{JobName} - Requesting sources");
        var sources = _source.ListByType(SourceTypes.Rss, 0, 100);
        _logger.Information("{JobName} - Got {SourcesCount} back", JobName, sources.Count);

        foreach (var source in sources)
        {
            _logger.Information("{JobName} - Starting to process '{SourceName}'", JobName, source.Name);
            _logger.Information($"{JobName} - Starting to request feed to be processed");
            var results = Collect(source.Url, source.Id);

            _logger.Information($"{JobName} - Collected {results.Count} posts");
            articles.AddRange(results);
        }

        _logger.Information($"{JobName} - Sending posts over to the database");
        UpdateDatabase(articles);
        _logger.Information($"{JobName} - Done!");
    }

    public List<ArticlesEntity> Collect(string url, Guid sourceId, int sleep = 3000)
    {
        var collectedPosts = new List<ArticlesEntity>();
        using var reader = XmlReader.Create(url);
        var feed = SyndicationFeed.Load(reader);

        foreach (var post in feed.Items.ToList())
        {
            var articleUrl = post.Links[0].Uri.AbsoluteUri;

            // Check if we have seen the url before.
            // If we have, skip it and save the site's bandwidth.
            if (IsThisUrlKnown(articleUrl)) continue;

            var meta = new HtmlPageReader(new HtmlPageReaderOptions
            {
                Url = articleUrl
            });
            meta.Parse();

            var article = new ArticlesEntity
            {
                Title = post.Title.Text,
                Tags = FetchTags(post),
                Url = articleUrl,
                PubDate = post.PublishDate.DateTime.ToUniversalTime(),
                Thumbnail = meta.Data.Header.Image,
                Description = meta.Data.Header.Description,
                SourceId = sourceId
            };

            collectedPosts.Add(article);

            // Try to not be too greedy with the remote site.
            Thread.Sleep(sleep);
        }

        return collectedPosts;
    }

    public void UpdateDatabase(List<ArticlesEntity> items)
    {
        foreach (var item in items)
        {
            if (item.Url is null)
            {
                Log.Warning("RSS Watcher collected a blank url and was skipped");
                continue;
            }

            // Store the article, then queue it so the Discord worker can pick it up.
            var p = _articles.New(item);
            _queue.New(new DiscordQueueEntity
            {
                ArticleId = p.Id
            });
        }
    }

    private bool IsThisUrlKnown(string url)
    {
        var isKnown = _articles.GetByUrl(url);
        return isKnown.Url == url;
    }

    private string FetchTags(SyndicationItem post)
    {
        // Flatten the feed categories into a comma-separated string.
        var result = "";
        foreach (var tag in post.Categories) result += $"{tag.Name},";
        return result;
    }
}
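
// A minimal sketch of how this job might be scheduled with Hangfire. The recurring job id,
// the cron schedule, and the way the options object is built are assumptions for illustration;
// only RssWatcherJobOptions and InitAndExecute come from the class above.
public static class RssWatcherJobSchedulerExample
{
    public static void Register(string connectionString, string openTelemetryEndpoint)
    {
        var options = new RssWatcherJobOptions
        {
            ConnectionString = connectionString,
            OpenTelemetry = openTelemetryEndpoint,
            IsEnabled = true
        };

        // Hangfire serializes the call expression and later invokes InitAndExecute on an
        // instance it creates with the parameterless constructor, which is why the job
        // rebuilds its repositories inside InitAndExecute instead of taking them as
        // constructor parameters.
        Hangfire.RecurringJob.AddOrUpdate<RssWatcherJob>(
            "rss-watcher",
            job => job.InitAndExecute(options),
            Hangfire.Cron.Hourly());
    }
}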