using System.ServiceModel.Syndication;
using System.Xml;
using Newsbot.Collector.Database.Repositories;
using Newsbot.Collector.Domain.Consts;
using Newsbot.Collector.Domain.Interfaces;
using Newsbot.Collector.Domain.Models;
using Newsbot.Collector.Domain.Models.Config;
using Newsbot.Collector.Services.HtmlParser;
using Serilog;

namespace Newsbot.Collector.Services.Jobs;

/// <summary>
/// Options consumed by <see cref="RssWatcherJob.InitAndExecute"/>; populated by the
/// host's configuration binding before the job is enqueued.
/// </summary>
public class RssWatcherJobOptions
{
    public ConfigSectionConnectionStrings? ConnectionStrings { get; set; }
    public ConfigSectionRssModel? Config { get; set; }
}

/// <summary>
/// Polls all RSS sources, collects posts not yet seen, and stores them plus a
/// Discord-queue entry for each. This class was made to work with Hangfire and
/// it does not support constructors, so dependencies are wired up inside
/// <see cref="InitAndExecute"/> rather than injected.
/// </summary>
public class RssWatcherJob
{
    private const string JobName = "RssWatcherJob";

    private IArticlesRepository _articles;
    private ILogger _logger;
    private IDiscordQueueRepository _queue;
    private ISourcesRepository _source;

    // Parameterless ctor required by Hangfire; real configuration happens in
    // InitAndExecute. The empty-connection-string instances are placeholders.
    public RssWatcherJob()
    {
        _articles = new ArticlesTable("");
        _queue = new DiscordQueueTable("");
        _source = new SourcesTable("");
        _logger = JobLogger.GetLogger("", JobName);
    }

    /// <summary>
    /// Entry point used by the scheduler: wires dependencies from
    /// <paramref name="options"/>, checks the feature flag, then runs the job.
    /// </summary>
    public void InitAndExecute(RssWatcherJobOptions options)
    {
        // Null sections fall back to defaults so the job degrades gracefully
        // instead of throwing on a partially bound configuration.
        options.ConnectionStrings ??= new ConfigSectionConnectionStrings();
        options.Config ??= new ConfigSectionRssModel();

        var database = options.ConnectionStrings.Database ?? "";
        _articles = new ArticlesTable(database);
        _queue = new DiscordQueueTable(database);
        _source = new SourcesTable(database);
        _logger = JobLogger.GetLogger(options.ConnectionStrings.OpenTelemetry ?? "", JobName);

        _logger.Information($"{JobName} - Job was triggered");

        if (!options.Config.IsEnabled)
        {
            _logger.Information($"{JobName} - Going to exit because feature flag is off.");
            return;
        }

        _logger.Information($"{JobName} - Setting up the job");
        Execute();
    }

    /// <summary>
    /// Collects new posts from every RSS-typed source and persists them.
    /// </summary>
    public void Execute()
    {
        var articles = new List<ArticlesModel>();

        _logger.Information($"{JobName} - Requesting sources");
        var sources = _source.ListByType(SourceTypes.Rss);
        _logger.Information($"{JobName} - Got {sources.Count} back");

        foreach (var source in sources)
        {
            _logger.Information($"{JobName} - Starting to process '{source.Name}'");
            _logger.Information($"{JobName} - Starting to request feed to be processed");
            var results = Collect(source.Url, source.ID);
            _logger.Information($"{JobName} - Collected {results.Count} posts");
            articles.AddRange(results);
        }

        _logger.Information($"{JobName} - Sending posts over to the database");
        UpdateDatabase(articles);
        _logger.Information($"{JobName} - Done!");
    }

    /// <summary>
    /// Downloads the feed at <paramref name="url"/> and returns the posts whose
    /// URLs have not been stored before.
    /// </summary>
    /// <param name="url">Feed URL to read.</param>
    /// <param name="sourceId">Source record the collected articles belong to.</param>
    /// <param name="sleep">Milliseconds to pause between page fetches to avoid hammering the site.</param>
    public List<ArticlesModel> Collect(string url, Guid sourceId, int sleep = 3000)
    {
        var collectedPosts = new List<ArticlesModel>();

        using var reader = XmlReader.Create(url);
        var feed = SyndicationFeed.Load(reader);

        foreach (var post in feed.Items)
        {
            // A feed item without any links cannot be processed; skip it
            // rather than throwing on Links[0].
            if (post.Links.Count == 0) continue;

            var articleUrl = post.Links[0].Uri.AbsoluteUri;

            // Check if we have seen the url before.
            // If we have, skip and save the site bandwidth.
            if (IsThisUrlKnown(articleUrl)) continue;

            var meta = new HtmlPageReader(new HtmlPageReaderOptions
            {
                Url = articleUrl
            });
            meta.Parse();

            var article = new ArticlesModel
            {
                Title = post.Title.Text,
                Tags = FetchTags(post),
                URL = articleUrl,
                PubDate = post.PublishDate.DateTime,
                Thumbnail = meta.Data.Header.Image,
                Description = meta.Data.Header.Description,
                SourceID = sourceId
            };

            collectedPosts.Add(article);

            // try to not be too greedy
            Thread.Sleep(sleep);
        }

        return collectedPosts;
    }

    /// <summary>
    /// Stores each collected article and enqueues a Discord notification for it.
    /// Items with a missing URL are logged and skipped.
    /// </summary>
    public void UpdateDatabase(List<ArticlesModel> items)
    {
        foreach (var item in items)
        {
            if (item.URL is null)
            {
                Log.Warning("RSS Watcher collected a blank url and was skipped.");
                continue;
            }

            var stored = _articles.New(item);
            _queue.New(new DiscordQueueModel
            {
                ArticleID = stored.ID
            });
        }
    }

    // Returns true when an article with this exact URL already exists.
    // NOTE(review): assumes GetByUrl returns a non-null model (with a
    // non-matching URL) on a miss rather than null — confirm against the
    // repository implementation, otherwise this can NRE.
    private bool IsThisUrlKnown(string url)
    {
        var known = _articles.GetByUrl(url);
        return known.URL == url;
    }

    // Joins category names into a comma-separated string. The trailing comma
    // matches the original behavior and is preserved deliberately.
    private string FetchTags(SyndicationItem post)
    {
        var tags = new System.Text.StringBuilder();
        foreach (var tag in post.Categories)
        {
            tags.Append(tag.Name).Append(',');
        }

        return tags.ToString();
    }
}