using System.ServiceModel.Syndication;
using System.Xml;
using Newsbot.Collector.Database.Repositories;
using Newsbot.Collector.Domain.Consts;
using Newsbot.Collector.Domain.Interfaces;
using Newsbot.Collector.Domain.Models;
using Newsbot.Collector.Services.HtmlParser;
using Serilog;

namespace Newsbot.Collector.Services.Jobs;

/// <summary>
/// Options consumed by <see cref="RssWatcherJob.Init"/>: the database connection
/// string and the OpenTelemetry endpoint used by the Serilog sink.
/// </summary>
public class RssWatcherJobOptions
{
    public string? ConnectionString { get; init; }
    public string? OpenTelemetry { get; init; }
}

/// <summary>
/// Polls every RSS source, collects unseen articles, stores them, and queues
/// them for Discord delivery.
/// </summary>
// This class was made to work with Hangfire and it does not support constructors.
public class RssWatcherJob
{
    private IArticlesRepository _articles;
    private ILogger _logger;
    private IDiscordQueueRepository _queue;
    private ISourcesRepository _source;

    // Hangfire requires a parameterless constructor; real dependencies are
    // wired up later via Init/InitAndExecute with placeholder config here.
    public RssWatcherJob()
    {
        _articles = new ArticlesTable("");
        _queue = new DiscordQueueTable("");
        _source = new SourcesTable("");
        _logger = GetLogger("");
    }

    /// <summary>
    /// Entry point for Hangfire: configures dependencies from
    /// <paramref name="options"/> and immediately runs the job.
    /// </summary>
    public void InitAndExecute(RssWatcherJobOptions options)
    {
        Init(options);
        _logger.Information("RssWatcherJob - Job was triggered");
        _logger.Information("RssWatcherJob - Setting up the job");
        Execute();
    }

    // Builds the job logger. NOTE: this also replaces the global Log.Logger as
    // a side effect; connectionString here is the OpenTelemetry endpoint.
    private ILogger GetLogger(string connectionString)
    {
        return Log.Logger = new LoggerConfiguration()
            .WriteTo.Console()
            .WriteTo.OpenTelemetry(
                connectionString,
                resourceAttributes: new Dictionary<string, object>
                {
                    { "service.name", "newsbot-collector-api" },
                    { "Job", "RssWatcherJob" }
                })
            .CreateLogger();
    }

    /// <summary>
    /// Re-creates the repositories and logger from the supplied options.
    /// Falls back to empty strings when a value is missing, matching the
    /// parameterless-constructor behavior.
    /// </summary>
    public void Init(RssWatcherJobOptions options)
    {
        _articles = new ArticlesTable(options.ConnectionString ?? "");
        _queue = new DiscordQueueTable(options.ConnectionString ?? "");
        _source = new SourcesTable(options.ConnectionString ?? "");
        _logger = GetLogger(options.OpenTelemetry ?? "");
    }

    /// <summary>
    /// Processes every RSS-type source: collects new posts per source, then
    /// writes the whole batch to the database and Discord queue.
    /// </summary>
    public void Execute()
    {
        var articles = new List<ArticlesModel>();

        _logger.Information("RssWatcherJob - Requesting sources");
        var sources = _source.ListByType(SourceTypes.Rss);
        // Serilog message templates (not interpolation) so values land as
        // structured properties in the OpenTelemetry sink.
        _logger.Information("RssWatcherJob - Got {Count} back", sources.Count);

        foreach (var source in sources)
        {
            _logger.Information("RssWatcherJob - Starting to process '{Name}'", source.Name);
            _logger.Information("RssWatcherJob - Starting to request feed to be processed");
            var results = Collect(source.Url, source.ID);
            _logger.Information("RssWatcherJob - Collected {Count} posts", results.Count);
            articles.AddRange(results);
        }

        _logger.Information("RssWatcherJob - Sending posts over to the database");
        UpdateDatabase(articles);
        _logger.Information("RssWatcherJob - Done!");
    }

    /// <summary>
    /// Downloads and parses one RSS feed, returning models for posts whose
    /// URLs have not been seen before.
    /// </summary>
    /// <param name="url">Feed URL to fetch.</param>
    /// <param name="sourceId">Source record the collected articles belong to.</param>
    /// <param name="sleep">Delay in ms between page fetches, to avoid hammering the site.</param>
    public List<ArticlesModel> Collect(string url, Guid sourceId, int sleep = 3000)
    {
        var collectedPosts = new List<ArticlesModel>();
        using var reader = XmlReader.Create(url);
        var feed = SyndicationFeed.Load(reader);
        foreach (var post in feed.Items.ToList())
        {
            // Guard: a malformed item with no links would otherwise throw on [0].
            if (post.Links.Count == 0) continue;
            var articleUrl = post.Links[0].Uri.AbsoluteUri;

            // Check if we have seen the url before
            // If we have, skip and save the site bandwidth
            if (IsThisUrlKnown(articleUrl)) continue;

            // Pull Open Graph-style metadata (thumbnail, description) from the page.
            var meta = new HtmlPageReader(articleUrl);
            meta.Parse();

            var article = new ArticlesModel
            {
                Title = post.Title.Text,
                Tags = FetchTags(post),
                URL = articleUrl,
                PubDate = post.PublishDate.DateTime,
                Thumbnail = meta.Data.Header.Image,
                Description = meta.Data.Header.Description,
                SourceID = sourceId
            };
            collectedPosts.Add(article);

            // try to not be too greedy
            Thread.Sleep(sleep);
        }

        return collectedPosts;
    }

    /// <summary>
    /// Persists collected articles and enqueues each stored article for the
    /// Discord watcher. Articles with a missing URL are skipped with a warning.
    /// </summary>
    public void UpdateDatabase(List<ArticlesModel> items)
    {
        foreach (var item in items)
        {
            if (item.URL is null)
            {
                Log.Warning("RSS Watcher collected a blank url and was skipped.");
                continue;
            }

            var p = _articles.New(item);
            _queue.New(new DiscordQueueModel { ArticleID = p.ID });
        }
    }

    // True when the articles table already holds a row for this exact URL.
    // NOTE(review): assumes GetByUrl returns a blank model (not null) for
    // unknown URLs — null-conditional added defensively; verify repository contract.
    private bool IsThisUrlKnown(string url)
    {
        var isKnown = _articles.GetByUrl(url);
        return isKnown?.URL == url;
    }

    // Flattens the post's categories into "a,b,c," — the trailing comma is
    // preserved because downstream storage already expects that shape.
    private string FetchTags(SyndicationItem post)
    {
        return string.Concat(post.Categories.Select(tag => $"{tag.Name},"));
    }
}