2023-02-16 22:19:05 -08:00
|
|
|
using System.ServiceModel.Syndication;
|
|
|
|
using System.Xml;
|
2023-02-19 21:39:03 -08:00
|
|
|
using Newsbot.Collector.Database.Repositories;
|
|
|
|
using Newsbot.Collector.Domain.Consts;
|
2023-02-16 22:19:05 -08:00
|
|
|
using Newsbot.Collector.Domain.Interfaces;
|
|
|
|
using Newsbot.Collector.Domain.Models;
|
2023-03-31 22:49:39 -07:00
|
|
|
using Newsbot.Collector.Domain.Models.Config;
|
2023-02-26 09:40:04 -08:00
|
|
|
using Newsbot.Collector.Services.HtmlParser;
|
|
|
|
using Serilog;
|
2023-02-16 22:19:05 -08:00
|
|
|
|
|
|
|
namespace Newsbot.Collector.Services.Jobs;
|
|
|
|
|
2023-02-19 21:39:03 -08:00
|
|
|
/// <summary>
/// Settings handed to <c>RssWatcherJob.InitAndExecute</c> for a single run of the job.
/// </summary>
public class RssWatcherJobOptions
{
    /// <summary>
    /// Connection string passed to the articles, Discord queue and sources tables.
    /// The job substitutes an empty string when this is null.
    /// </summary>
    public string? ConnectionString { get; init; }

    /// <summary>
    /// Value passed to <c>JobLogger.GetLogger</c> when the job builds its logger.
    /// The job substitutes an empty string when this is null.
    /// </summary>
    public string? OpenTelemetry { get; init; }

    /// <summary>
    /// Feature flag: when false the job logs that it is disabled and returns without collecting.
    /// </summary>
    public bool IsEnabled { get; init; }
}
|
2023-02-16 22:19:05 -08:00
|
|
|
|
2023-02-19 21:39:03 -08:00
|
|
|
/// <summary>
/// Reads every RSS source, turns unseen feed posts into articles, stores them and
/// adds a Discord queue entry for each stored article.
/// </summary>
/// <remarks>
/// This class was made to work with Hangfire and it does not support constructors, so the
/// parameterless constructor only installs placeholders and <see cref="InitAndExecute" />
/// wires up the real dependencies from the supplied options.
/// </remarks>
public class RssWatcherJob
{
    private const string JobName = "RssWatcherJob";

    // Not readonly on purpose: InitAndExecute replaces all four after construction.
    private IArticlesRepository _articles;
    private ILogger _logger;
    private IDiscordQueueRepository _queue;
    private ISourcesRepository _source;

    /// <summary>
    /// Creates the job with tables and a logger built from empty settings.
    /// </summary>
    public RssWatcherJob()
    {
        _articles = new ArticlesTable("");
        _queue = new DiscordQueueTable("");
        _source = new SourcesTable("");
        _logger = JobLogger.GetLogger("", JobName);
    }

    /// <summary>
    /// Rebuilds the repositories and logger from <paramref name="options" /> and then runs
    /// <see cref="Execute" />, unless the feature flag is off.
    /// </summary>
    /// <param name="options">Connection string, logger setting and feature flag for this run.</param>
    /// <exception cref="ArgumentNullException"><paramref name="options" /> is null.</exception>
    public void InitAndExecute(RssWatcherJobOptions options)
    {
        ArgumentNullException.ThrowIfNull(options);

        var connectionString = options.ConnectionString ?? "";
        _articles = new ArticlesTable(connectionString);
        _queue = new DiscordQueueTable(connectionString);
        _source = new SourcesTable(connectionString);
        _logger = JobLogger.GetLogger(options.OpenTelemetry ?? "", JobName);

        _logger.Information($"{JobName} - Job was triggered.");
        if (!options.IsEnabled)
        {
            _logger.Information($"{JobName} - Going to exit because feature flag is off.");
            return;
        }

        _logger.Information($"{JobName} - Setting up the job.");

        Execute();
    }

    /// <summary>
    /// Collects posts from every RSS source and sends everything that was collected to the database.
    /// </summary>
    /// <remarks>
    /// A source that fails while being collected is logged and skipped, so one broken feed
    /// does not discard the posts already gathered from the other sources.
    /// </remarks>
    public void Execute()
    {
        var articles = new List<ArticlesModel>();

        _logger.Information($"{JobName} - Requesting sources");
        var sources = _source.ListByType(SourceTypes.Rss);
        _logger.Information($"{JobName} - Got {sources.Count} back");

        foreach (var source in sources)
        {
            _logger.Information($"{JobName} - Starting to process '{source.Name}'");
            _logger.Information($"{JobName} - Starting to request feed to be processed");

            try
            {
                var results = Collect(source.Url, source.ID);

                _logger.Information($"{JobName} - Collected {results.Count} posts");
                articles.AddRange(results);
            }
            catch (Exception ex)
            {
                // The database write happens after this loop; letting the exception escape
                // would throw away everything collected from the previous sources.
                _logger.Error(ex, $"{JobName} - Failed to collect '{source.Name}', skipping this source");
            }
        }

        _logger.Information($"{JobName} - Sending posts over to the database");
        UpdateDatabase(articles);

        _logger.Information($"{JobName} - Done!");
    }

    /// <summary>
    /// Loads the feed at <paramref name="url" /> and builds an article for every post whose
    /// link is not already stored.
    /// </summary>
    /// <param name="url">Location of the RSS/Atom feed, handed to <see cref="XmlReader.Create(string)" />.</param>
    /// <param name="sourceId">Identifier of the source, copied onto every article.</param>
    /// <param name="sleep">
    /// Milliseconds to pause after each post that was turned into an article. Posts that are
    /// skipped do not trigger the pause.
    /// </param>
    /// <returns>The articles built from the feed; empty when nothing new was found.</returns>
    public List<ArticlesModel> Collect(string url, Guid sourceId, int sleep = 3000)
    {
        var collectedPosts = new List<ArticlesModel>();

        using var reader = XmlReader.Create(url);
        var feed = SyndicationFeed.Load(reader);

        foreach (var post in feed.Items)
        {
            // A post may carry no link at all, or a relative one that cannot be resolved;
            // GetAbsoluteUri returns null for the latter instead of throwing.
            var link = post.Links.FirstOrDefault()?.GetAbsoluteUri();
            if (link is null)
            {
                _logger.Warning($"{JobName} - Skipping a post because it has no usable link");
                continue;
            }

            var articleUrl = link.AbsoluteUri;

            // Check if we have seen the url before
            // If we have, skip and save the site bandwidth
            if (IsThisUrlKnown(articleUrl))
            {
                continue;
            }

            // Read the article page itself for the header image and description.
            var meta = new HtmlPageReader(new HtmlPageReaderOptions
            {
                Url = articleUrl
            });
            meta.Parse();

            var article = new ArticlesModel
            {
                // Feed items are allowed to omit the title.
                Title = post.Title?.Text ?? "",
                Tags = FetchTags(post),
                URL = articleUrl,
                PubDate = post.PublishDate.DateTime,
                Thumbnail = meta.Data.Header.Image,
                Description = meta.Data.Header.Description,
                SourceID = sourceId
            };

            collectedPosts.Add(article);

            // try to not be too greedy
            Thread.Sleep(sleep);
        }

        return collectedPosts;
    }

    /// <summary>
    /// Stores each article and adds a Discord queue entry that points at the stored record.
    /// </summary>
    /// <param name="items">Articles to store; entries with a blank URL are logged and skipped.</param>
    public void UpdateDatabase(List<ArticlesModel> items)
    {
        foreach (var item in items)
        {
            if (string.IsNullOrWhiteSpace(item.URL))
            {
                _logger.Warning("RSS Watcher collected a blank url and was skipped.");
                continue;
            }

            var stored = _articles.New(item);
            _queue.New(new DiscordQueueModel
            {
                ArticleID = stored.ID
            });
        }
    }

    /// <summary>
    /// Tells whether the articles repository returns a record whose URL equals <paramref name="url" />.
    /// </summary>
    private bool IsThisUrlKnown(string url)
    {
        return _articles.GetByUrl(url).URL == url;
    }

    /// <summary>
    /// Joins the category names of <paramref name="post" /> into one string.
    /// </summary>
    /// <returns>
    /// Every name followed by a comma (so a non-empty result ends with a comma), or an
    /// empty string when the post has no categories.
    /// </returns>
    private string FetchTags(SyndicationItem post)
    {
        // The trailing comma is kept so stored values look the same as before.
        return string.Concat(post.Categories.Select(tag => $"{tag.Name},"));
    }
}
|