// Newsbot.Collector/Newsbot.Collector.Services/Jobs/YoutubeWatcherJob.cs

using System.ServiceModel.Syndication;
using System.Xml;
using Newsbot.Collector.Database.Repositories;
using Newsbot.Collector.Domain.Consts;
using Newsbot.Collector.Domain.Entities;
using Newsbot.Collector.Domain.Interfaces;
using Newsbot.Collector.Domain.Models;
using Newsbot.Collector.Services.HtmlParser;
using Serilog;
namespace Newsbot.Collector.Services.Jobs;
/// <summary>
/// Settings handed to <c>YoutubeWatcherJob.InitAndExecute</c>.
/// </summary>
public class YoutubeWatcherJobOptions
{
    /// <summary>
    /// Flag describing whether the job is enabled. Defaults to <c>true</c>.
    /// </summary>
    public bool IsEnabled { get; set; } = true;

    /// <summary>
    /// Defaults to <c>3000</c>.
    /// NOTE(review): presumably a delay in milliseconds; the unit and the consumer are not visible here - confirm.
    /// </summary>
    public int SleepTimer { get; set; } = 3000;

    /// <summary>
    /// Connection string used to construct the database repositories; may be <c>null</c>.
    /// </summary>
    public string? DatabaseConnectionString { get; set; }

    /// <summary>
    /// Connection string passed to the job logger factory; may be <c>null</c>.
    /// </summary>
    public string? OpenTelemetryConnectionString { get; set; }
}
/// <summary>
/// Job that walks the YouTube sources, reads each channel's video feed, stores the videos
/// that are not known yet and queues them for Discord.
/// </summary>
public class YoutubeWatcherJob
{
    private const string JobName = "YoutubeWatcherJob";

    private YoutubeWatcherJobOptions _options;
    private IArticlesRepository _articles;
    private IAuthorTable _author;
    private IIconsRepository _icons;
    private ILogger _logger;
    private IDiscordQueueRepository _queue;
    private ISourcesRepository _source;

    /// <summary>
    /// Creates the job with placeholder dependencies built from empty connection strings.
    /// The real dependencies are wired up by <see cref="InitAndExecute"/>.
    /// </summary>
    public YoutubeWatcherJob()
    {
        _options = new YoutubeWatcherJobOptions();
        _articles = new ArticlesTable("");
        _author = new AuthorsTable("");
        _queue = new DiscordQueueTable("");
        _source = new SourcesTable("");
        _icons = new IconsTable("");
        _logger = JobLogger.GetLogger("", JobName);
    }

    /// <summary>
    /// Builds the repositories and logger from <paramref name="options"/> and runs the job.
    /// Nothing is collected when <see cref="YoutubeWatcherJobOptions.IsEnabled"/> is <c>false</c>.
    /// </summary>
    /// <param name="options">Connection strings and switches for this run.</param>
    /// <exception cref="ArgumentNullException"><paramref name="options"/> is <c>null</c>.</exception>
    public void InitAndExecute(YoutubeWatcherJobOptions options)
    {
        if (options is null)
        {
            throw new ArgumentNullException(nameof(options));
        }

        _options = options;
        var connectionString = options.DatabaseConnectionString ?? "";

        _articles = new ArticlesTable(connectionString);
        _author = new AuthorsTable(connectionString);
        _queue = new DiscordQueueTable(connectionString);
        _source = new SourcesTable(connectionString);
        _icons = new IconsTable(connectionString);
        _logger = JobLogger.GetLogger(options.OpenTelemetryConnectionString ?? "", JobName);

        if (!_options.IsEnabled)
        {
            _logger.Warning("{JobName} - Job is disabled in the options and will not run", JobName);
            return;
        }

        Execute();
    }

    /// <summary>
    /// Processes every enabled YouTube source: resolves its channel id, reads the feed,
    /// then stores and queues each video that is not known yet.
    /// </summary>
    private void Execute()
    {
        // NOTE(review): only the result of ListByType(..., 0) is processed; the second argument
        // looks like a page index - confirm whether further pages have to be walked.
        var sources = _source.ListByType(SourceTypes.YouTube, 0);

        foreach (var source in sources)
        {
            if (!source.Enabled)
            {
                _logger.Debug($"{JobName} - {source.Name} was disabled and will be skipped.");
                continue;
            }

            var channelId = source.YoutubeId;
            if (string.IsNullOrEmpty(channelId))
            {
                channelId = GetChannelId(source.Url);
                if (string.IsNullOrEmpty(channelId))
                {
                    // Without a channel id no feed url can be built; do not persist the empty
                    // value and do not let this source break the remaining ones.
                    _logger.Warning("{JobName} - No channel id could be resolved for '{SourceName}', skipping", JobName, source.Name);
                    continue;
                }

                _source.UpdateYoutubeId(source.Id, channelId);
            }

            // Make sure we have a Icon for the channel
            var icon = _icons.GetBySourceId(source.Id);
            if (icon.Id == Guid.Empty)
            {
                _logger.Debug("{JobName} - No icon is stored for '{SourceName}'", JobName, source.Name);
            }

            _logger.Information($"{JobName} - Checking '{source.Name}'");
            var url = $"https://www.youtube.com/feeds/videos.xml?channel_id={channelId}";
            var newVideos = FindMissingPosts(url, source);
            _logger.Debug($"{JobName} - Collected {newVideos.Count} new videos");

            foreach (var video in newVideos)
            {
                // The job runs synchronously, so the lookup is blocked on here.
                var author = _author.GetById(video.AuthorId).Result;
                if (author is null)
                {
                    _logger.Warning("Missing author record for article id {VideoId}", video.Id);
                }
                else
                {
                    _logger.Debug("{JobName} - {ResultName} \'{VideoTitle}\' was found", JobName, author.Name, video.Title);
                }

                _articles.New(video);
                _queue.New(new DiscordQueueEntity
                {
                    ArticleId = video.Id
                });
            }
        }

        _logger.Information($"{JobName} - Done");
    }

    /// <summary>
    /// Parses the page at <paramref name="url"/> and returns the YouTube channel id found in
    /// its header data.
    /// </summary>
    /// <param name="url">Url of the channel page.</param>
    /// <returns>The channel id, or an empty string when the page does not expose one.</returns>
    private string GetChannelId(string url)
    {
        // Collect the Channel ID and store it for later.
        var pageReader = new HtmlPageReader(new HtmlPageReaderOptions
        {
            Url = url
        });
        pageReader.Parse();

        var id = pageReader.Data.Header.YoutubeChannelID ?? "";
        if (id == "")
        {
            _logger.Error("{JobName} - Unable to find the Youtube Channel ID for the requested url '{Url}'", JobName, url);
        }

        return id;
    }

    /// <summary>
    /// Loads the syndication feed at <paramref name="url"/> and returns an article for every
    /// feed item that <see cref="CheckFeedItem"/> does not reject.
    /// </summary>
    /// <param name="url">Url of the channel's video feed.</param>
    /// <param name="source">Source the feed belongs to; its id is stamped on each article.</param>
    /// <returns>The new articles; empty when every item was rejected.</returns>
    private List<ArticlesEntity> FindMissingPosts(string url, SourceEntity source)
    {
        var videos = new List<ArticlesEntity>();

        using var reader = XmlReader.Create(url);
        var feed = SyndicationFeed.Load(reader);

        foreach (var post in feed.Items)
        {
            var article = CheckFeedItem(post, source.Id);
            if (article is not null)
            {
                videos.Add(article);
            }
        }

        return videos;
    }

    /// <summary>
    /// Turns a feed item into an article unless its url is already stored.
    /// </summary>
    /// <param name="post">Feed item to inspect.</param>
    /// <param name="sourceId">Id of the source the item came from.</param>
    /// <returns>
    /// The new article, or <c>null</c> when the url is already known or the item carries no
    /// link or no author.
    /// </returns>
    private ArticlesEntity? CheckFeedItem(SyndicationItem post, Guid sourceId)
    {
        var link = post.Links.FirstOrDefault();
        if (link is null)
        {
            _logger.Warning("{JobName} - Feed item '{ItemId}' has no link and will be skipped", JobName, post.Id);
            return null;
        }

        var articleUrl = link.Uri.AbsoluteUri;
        if (IsThisUrlKnown(articleUrl))
        {
            return null;
        }

        var feedAuthor = post.Authors.FirstOrDefault();
        if (feedAuthor is null)
        {
            _logger.Warning("{JobName} - Feed item '{ArticleUrl}' has no author and will be skipped", JobName, articleUrl);
            return null;
        }

        var videoDetails = new HtmlPageReader(new HtmlPageReaderOptions
        {
            Url = articleUrl
        });
        videoDetails.Parse();

        // The job runs synchronously, so the author lookup/creation is blocked on here.
        var author = _author.CreateIfMissingAsync(new AuthorEntity
        {
            Image = feedAuthor.Uri,
            Name = feedAuthor.Name
        }).Result;

        return new ArticlesEntity
        {
            //Todo add the icon
            AuthorId = author.Id,
            Title = post.Title?.Text ?? "",
            Tags = FetchTags(post),
            Url = articleUrl,
            // UtcDateTime converts using the feed's own offset. DateTimeOffset.DateTime drops
            // the offset (Kind = Unspecified), so ToUniversalTime() would treat the value as
            // local time and shift it on hosts that do not run in UTC.
            PubDate = post.PublishDate.UtcDateTime,
            Thumbnail = videoDetails.Data.Header.Image,
            Description = videoDetails.Data.Header.Description,
            SourceId = sourceId,
            Video = "true"
        };
    }

    /// <summary>
    /// Reports whether the articles repository returns a record whose url equals <paramref name="url"/>.
    /// </summary>
    /// <param name="url">Url of the feed item.</param>
    /// <returns><c>true</c> when a record with that url was returned; otherwise <c>false</c>.</returns>
    private bool IsThisUrlKnown(string url)
    {
        return _articles.GetByUrl(url)?.Url == url;
    }

    /// <summary>
    /// Joins the category names of <paramref name="post"/> into one string.
    /// </summary>
    /// <param name="post">Feed item whose categories are read.</param>
    /// <returns>
    /// Every category name followed by a comma (so a non-empty result ends with a comma),
    /// or an empty string when there are no categories.
    /// </returns>
    private static string FetchTags(SyndicationItem post)
    {
        return string.Concat(post.Categories.Select(tag => $"{tag.Name},"));
    }
}