66 lines
1.6 KiB
C#
66 lines
1.6 KiB
C#
|
using System.ServiceModel.Syndication;
|
||
|
using System.Xml;
|
||
|
using Newsbot.Collector.Domain.Interfaces;
|
||
|
using Newsbot.Collector.Domain.Models;
|
||
|
|
||
|
namespace Newsbot.Collector.Services.Jobs;
|
||
|
|
||
|
/// <summary>
/// Collects the entries of a single syndication (RSS/Atom) feed and turns each
/// one into an <see cref="ArticlesModel"/>, enriched with the thumbnail and
/// description read from the linked article page.
/// </summary>
public class RssWatcherJob : ICollector
{
    /// <summary>
    /// Pause, in milliseconds, taken after each article page is read so the
    /// source site is not hit with back-to-back requests.
    /// </summary>
    private const int FetchDelayMilliseconds = 3000;

    private readonly string _url;

    /// <summary>
    /// Creates a job that reads the feed located at <paramref name="url"/>.
    /// </summary>
    /// <param name="url">Address of the feed document to load.</param>
    public RssWatcherJob(string url)
    {
        // A null URL is normalised to an empty string here (Collect used to do
        // this on every call); the failure still surfaces when Collect runs.
        _url = url ?? string.Empty;
    }

    /// <summary>
    /// Loads the feed and builds one <see cref="ArticlesModel"/> per usable entry.
    /// </summary>
    /// <returns>
    /// The collected articles, in feed order. Entries without an absolute link
    /// are skipped because there is no page to read metadata from.
    /// </returns>
    /// <exception cref="ArgumentException">
    /// Thrown by <see cref="XmlReader.Create(string)"/> when the configured URL is empty.
    /// </exception>
    public List<ArticlesModel> Collect()
    {
        var collectedPosts = new List<ArticlesModel>();

        using var reader = XmlReader.Create(_url);
        var feed = SyndicationFeed.Load(reader);

        foreach (var post in feed.Items)
        {
            // Feed items are not guaranteed to carry a link, and a relative link
            // cannot be resolved to a page. Skip those instead of letting one bad
            // entry abort the whole collection run.
            var link = post.Links.FirstOrDefault()?.Uri;
            if (link is not { IsAbsoluteUri: true })
            {
                continue;
            }

            // TODO: check whether this URL was already collected and, if so,
            // skip it to save the site's bandwidth.

            // NOTE(review): HtmlPageReader presumably downloads the page in its
            // constructor - confirm against its implementation.
            var meta = new HtmlPageReader(link.AbsoluteUri);

            collectedPosts.Add(new ArticlesModel
            {
                // An entry may have no title element; store an empty title then.
                Title = post.Title?.Text ?? string.Empty,
                Tags = FetchTags(post),
                URL = link.ToString(),
                PubDate = post.PublishDate.DateTime,
                Thumbnail = meta.Data.Header.Meta.Image,
                Description = meta.Data.Header.Meta.Description,
            });

            // Try to not be too greedy with the source site.
            Thread.Sleep(FetchDelayMilliseconds);
        }

        return collectedPosts;
    }

    /// <summary>
    /// Flattens the categories of <paramref name="post"/> into a single string.
    /// </summary>
    /// <param name="post">Feed entry whose categories are read.</param>
    /// <returns>
    /// Each category name followed by a comma (so a non-empty result ends with
    /// a trailing comma), or an empty string when the entry has no categories.
    /// </returns>
    private static string FetchTags(SyndicationItem post)
    {
        // The trailing comma is kept on purpose: it is the format this method
        // has always produced, and consumers of Tags are outside this file.
        return string.Concat(post.Categories.Select(category => $"{category.Name},"));
    }
}
|