Newsbot.Collector/Newsbot.Collector.Services/Jobs/RssWatcherJob.cs
James Tombleson 9f3a6323a6
Features/html meta extractor (#4)
* gave api access to the db project

* added db models

* working on rss extraction and meta extraction

* test project to debug rsswatcherjob

* added new configs for the project

* new interface to define collectors

* basic rss extraction and article details are now exposed

* tests updated for rss pull

* starting to get Dapper working. The query works, but the insert seems to have a value issue

* removed dapper from services

* added some basic tests for db calls
2023-02-16 22:19:05 -08:00

66 lines
1.6 KiB
C#

using System.ServiceModel.Syndication;
using System.Xml;
using Newsbot.Collector.Domain.Interfaces;
using Newsbot.Collector.Domain.Models;
namespace Newsbot.Collector.Services.Jobs;
/// <summary>
/// Collects articles from a single RSS/Atom feed and enriches every entry with
/// the thumbnail and description read from the article's own HTML page.
/// </summary>
public class RssWatcherJob : ICollector
{
    /// <summary>
    /// Pause inserted between two article page requests so the source site is
    /// not hit with a burst of traffic.
    /// </summary>
    private static readonly TimeSpan DelayBetweenRequests = TimeSpan.FromSeconds(3);

    private readonly string? _url;

    /// <summary>
    /// Creates a job bound to one feed.
    /// </summary>
    /// <param name="url">Address of the RSS/Atom feed to read.</param>
    public RssWatcherJob(string url)
    {
        _url = url;
    }

    /// <summary>
    /// Downloads the feed, then requests each linked article page to pull its
    /// meta image and description.
    /// </summary>
    /// <returns>
    /// One <see cref="ArticlesModel"/> per feed item that carries an absolute
    /// link; items without a usable link are skipped.
    /// </returns>
    /// <exception cref="ArgumentException">
    /// Thrown by <see cref="XmlReader.Create(string)"/> when no feed URL was supplied.
    /// </exception>
    public List<ArticlesModel> Collect()
    {
        var collectedPosts = new List<ArticlesModel>();

        // A missing URL is passed on as an empty string so the failure mode stays
        // the ArgumentException raised by XmlReader.Create, without mutating the field.
        using var reader = XmlReader.Create(_url ?? string.Empty);
        var feed = SyndicationFeed.Load(reader);

        var requestedAny = false;
        foreach (var post in feed.Items)
        {
            // Feed items are not required to carry a link, and AbsoluteUri throws
            // on relative URIs; without an absolute link there is no page to read.
            var link = post.Links.FirstOrDefault()?.Uri;
            if (link is null || !link.IsAbsoluteUri)
            {
                continue;
            }

            // TODO: check if we have seen the url before.
            // If we have, skip and save the site bandwidth.

            // Try to not be too greedy: wait between page requests, but not
            // before the first one or after the last one.
            if (requestedAny)
            {
                Thread.Sleep(DelayBetweenRequests);
            }
            requestedAny = true;

            var meta = new HtmlPageReader(link.AbsoluteUri);

            var article = new ArticlesModel
            {
                // Title is optional in a feed item; fall back to an empty string.
                Title = post.Title?.Text ?? string.Empty,
                Tags = FetchTags(post),
                URL = link.ToString(),
                // NOTE(review): DateTimeOffset.DateTime drops the offset; confirm
                // whether consumers expect UTC here.
                PubDate = post.PublishDate.DateTime,
                Thumbnail = meta.Data.Header.Meta.Image,
                Description = meta.Data.Header.Meta.Description,
            };
            collectedPosts.Add(article);
        }

        return collectedPosts;
    }

    /// <summary>
    /// Flattens the categories of a feed item into a comma separated list
    /// (no trailing separator); returns an empty string when there are none.
    /// </summary>
    /// <param name="post">Feed item whose category names are joined.</param>
    private static string FetchTags(SyndicationItem post)
    {
        return string.Join(",", post.Categories.Select(tag => tag.Name));
    }
}