Newsbot.Collector/Newsbot.Collector.Services/Jobs/RssWatcherJob.cs

using System.ServiceModel.Syndication;
using System.Xml;
using Newsbot.Collector.Domain.Interfaces;
using Newsbot.Collector.Domain.Models;

namespace Newsbot.Collector.Services.Jobs;

/// <summary>
/// Collector that reads a single RSS/Atom feed and turns every feed item into an
/// <see cref="ArticlesModel"/>, enriching it with the image and description that
/// <c>HtmlPageReader</c> exposes for the item's link.
/// </summary>
public class RssWatcherJob : ICollector
{
    /// <summary>
    /// Pause, in milliseconds, taken after each article lookup so the source
    /// site is not hit with back-to-back requests.
    /// </summary>
    private const int DelayBetweenPostsMs = 3000;

    /// <summary>Address of the feed to load; never null (see constructor).</summary>
    private readonly string _url;

    /// <summary>
    /// Creates a job bound to one feed.
    /// </summary>
    /// <param name="url">Address of the RSS/Atom feed to read.</param>
    public RssWatcherJob(string url)
    {
        // Callers compiled without nullable checks may still pass null. It is
        // normalised to "" once here instead of mutating the field on every
        // Collect() call; XmlReader.Create then rejects the empty address.
        _url = url ?? "";
    }

    /// <summary>
    /// Loads the feed and builds one article per feed item that carries a link.
    /// </summary>
    /// <returns>
    /// The collected articles, in feed order. Items without a usable link are
    /// skipped because there is no URL to store or to look up.
    /// </returns>
    /// <remarks>
    /// Blocks the calling thread for <see cref="DelayBetweenPostsMs"/> after every
    /// processed item. Exceptions raised while opening or parsing the feed are
    /// not caught here and propagate to the caller.
    /// </remarks>
    public List<ArticlesModel> Collect()
    {
        var collectedPosts = new List<ArticlesModel>();

        using var reader = XmlReader.Create(_url);
        var feed = SyndicationFeed.Load(reader);

        foreach (var post in feed.Items)
        {
            // A feed item is not guaranteed to have any link; indexing Links[0]
            // would throw and abort the whole run, so skip such items instead.
            var link = post.Links.FirstOrDefault()?.Uri;
            if (link is null)
            {
                continue;
            }

            // TODO: Check if we have seen the url before.
            // If we have, skip and save the site bandwidth.

            // NOTE(review): HtmlPageReader presumably fetches the linked page to
            // read its meta tags - confirm against its implementation.
            var meta = new HtmlPageReader(link.AbsoluteUri);

            collectedPosts.Add(new ArticlesModel
            {
                // Title is optional in a feed item; fall back to an empty title
                // rather than failing on a null reference.
                Title = post.Title?.Text ?? "",
                Tags = FetchTags(post),
                URL = link.ToString(),
                PubDate = post.PublishDate.DateTime,
                Thumbnail = meta.Data.Header.Meta.Image,
                Description = meta.Data.Header.Meta.Description,
            });

            // try to not be too greedy
            Thread.Sleep(DelayBetweenPostsMs);
        }

        return collectedPosts;
    }

    /// <summary>
    /// Flattens the item's categories into a single string.
    /// </summary>
    /// <param name="post">Feed item whose categories are read.</param>
    /// <returns>
    /// Each category name followed by a comma (so a non-empty result ends with a
    /// trailing comma), or an empty string when the item has no categories.
    /// </returns>
    private static string FetchTags(SyndicationItem post)
    {
        // The trailing comma is kept on purpose: it is the format this method
        // has always produced, and consumers of Tags are outside this class.
        return string.Concat(post.Categories.Select(tag => $"{tag.Name},"));
    }
}
Features/html meta extractor (#4) * gave api access to the db project * added db models * working on rss extraction and meta extraction * test project to debug rsswatcherjob * added new configs for the project * new interface to define collectors * basic rss extraction and article details are now exposed * tests updated for rss pull * starting to get dapper working. Query works but insert seems to have a value issue * removed dapper from services * added some basic tests for db calls 2023-02-16 22:19:05 -08:00			`using System.ServiceModel.Syndication;`
			`using System.Xml;`
			`using Newsbot.Collector.Domain.Interfaces;`
			`using Newsbot.Collector.Domain.Models;`

			`namespace Newsbot.Collector.Services.Jobs;`

			`public class RssWatcherJob : ICollector`
			`{`

			`private string? _url;`

			`public RssWatcherJob(string url)`
			`{`
			`_url = url;`
			`}`

			`public List<ArticlesModel> Collect()`
			`{`
			`var CollectedPosts = new List<ArticlesModel>();`

			`if (_url is null)`
			`{`
			`_url = "";`
			`}`

			`using var reader = XmlReader.Create(_url);`
			`var feed = SyndicationFeed.Load(reader);`
			`var posts = feed.Items.ToList();`

			`foreach (var post in posts)`
			`{`
			`var url = post.Links[0].Uri.AbsoluteUri;`

			`// Check if we have seen the url before`
			`// If we have, skip and save the site bandwidth`

			`var meta = new HtmlPageReader(url);`

			`var article = new ArticlesModel`
			`{`
			`Title = post.Title.Text,`
			`Tags = FetchTags(post),`
			`URL = post.Links[0].Uri.ToString(),`
			`PubDate = post.PublishDate.DateTime,`
			`Thumbnail = meta.Data.Header.Meta.Image,`
			`Description = meta.Data.Header.Meta.Description,`
			`};`
			`CollectedPosts.Add(article);`

			`// try to not be too greedy`
			`Thread.Sleep(3000);`
			`}`
			`return CollectedPosts;`
			`}`

			`private string FetchTags(SyndicationItem post)`
			`{`
			`string result = "";`
			`foreach (var tag in post.Categories)`
			`{`
			`result += $"{tag.Name},";`
			`}`
			`return result;`
			`}`
			`}`