From 9f3a6323a69d2a2b424ee05f7495c2b825f68ca8 Mon Sep 17 00:00:00 2001 From: James Tombleson Date: Thu, 16 Feb 2023 22:19:05 -0800 Subject: [PATCH] Features/html meta extractor (#4) * gave api access to the db project * added db models * working on rss extraction and meta extraction * test project to debug rsswatcherjob * added new configs for the project * new interface to define collectors * basic rss extraction and article details are now exposed * tests updated for rss pull * starting to get dapper working. Query works but insert seems to have a value issue * removed dapper from services * added some basic tests for db calls --- .vscode/settings.json | 6 +- .../Newsbot.Collector.Api.csproj | 1 + Newsbot.Collector.Api/Program.cs | 4 +- .../Newsbot.Collector.Database.csproj | 5 + .../Repositories/ArticlesTable.cs | 79 +++++++++++ .../Repositories/SettingsTable.cs | 34 +++++ .../Interfaces/ICollector.cs | 4 +- .../Models/DatabaseModel.cs | 3 +- Newsbot.Collector.Services/EnvLoader.cs | 77 ----------- Newsbot.Collector.Services/HtmlMeta.cs | 129 ++++++++++++++++++ Newsbot.Collector.Services/Jobs/Factory.cs | 0 .../Jobs/HelloWorldJob.cs | 7 +- .../Jobs/RssWatcherJob.cs | 66 +++++++++ .../Newsbot.Collector.Services.csproj | 4 +- .../Jobs/RssWatcherJobTest.cs | 14 ++ .../Newsbot.Collector.Tests.csproj | 5 + .../Tables/ArticlesTableTests.cs | 40 ++++++ .../Tables/SettingsTableTests.cs | 19 +++ 18 files changed, 412 insertions(+), 85 deletions(-) create mode 100644 Newsbot.Collector.Database/Repositories/ArticlesTable.cs create mode 100644 Newsbot.Collector.Database/Repositories/SettingsTable.cs delete mode 100644 Newsbot.Collector.Services/EnvLoader.cs create mode 100644 Newsbot.Collector.Services/HtmlMeta.cs delete mode 100644 Newsbot.Collector.Services/Jobs/Factory.cs create mode 100644 Newsbot.Collector.Services/Jobs/RssWatcherJob.cs create mode 100644 Newsbot.Collector.Tests/Jobs/RssWatcherJobTest.cs create mode 100644 Newsbot.Collector.Tests/Tables/ArticlesTableTests.cs create mode 100644 Newsbot.Collector.Tests/Tables/SettingsTableTests.cs diff --git a/.vscode/settings.json b/.vscode/settings.json index 1c6dc21..b285125 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,6 +1,8 @@ { "files.exclude": { "**/obj": true, - "**/bin": true - } + "**/bin": false + }, + "csharp.inlayHints.types.enabled": true, + "omnisharp.enableImportCompletion": true } \ No newline at end of file diff --git a/Newsbot.Collector.Api/Newsbot.Collector.Api.csproj b/Newsbot.Collector.Api/Newsbot.Collector.Api.csproj index d2595e3..0c9a799 100644 --- a/Newsbot.Collector.Api/Newsbot.Collector.Api.csproj +++ b/Newsbot.Collector.Api/Newsbot.Collector.Api.csproj @@ -16,6 +16,7 @@ + diff --git a/Newsbot.Collector.Api/Program.cs b/Newsbot.Collector.Api/Program.cs index 7656abc..3f0af26 100644 --- a/Newsbot.Collector.Api/Program.cs +++ b/Newsbot.Collector.Api/Program.cs @@ -1,6 +1,6 @@ using Hangfire; using Hangfire.MemoryStorage; -using Newsbot.Collector.Services; +using Newsbot.Collector.Services.Jobs; using Newsbot.Collector.Domain.Models; var builder = WebApplication.CreateBuilder(args); @@ -35,7 +35,7 @@ if (app.Environment.IsDevelopment()) app.UseHttpsRedirection(); app.UseHangfireDashboard(); -//RecurringJob.AddOrUpdate() +RecurringJob.AddOrUpdate("Example", x => x.Execute(), "0/2 * * * *"); app.UseAuthorization(); diff --git a/Newsbot.Collector.Database/Newsbot.Collector.Database.csproj b/Newsbot.Collector.Database/Newsbot.Collector.Database.csproj index a58561a..876eb77 100644 --- a/Newsbot.Collector.Database/Newsbot.Collector.Database.csproj +++ b/Newsbot.Collector.Database/Newsbot.Collector.Database.csproj @@ -4,6 +4,11 @@ + + + + + net7.0 enable diff --git a/Newsbot.Collector.Database/Repositories/ArticlesTable.cs b/Newsbot.Collector.Database/Repositories/ArticlesTable.cs new file mode 100644 index 0000000..a7a0344 --- /dev/null +++ b/Newsbot.Collector.Database/Repositories/ArticlesTable.cs @@ -0,0 +1,79 @@ +using System.Data; +using Dapper; +using Newsbot.Collector.Domain.Models; +using Npgsql; + +namespace Newsbot.Collector.Database.Repositories; + +public class ArticlesTable +{ + + private string _connectionString; + + public ArticlesTable(string connectionString) + { + _connectionString = connectionString; + } + + public static IDbConnection OpenConnection(string connectionString) + { + var cs = "Host=localhost;Username=postgres;Password=postgres;Database=postgres;sslmode=disable"; + var conn = new NpgsqlConnection(cs); + conn.Open(); + return conn; + } + + public List List(int Page = 0, int Count = 25) + { + using var conn = OpenConnection(_connectionString); + var res = conn.Query(@"select * from articles + Order By PubDate Desc + Offset @Page + Fetch Next @Count Rows Only", new { Page = Page * Count, Count = Count }).ToList(); + return res; + } + + public ArticlesModel GetById(Guid ID) + { + using var conn = OpenConnection(_connectionString); + var res = conn.Query("select * from articles where ID = @ID", new { ID = ID }); + return res.First(); + } + + public ArticlesModel GetByUrl(string url) + { + using var conn = OpenConnection(_connectionString); + var res = conn.Query("select * from articles where Url = @Url Limit 1", new { Url = url }); + return res.First(); + } + + public void New(ArticlesModel model) + { + model.ID = Guid.NewGuid(); + + using var conn = OpenConnection(_connectionString); + var q = @"INSERT INTO Articles + (ID, SourceId, Tags, Title, Url, PubDate, Video, VideoHeight, VideoWidth, Thumbnail, Description, AuthorName, AuthorImage) + Values + (@Id, @SourceId, @Tags, @Title, @Url, @PubDate, @Video, @VideoHeight, @VideoWidth, @Thumbnail, @Description, @AuthorName, @AuthorImage); + "; + var res = conn.Execute(q, model); + //new{ + // Id = Guid.NewGuid(), + // SourceId = model.SourceID, + // Tags = model.Tags, + // Title = model.Title, + // Url = model.URL, + // PubDate = model.PubDate, + // Video = model.Video, + // VideoHeight = model.VideoHeight, + // VideoWidth = model.VideoWidth, + // Thumbnail = model.Thumbnail, + // Description = model.Description, + // AuthorName = model.AuthorName, + // AuthorImage = model.AuthorImage + //}); + Console.WriteLine(res); + } + +} \ No newline at end of file diff --git a/Newsbot.Collector.Database/Repositories/SettingsTable.cs b/Newsbot.Collector.Database/Repositories/SettingsTable.cs new file mode 100644 index 0000000..1cd784f --- /dev/null +++ b/Newsbot.Collector.Database/Repositories/SettingsTable.cs @@ -0,0 +1,34 @@ +using System.Data; +using Dapper; +using Newsbot.Collector.Domain.Models; +using Npgsql; + +namespace Newsbot.Collector.Database.Repositories; + +public class SettingsTable +{ + + private string _connectionString; + + public SettingsTable(string connectionString) + { + _connectionString = connectionString; + } + + public static IDbConnection OpenConnection(string connectionString) + { + var cs = "Host=localhost;Username=postgres;Password=postgres;Database=postgres;sslmode=disable"; + var conn = new NpgsqlConnection(cs); + conn.Open(); + return conn; + } + + public void New(SettingModel model) + { + model.ID = Guid.NewGuid(); + + using var conn = OpenConnection(_connectionString); + var q = @"Insert Into Settings (ID, Key, Value, OPTIONS) Values (@ID,@Key,@Value,@Options)"; + conn.Execute(q, model); + } +} \ No newline at end of file diff --git a/Newsbot.Collector.Domain/Interfaces/ICollector.cs b/Newsbot.Collector.Domain/Interfaces/ICollector.cs index 8a6dd43..bca8bfd 100644 --- a/Newsbot.Collector.Domain/Interfaces/ICollector.cs +++ b/Newsbot.Collector.Domain/Interfaces/ICollector.cs @@ -1,6 +1,8 @@ +using Newsbot.Collector.Domain.Models; + namespace Newsbot.Collector.Domain.Interfaces; public interface ICollector { - void Collect(); + List Collect(); } \ No newline at end of file diff --git a/Newsbot.Collector.Domain/Models/DatabaseModel.cs b/Newsbot.Collector.Domain/Models/DatabaseModel.cs index 1f32a86..c3af5cb 100644 --- a/Newsbot.Collector.Domain/Models/DatabaseModel.cs +++ b/Newsbot.Collector.Domain/Models/DatabaseModel.cs @@ -5,6 +5,7 @@ public class ArticlesModel public Guid ID { get; set; } public Guid SourceID { get; set; } public string Tags { get; set; } = ""; + public string Title { get; set; } = ""; public string URL { get; set; } = ""; public DateTime PubDate { get; set; } public string Video { get; set; } = ""; @@ -72,5 +73,5 @@ public class SubscriptionModel { public Guid ID { get; set; } public Guid DiscordWebHookID { get; set; } - public Guid SourceID { get; set;} + public Guid SourceID { get; set; } } \ No newline at end of file diff --git a/Newsbot.Collector.Services/EnvLoader.cs b/Newsbot.Collector.Services/EnvLoader.cs deleted file mode 100644 index 2dd8c49..0000000 --- a/Newsbot.Collector.Services/EnvLoader.cs +++ /dev/null @@ -1,77 +0,0 @@ -using Newsbot.Collector.Domain.Models; - -namespace Newsbot.Collector.Services; - -public static class EnvLoader -{ - - public static ConfigModel Load() - { - var reddit = new RedditConfigModel - { - IsEnabled = Bool("FEATURE_ENABLE_REDDIT_BACKEND"), - PullHot = Bool("REDDIT_PULL_HOT"), - PullNsfw = Bool("REDDIT_PULL_NSFW"), - PullTop = Bool("REDDIT_PULL_TOP") - }; - - return new ConfigModel - { - ServerAddress = String("SERVER_ADDRESS"), - SqlConnectionString = String("SQL_CONNECTION_STRING"), - Reddit = reddit, - }; - } - - public static void LoadEnvFile() - { - var curDir = Directory.GetCurrentDirectory(); - var filePath = Path.Combine(curDir, ".env"); - - if (!File.Exists(filePath)) - return; - - foreach (var line in File.ReadAllLines(filePath)) - { - var parts = line.Split('=', StringSplitOptions.RemoveEmptyEntries); - - if (parts.Length != 2) - continue; - - if (parts[1].Contains("'") == true ){ - parts[1] = parts[1].Replace("'", ""); - } - - Environment.SetEnvironmentVariable(parts[0], parts[1]); - } -} - -private static string String(string Key) -{ - var result = Environment.GetEnvironmentVariable(Key); - if (result is null) - { - return ""; - } - - return result; -} - -private static bool Bool(string Key) -{ - var result = String(Key); - if (result == "") - { - return false; - } - - if (result.ToLower() == "true") - { - return true; - } - else - { - return false; - } -} -} \ No newline at end of file diff --git a/Newsbot.Collector.Services/HtmlMeta.cs b/Newsbot.Collector.Services/HtmlMeta.cs new file mode 100644 index 0000000..79b3c8d --- /dev/null +++ b/Newsbot.Collector.Services/HtmlMeta.cs @@ -0,0 +1,129 @@ +using System.Data; +using System.Runtime.Serialization; +using System.Xml; +using HtmlAgilityPack; + +namespace Newsbot.Collector.Services; + +public class HtmlData +{ + public HtmlHeaderData Header { get; set; } = new HtmlHeaderData(); +} + +public class HtmlHeaderData +{ + public HtmlMetaData Meta { get; set; } = new HtmlMetaData(); +} + +public class HtmlMetaData +{ + public string Title { get; set; } = ""; + public string Description { get; set; } = ""; + public string Image { get; set; } = ""; + public string Url { get; set; } = ""; + public string PageType { get; set; } = ""; + //public string Color { get; set; } +} + +public class HtmlPageReader +{ + + public HtmlData Data { get; set; } + + private const string XPathMetaTag = "//head/meta"; + + private string _siteContent; + + public HtmlPageReader(string pageUrl) + { + _siteContent = ReadSiteContent(pageUrl); + var tags = CollectMetaTags(); + + Data = new HtmlData(); + Data.Header.Meta.Title = GetMetaTitle(); + Data.Header.Meta.Description = GetDescription(); + Data.Header.Meta.Image = GetImage(); + Data.Header.Meta.Url = GetUrl(); + Data.Header.Meta.PageType = GetPageType(); + } + + private string ReadSiteContent(string url) + { + using var client = new HttpClient(); + var html = client.GetStringAsync(url); + html.Wait(); + + var content = html.Result.ToString(); + return content; + } + + private List CollectMetaTags() + { + var htmlDoc = new HtmlDocument(); + htmlDoc.LoadHtml(_siteContent); + + var tags = htmlDoc.DocumentNode.SelectNodes(XPathMetaTag).ToList(); + + return tags; + } + + public string GetTagValue(string Tag) + { + var tags = CollectMetaTags(); + + foreach (var meta in tags) + { + //Console.WriteLine($"Name={meta.Attributes[0].Name} & Value={meta.Attributes[0].Value}"); + if (meta.Attributes[0].Value.Contains(Tag) == false) + { + continue; + } + return meta.Attributes[1].Value; + } + return ""; + } + + private string FindFirstResult(string[] tags) + { + foreach (var tag in tags) + { + var res = GetTagValue(tag); + if (res == "") + { + continue; + } + return res; + } + return ""; + } + + public string GetMetaTitle() + { + string[] tags = new string[] { "og:title", "twitter:title", "title" }; + return FindFirstResult(tags); + } + + public string GetDescription() + { + string[] tags = new string[] { "description", "og:description" }; + return FindFirstResult(tags); + } + + public string GetImage() + { + string[] tags = new string[] { "image", "og:image", "twitter:image" }; + return FindFirstResult(tags); + } + + public string GetUrl() + { + string[] tags = new string[] { "url", "og:url", "twitter:url" }; + return FindFirstResult(tags); + } + + public string GetPageType() + { + string[] tags = new string[] { "og:type", "type" }; + return FindFirstResult(tags); + } +} \ No newline at end of file diff --git a/Newsbot.Collector.Services/Jobs/Factory.cs b/Newsbot.Collector.Services/Jobs/Factory.cs deleted file mode 100644 index e69de29..0000000 diff --git a/Newsbot.Collector.Services/Jobs/HelloWorldJob.cs b/Newsbot.Collector.Services/Jobs/HelloWorldJob.cs index 8ff4d9d..e305568 100644 --- a/Newsbot.Collector.Services/Jobs/HelloWorldJob.cs +++ b/Newsbot.Collector.Services/Jobs/HelloWorldJob.cs @@ -4,13 +4,18 @@ namespace Newsbot.Collector.Services.Jobs; public class HelloWorldJob { - public readonly string _message; + public string _message { get; set; } public HelloWorldJob(string message) { _message = message; } + public void SetMessage(string message) + { + _message = message; + } + public void Execute() { Console.WriteLine(_message); diff --git a/Newsbot.Collector.Services/Jobs/RssWatcherJob.cs b/Newsbot.Collector.Services/Jobs/RssWatcherJob.cs new file mode 100644 index 0000000..fcbbc74 --- /dev/null +++ b/Newsbot.Collector.Services/Jobs/RssWatcherJob.cs @@ -0,0 +1,66 @@ +using System.ServiceModel.Syndication; +using System.Xml; +using Newsbot.Collector.Domain.Interfaces; +using Newsbot.Collector.Domain.Models; + +namespace Newsbot.Collector.Services.Jobs; + +public class RssWatcherJob : ICollector +{ + + private string? _url; + + public RssWatcherJob(string url) + { + _url = url; + } + + public List Collect() + { + var CollectedPosts = new List(); + + if (_url is null) + { + _url = ""; + } + + using var reader = XmlReader.Create(_url); + var feed = SyndicationFeed.Load(reader); + var posts = feed.Items.ToList(); + + foreach (var post in posts) + { + var url = post.Links[0].Uri.AbsoluteUri; + + // Check if we have seen the url before + // If we have, skip and save the site bandwidth + + var meta = new HtmlPageReader(url); + + var article = new ArticlesModel + { + Title = post.Title.Text, + Tags = FetchTags(post), + URL = post.Links[0].Uri.ToString(), + PubDate = post.PublishDate.DateTime, + Thumbnail = meta.Data.Header.Meta.Image, + Description = meta.Data.Header.Meta.Description, + }; + CollectedPosts.Add(article); + + // try to not be too greedy + Thread.Sleep(3000); + } + return CollectedPosts; + } + + private string FetchTags(SyndicationItem post) + { + string result = ""; + foreach (var tag in post.Categories) + { + result += $"{tag.Name},"; + } + return result; + } +} \ No newline at end of file diff --git a/Newsbot.Collector.Services/Newsbot.Collector.Services.csproj b/Newsbot.Collector.Services/Newsbot.Collector.Services.csproj index f793c4e..36f4513 100644 --- a/Newsbot.Collector.Services/Newsbot.Collector.Services.csproj +++ b/Newsbot.Collector.Services/Newsbot.Collector.Services.csproj @@ -2,10 +2,12 @@ + - + + diff --git a/Newsbot.Collector.Tests/Jobs/RssWatcherJobTest.cs b/Newsbot.Collector.Tests/Jobs/RssWatcherJobTest.cs new file mode 100644 index 0000000..1f24394 --- /dev/null +++ b/Newsbot.Collector.Tests/Jobs/RssWatcherJobTest.cs @@ -0,0 +1,14 @@ +using Newsbot.Collector.Services.Jobs; + +namespace Newsbot.Collector.Tests.Jobs; + +public class RssWatcherJobTest +{ + [Fact] + public void CanFindItems() + { + var url = "https://www.engadget.com/rss.xml"; + var client = new RssWatcherJob(url); + var items = client.Collect(); + } +} \ No newline at end of file diff --git a/Newsbot.Collector.Tests/Newsbot.Collector.Tests.csproj b/Newsbot.Collector.Tests/Newsbot.Collector.Tests.csproj index 86a36ef..759ae6c 100644 --- a/Newsbot.Collector.Tests/Newsbot.Collector.Tests.csproj +++ b/Newsbot.Collector.Tests/Newsbot.Collector.Tests.csproj @@ -21,4 +21,9 @@ + + + + + diff --git a/Newsbot.Collector.Tests/Tables/ArticlesTableTests.cs b/Newsbot.Collector.Tests/Tables/ArticlesTableTests.cs new file mode 100644 index 0000000..80de049 --- /dev/null +++ b/Newsbot.Collector.Tests/Tables/ArticlesTableTests.cs @@ -0,0 +1,40 @@ +using Newsbot.Collector.Database.Repositories; +using Newsbot.Collector.Domain.Models; + +namespace Newsbot.Collector.Tests.Tables; + +public class ArticlesTableTests +{ + + [Fact] + public void ArticlesListTest() + { + var client = new ArticlesTable(""); + client.List(); + } + + [Fact] + public void GetByIDTest() + { + var uid = Guid.Parse("4ac46772-253c-4c3d-8a2c-29239abd2ad4"); + + var client = new ArticlesTable(""); + var res = client.GetById(uid); + if (!res.ID.Equals(uid)) + { + Assert.Fail("Incorrect record or not found"); + } + } + + [Fact] + public void NewRecordTest() + { + var client = new ArticlesTable(""); + client.New(new ArticlesModel + { + Title = "Unit Testing!", + SourceID = Guid.NewGuid(), + PubDate = DateTime.Now + }); + } +} \ No newline at end of file diff --git a/Newsbot.Collector.Tests/Tables/SettingsTableTests.cs b/Newsbot.Collector.Tests/Tables/SettingsTableTests.cs new file mode 100644 index 0000000..1f49ee5 --- /dev/null +++ b/Newsbot.Collector.Tests/Tables/SettingsTableTests.cs @@ -0,0 +1,19 @@ +using Newsbot.Collector.Database.Repositories; +using Newsbot.Collector.Domain.Models; + +namespace Newsbot.Collector.Tests.Tables; + +public class SettingsTableTests +{ + [Fact] + public void New() + { + var client = new SettingsTable(""); + client.New(new SettingModel + { + Key = "Unit Testing", + Value = "Unit", + Options = "" + }); + } +} \ No newline at end of file