Features/html meta extractor (#4)

* gave api access to the db project

* added db models

* working on rss extraction and meta extraction

* test project to debug rsswatcherjob

* added new configs for the project

* new interface to define collectors

* basic rss extraction and article details are now exposed

* tests updated for rss pull

* starting to get Dapper working. Queries work, but inserts seem to have a value issue

* removed dapper from services

* added some basic tests for db calls
This commit is contained in:
James Tombleson 2023-02-16 22:19:05 -08:00 committed by GitHub
parent 9f5772551c
commit 9f3a6323a6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
18 changed files with 412 additions and 85 deletions

View File

@ -1,6 +1,8 @@
{
"files.exclude": {
"**/obj": true,
"**/bin": true
}
"**/bin": false
},
"csharp.inlayHints.types.enabled": true,
"omnisharp.enableImportCompletion": true
}

View File

@ -16,6 +16,7 @@
<ItemGroup>
<ProjectReference Include="..\Newsbot.Collector.Domain\Newsbot.Collector.Domain.csproj" />
<ProjectReference Include="..\Newsbot.Collector.Services\Newsbot.Collector.Services.csproj" />
<ProjectReference Include="..\Newsbot.Collector.Database\Newsbot.Collector.Database.csproj" />
</ItemGroup>
</Project>

View File

@ -1,6 +1,6 @@
using Hangfire;
using Hangfire.MemoryStorage;
using Newsbot.Collector.Services;
using Newsbot.Collector.Services.Jobs;
using Newsbot.Collector.Domain.Models;
var builder = WebApplication.CreateBuilder(args);
@ -35,7 +35,7 @@ if (app.Environment.IsDevelopment())
app.UseHttpsRedirection();
app.UseHangfireDashboard();
//RecurringJob.AddOrUpdate()
RecurringJob.AddOrUpdate<HelloWorldJob>("Example", x => x.Execute(), "0/2 * * * *");
app.UseAuthorization();

View File

@ -4,6 +4,11 @@
<ProjectReference Include="..\Newsbot.Collector.Domain\Newsbot.Collector.Domain.csproj" />
</ItemGroup>
<ItemGroup>
<PackageReference Include="dapper" Version="2.0.123" />
<PackageReference Include="Npgsql" Version="7.0.2" />
</ItemGroup>
<PropertyGroup>
<TargetFramework>net7.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>

View File

@ -0,0 +1,79 @@
using System.Data;
using Dapper;
using Newsbot.Collector.Domain.Models;
using Npgsql;
namespace Newsbot.Collector.Database.Repositories;
/// <summary>
/// Data access for the <c>articles</c> table, implemented with Dapper on top of Npgsql.
/// Every call opens its own connection and disposes it when the call finishes.
/// </summary>
public class ArticlesTable
{
    // Local development database. Used only when the caller does not supply a
    // connection string, so existing callers that pass "" keep working.
    private const string DefaultConnectionString =
        "Host=localhost;Username=postgres;Password=postgres;Database=postgres;sslmode=disable";

    private readonly string _connectionString;

    /// <summary>Creates a repository bound to the given connection string.</summary>
    /// <param name="connectionString">
    /// Npgsql connection string. When null, empty or whitespace the local
    /// development default is used instead.
    /// </param>
    public ArticlesTable(string connectionString)
    {
        _connectionString = connectionString;
    }

    /// <summary>Opens a new PostgreSQL connection.</summary>
    /// <param name="connectionString">
    /// Connection string to use; falls back to the local development default
    /// when null, empty or whitespace.
    /// </param>
    /// <returns>An open connection that the caller must dispose.</returns>
    public static IDbConnection OpenConnection(string connectionString)
    {
        // Honor the supplied value; previously it was ignored entirely.
        var cs = string.IsNullOrWhiteSpace(connectionString)
            ? DefaultConnectionString
            : connectionString;

        var conn = new NpgsqlConnection(cs);
        try
        {
            conn.Open();
        }
        catch
        {
            // Do not leak the connection object when the server is unreachable.
            conn.Dispose();
            throw;
        }

        return conn;
    }

    /// <summary>Returns one page of articles, newest <c>PubDate</c> first.</summary>
    /// <param name="Page">Zero-based page index; the row offset is <c>Page * Count</c>.</param>
    /// <param name="Count">Maximum number of rows to return.</param>
    public List<ArticlesModel> List(int Page = 0, int Count = 25)
    {
        using var conn = OpenConnection(_connectionString);
        var res = conn.Query<ArticlesModel>(@"select * from articles
            Order By PubDate Desc
            Offset @Page
            Fetch Next @Count Rows Only", new { Page = Page * Count, Count = Count }).ToList();
        return res;
    }

    /// <summary>Returns the article with the given ID.</summary>
    /// <exception cref="InvalidOperationException">No row matched (raised by <c>First()</c>).</exception>
    public ArticlesModel GetById(Guid ID)
    {
        using var conn = OpenConnection(_connectionString);
        var res = conn.Query<ArticlesModel>("select * from articles where ID = @ID", new { ID = ID });
        return res.First();
    }

    /// <summary>Returns the first article stored with the given URL.</summary>
    /// <exception cref="InvalidOperationException">No row matched (raised by <c>First()</c>).</exception>
    public ArticlesModel GetByUrl(string url)
    {
        using var conn = OpenConnection(_connectionString);
        var res = conn.Query<ArticlesModel>("select * from articles where Url = @Url Limit 1", new { Url = url });
        return res.First();
    }

    /// <summary>
    /// Inserts a new article. The model's <c>ID</c> is overwritten with a freshly
    /// generated GUID before the insert, so the caller can read it back afterwards.
    /// </summary>
    public void New(ArticlesModel model)
    {
        model.ID = Guid.NewGuid();
        using var conn = OpenConnection(_connectionString);
        var q = @"INSERT INTO Articles
            (ID, SourceId, Tags, Title, Url, PubDate, Video, VideoHeight, VideoWidth, Thumbnail, Description, AuthorName, AuthorImage)
            Values
            (@Id, @SourceId, @Tags, @Title, @Url, @PubDate, @Video, @VideoHeight, @VideoWidth, @Thumbnail, @Description, @AuthorName, @AuthorImage);
            ";
        var res = conn.Execute(q, model);

        // Writes the value returned by Execute to stdout (kept from the original implementation).
        Console.WriteLine(res);
    }
}

View File

@ -0,0 +1,34 @@
using System.Data;
using Dapper;
using Newsbot.Collector.Domain.Models;
using Npgsql;
namespace Newsbot.Collector.Database.Repositories;
/// <summary>
/// Data access for the <c>Settings</c> table, implemented with Dapper on top of Npgsql.
/// </summary>
public class SettingsTable
{
    // Local development database. Used only when the caller does not supply a
    // connection string, so existing callers that pass "" keep working.
    private const string DefaultConnectionString =
        "Host=localhost;Username=postgres;Password=postgres;Database=postgres;sslmode=disable";

    private readonly string _connectionString;

    /// <summary>Creates a repository bound to the given connection string.</summary>
    /// <param name="connectionString">
    /// Npgsql connection string. When null, empty or whitespace the local
    /// development default is used instead.
    /// </param>
    public SettingsTable(string connectionString)
    {
        _connectionString = connectionString;
    }

    /// <summary>Opens a new PostgreSQL connection.</summary>
    /// <param name="connectionString">
    /// Connection string to use; falls back to the local development default
    /// when null, empty or whitespace.
    /// </param>
    /// <returns>An open connection that the caller must dispose.</returns>
    public static IDbConnection OpenConnection(string connectionString)
    {
        // Honor the supplied value; previously it was ignored entirely.
        var cs = string.IsNullOrWhiteSpace(connectionString)
            ? DefaultConnectionString
            : connectionString;

        var conn = new NpgsqlConnection(cs);
        try
        {
            conn.Open();
        }
        catch
        {
            // Do not leak the connection object when the server is unreachable.
            conn.Dispose();
            throw;
        }

        return conn;
    }

    /// <summary>
    /// Inserts a new setting row. The model's <c>ID</c> is overwritten with a
    /// freshly generated GUID before the insert.
    /// </summary>
    public void New(SettingModel model)
    {
        model.ID = Guid.NewGuid();
        using var conn = OpenConnection(_connectionString);
        var q = @"Insert Into Settings (ID, Key, Value, OPTIONS) Values (@ID,@Key,@Value,@Options)";
        conn.Execute(q, model);
    }
}

View File

@ -1,6 +1,8 @@
using Newsbot.Collector.Domain.Models;
namespace Newsbot.Collector.Domain.Interfaces;
public interface ICollector
{
void Collect();
List<ArticlesModel> Collect();
}

View File

@ -5,6 +5,7 @@ public class ArticlesModel
public Guid ID { get; set; }
public Guid SourceID { get; set; }
public string Tags { get; set; } = "";
public string Title { get; set; } = "";
public string URL { get; set; } = "";
public DateTime PubDate { get; set; }
public string Video { get; set; } = "";
@ -72,5 +73,5 @@ public class SubscriptionModel
{
public Guid ID { get; set; }
public Guid DiscordWebHookID { get; set; }
public Guid SourceID { get; set;}
public Guid SourceID { get; set; }
}

View File

@ -1,77 +0,0 @@
using Newsbot.Collector.Domain.Models;
namespace Newsbot.Collector.Services;
/// <summary>
/// Builds the application configuration from process environment variables and
/// can seed those variables from a <c>.env</c> file in the current directory.
/// </summary>
public static class EnvLoader
{
    /// <summary>
    /// Reads the known environment variables into a <see cref="ConfigModel"/>.
    /// Missing string values become <c>""</c>; missing or non-"true" flags become <c>false</c>.
    /// </summary>
    public static ConfigModel Load()
    {
        var reddit = new RedditConfigModel
        {
            IsEnabled = Bool("FEATURE_ENABLE_REDDIT_BACKEND"),
            PullHot = Bool("REDDIT_PULL_HOT"),
            PullNsfw = Bool("REDDIT_PULL_NSFW"),
            PullTop = Bool("REDDIT_PULL_TOP")
        };

        return new ConfigModel
        {
            ServerAddress = String("SERVER_ADDRESS"),
            SqlConnectionString = String("SQL_CONNECTION_STRING"),
            Reddit = reddit,
        };
    }

    /// <summary>
    /// Loads <c>KEY=VALUE</c> pairs from <c>.env</c> in the current directory into
    /// the process environment. Does nothing when the file does not exist.
    /// Lines without a key or without a value are skipped, and single quotes are
    /// stripped from values.
    /// </summary>
    public static void LoadEnvFile()
    {
        var filePath = Path.Combine(Directory.GetCurrentDirectory(), ".env");
        if (!File.Exists(filePath))
        {
            return;
        }

        foreach (var line in File.ReadAllLines(filePath))
        {
            // Split on the FIRST '=' only. Values such as connection strings
            // ("Host=...;Username=...") contain '=' themselves and used to be
            // dropped by the old "exactly two parts" check.
            var separator = line.IndexOf('=');
            if (separator <= 0 || separator == line.Length - 1)
            {
                continue;
            }

            var key = line.Substring(0, separator);
            var value = line.Substring(separator + 1).Replace("'", "");
            Environment.SetEnvironmentVariable(key, value);
        }
    }

    /// <summary>Returns the environment variable's value, or <c>""</c> when it is not set.</summary>
    private static string String(string Key)
    {
        return Environment.GetEnvironmentVariable(Key) ?? "";
    }

    /// <summary>Returns true only when the variable is set to "true" (any casing).</summary>
    private static bool Bool(string Key)
    {
        // Ordinal, case-insensitive compare instead of a culture-sensitive ToLower().
        return string.Equals(String(Key), "true", StringComparison.OrdinalIgnoreCase);
    }
}

View File

@ -0,0 +1,129 @@
using System.Data;
using System.Runtime.Serialization;
using System.Xml;
using HtmlAgilityPack;
namespace Newsbot.Collector.Services;
/// <summary>Root container for the values extracted from an HTML page.</summary>
public class HtmlData
{
    /// <summary>Values taken from the page header; never null by default.</summary>
    public HtmlHeaderData Header { get; set; } = new();
}
/// <summary>Holds the values extracted from the page header.</summary>
public class HtmlHeaderData
{
    /// <summary>Values read from the header's meta tags; never null by default.</summary>
    public HtmlMetaData Meta { get; set; } = new();
}
/// <summary>
/// Plain container for meta tag values. Every property starts as an empty string.
/// </summary>
public class HtmlMetaData
{
    /// <summary>Page title.</summary>
    public string Title { get; set; } = string.Empty;

    /// <summary>Page description.</summary>
    public string Description { get; set; } = string.Empty;

    /// <summary>Image reference for the page.</summary>
    public string Image { get; set; } = string.Empty;

    /// <summary>URL reported by the page.</summary>
    public string Url { get; set; } = string.Empty;

    /// <summary>Page type reported by the page.</summary>
    public string PageType { get; set; } = string.Empty;

    //public string Color { get; set; }
}
/// <summary>
/// Downloads a page and extracts a handful of values from the
/// <c>&lt;head&gt;&lt;meta&gt;</c> tags (title, description, image, url, type).
/// The download and the extraction both happen in the constructor; the results
/// are exposed through <see cref="Data"/>.
/// </summary>
public class HtmlPageReader
{
    private const string XPathMetaTag = "//head/meta";

    // A single shared client: creating one HttpClient per page exhausts sockets
    // under load, and HttpClient is designed to be reused.
    private static readonly HttpClient Client = new HttpClient();

    private readonly string _siteContent;

    // Parsed once in the constructor and reused by every lookup.
    private readonly List<HtmlNode> _metaTags;

    /// <summary>Values extracted from the page.</summary>
    public HtmlData Data { get; set; }

    /// <summary>Downloads <paramref name="pageUrl"/> and extracts its meta values.</summary>
    /// <param name="pageUrl">Absolute URL of the page to read.</param>
    /// <exception cref="AggregateException">The download failed (wraps the HTTP error).</exception>
    public HtmlPageReader(string pageUrl)
    {
        _siteContent = ReadSiteContent(pageUrl);
        _metaTags = CollectMetaTags();

        Data = new HtmlData();
        Data.Header.Meta.Title = GetMetaTitle();
        Data.Header.Meta.Description = GetDescription();
        Data.Header.Meta.Image = GetImage();
        Data.Header.Meta.Url = GetUrl();
        Data.Header.Meta.PageType = GetPageType();
    }

    /// <summary>Downloads the page body as a string, blocking until it completes.</summary>
    private string ReadSiteContent(string url)
    {
        // The constructor is synchronous, so the request is awaited by blocking.
        // .Result keeps the original AggregateException wrapping for callers.
        return Client.GetStringAsync(url).Result;
    }

    /// <summary>
    /// Parses the downloaded HTML and returns every meta tag under head.
    /// Returns an empty list when the page has none.
    /// </summary>
    private List<HtmlNode> CollectMetaTags()
    {
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(_siteContent);

        // SelectNodes returns null (not an empty collection) when nothing matches.
        var nodes = htmlDoc.DocumentNode.SelectNodes(XPathMetaTag);
        return nodes is null ? new List<HtmlNode>() : nodes.ToList();
    }

    /// <summary>
    /// Returns the content of the first meta tag whose key contains <paramref name="Tag"/>.
    /// </summary>
    /// <param name="Tag">Substring to look for in the tag's key, e.g. <c>og:title</c>.</param>
    /// <returns>The tag's content, or <c>""</c> when no tag matches.</returns>
    public string GetTagValue(string Tag)
    {
        foreach (var meta in _metaTags)
        {
            var attributes = meta.Attributes;

            // Tags such as <meta charset="utf-8"> carry no key/content pair.
            if (attributes.Count < 2)
            {
                continue;
            }

            // Prefer the attributes by name so that attribute order in the markup
            // does not matter; fall back to the positional layout otherwise.
            var content = attributes["content"] ?? attributes[1];
            var key = attributes["property"]
                ?? attributes["name"]
                ?? attributes.First(a => !ReferenceEquals(a, content));

            if (key.Value.Contains(Tag))
            {
                return content.Value;
            }
        }

        return "";
    }

    /// <summary>Returns the first non-empty value found for the given tags, in order.</summary>
    private string FindFirstResult(string[] tags)
    {
        foreach (var tag in tags)
        {
            var res = GetTagValue(tag);
            if (res != "")
            {
                return res;
            }
        }

        return "";
    }

    /// <summary>Title from <c>og:title</c>, <c>twitter:title</c> or <c>title</c>.</summary>
    public string GetMetaTitle()
    {
        return FindFirstResult(new[] { "og:title", "twitter:title", "title" });
    }

    /// <summary>Description from <c>description</c> or <c>og:description</c>.</summary>
    public string GetDescription()
    {
        return FindFirstResult(new[] { "description", "og:description" });
    }

    /// <summary>Image from <c>image</c>, <c>og:image</c> or <c>twitter:image</c>.</summary>
    public string GetImage()
    {
        return FindFirstResult(new[] { "image", "og:image", "twitter:image" });
    }

    /// <summary>URL from <c>url</c>, <c>og:url</c> or <c>twitter:url</c>.</summary>
    public string GetUrl()
    {
        return FindFirstResult(new[] { "url", "og:url", "twitter:url" });
    }

    /// <summary>Page type from <c>og:type</c> or <c>type</c>.</summary>
    public string GetPageType()
    {
        return FindFirstResult(new[] { "og:type", "type" });
    }
}

View File

@ -4,13 +4,18 @@ namespace Newsbot.Collector.Services.Jobs;
public class HelloWorldJob
{
public readonly string _message;
public string _message { get; set; }
public HelloWorldJob(string message)
{
_message = message;
}
public void SetMessage(string message)
{
_message = message;
}
public void Execute()
{
Console.WriteLine(_message);

View File

@ -0,0 +1,66 @@
using System.ServiceModel.Syndication;
using System.Xml;
using Newsbot.Collector.Domain.Interfaces;
using Newsbot.Collector.Domain.Models;
namespace Newsbot.Collector.Services.Jobs;
/// <summary>
/// Collector that reads an RSS/Atom feed and turns each item into an
/// <see cref="ArticlesModel"/>, enriching it with the thumbnail and description
/// found in the linked page's meta tags.
/// </summary>
public class RssWatcherJob : ICollector
{
    // Pause between page downloads so the target site is not hammered.
    private static readonly TimeSpan DelayBetweenPages = TimeSpan.FromSeconds(3);

    private string? _url;

    /// <summary>Creates a watcher for the feed at <paramref name="url"/>.</summary>
    public RssWatcherJob(string url)
    {
        _url = url;
    }

    /// <summary>
    /// Downloads the feed and the page behind every item.
    /// Items that carry no link are skipped because there is no page to read.
    /// </summary>
    /// <returns>One article per feed item that has a link, in feed order.</returns>
    public List<ArticlesModel> Collect()
    {
        var collectedPosts = new List<ArticlesModel>();

        _url ??= "";

        using var reader = XmlReader.Create(_url);
        var feed = SyndicationFeed.Load(reader);

        // An item without links used to crash the whole run on Links[0].
        var posts = feed.Items.Where(item => item.Links.Count > 0).ToList();

        for (var index = 0; index < posts.Count; index++)
        {
            var post = posts[index];
            var link = post.Links[0].Uri;

            // Check if we have seen the url before
            // If we have, skip and save the site bandwidth
            var meta = new HtmlPageReader(link.AbsoluteUri);

            collectedPosts.Add(new ArticlesModel
            {
                // Title is optional in a feed item.
                Title = post.Title?.Text ?? "",
                Tags = FetchTags(post),
                URL = link.ToString(),
                PubDate = post.PublishDate.DateTime,
                Thumbnail = meta.Data.Header.Meta.Image,
                Description = meta.Data.Header.Meta.Description,
            });

            // try to not be too greedy; no need to wait after the last page
            if (index < posts.Count - 1)
            {
                Thread.Sleep(DelayBetweenPages);
            }
        }

        return collectedPosts;
    }

    /// <summary>
    /// Returns the item's category names as a comma-separated list
    /// (no trailing separator), or <c>""</c> when the item has no categories.
    /// </summary>
    private string FetchTags(SyndicationItem post)
    {
        return string.Join(",", post.Categories.Select(category => category.Name));
    }
}

View File

@ -2,10 +2,12 @@
<ItemGroup>
<ProjectReference Include="..\Newsbot.Collector.Domain\Newsbot.Collector.Domain.csproj" />
<ProjectReference Include="..\Newsbot.Collector.Database\Newsbot.Collector.Database.csproj" />
</ItemGroup>
<ItemGroup>
<PackageReference Include="Npgsql" Version="7.0.1" />
<PackageReference Include="HtmlAgilityPack" Version="1.11.46" />
<PackageReference Include="System.ServiceModel.Syndication" Version="7.0.0" />
</ItemGroup>
<PropertyGroup>

View File

@ -0,0 +1,14 @@
using Newsbot.Collector.Services.Jobs;
namespace Newsbot.Collector.Tests.Jobs;
/// <summary>
/// Tests for <see cref="RssWatcherJob"/>. These call the real feed URL over the network.
/// </summary>
public class RssWatcherJobTest
{
    /// <summary>Collecting from a real feed yields at least one article.</summary>
    [Fact]
    public void CanFindItems()
    {
        var url = "https://www.engadget.com/rss.xml";
        var client = new RssWatcherJob(url);

        var items = client.Collect();

        // Without an assertion the test could only fail by throwing.
        Assert.NotEmpty(items);
    }
}

View File

@ -21,4 +21,9 @@
</PackageReference>
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\Newsbot.Collector.Services\Newsbot.Collector.Services.csproj" />
<ProjectReference Include="..\Newsbot.Collector.Database\Newsbot.Collector.Database.csproj" />
</ItemGroup>
</Project>

View File

@ -0,0 +1,40 @@
using Newsbot.Collector.Database.Repositories;
using Newsbot.Collector.Domain.Models;
namespace Newsbot.Collector.Tests.Tables;
/// <summary>
/// Tests for <see cref="ArticlesTable"/>. They run against a live database:
/// the table is constructed with an empty connection string, which results in
/// its built-in local connection being used.
/// </summary>
public class ArticlesTableTests
{
    /// <summary>Listing with the default paging arguments returns a list.</summary>
    [Fact]
    public void ArticlesListTest()
    {
        var client = new ArticlesTable("");

        var articles = client.List();

        Assert.NotNull(articles);
    }

    /// <summary>
    /// Looking up a known ID returns that record.
    /// Requires a row with this ID to already exist in the database.
    /// </summary>
    [Fact]
    public void GetByIDTest()
    {
        var uid = Guid.Parse("4ac46772-253c-4c3d-8a2c-29239abd2ad4");
        var client = new ArticlesTable("");

        var res = client.GetById(uid);

        Assert.Equal(uid, res.ID);
    }

    /// <summary>Inserting a minimal article does not throw.</summary>
    [Fact]
    public void NewRecordTest()
    {
        var client = new ArticlesTable("");

        client.New(new ArticlesModel
        {
            Title = "Unit Testing!",
            SourceID = Guid.NewGuid(),
            PubDate = DateTime.Now
        });
    }
}

View File

@ -0,0 +1,19 @@
using Newsbot.Collector.Database.Repositories;
using Newsbot.Collector.Domain.Models;
namespace Newsbot.Collector.Tests.Tables;
/// <summary>
/// Tests for <see cref="SettingsTable"/>; they go through the table's real
/// database connection.
/// </summary>
public class SettingsTableTests
{
    /// <summary>Inserting a setting row completes without throwing.</summary>
    [Fact]
    public void New()
    {
        var table = new SettingsTable("");
        var setting = new SettingModel
        {
            Key = "Unit Testing",
            Value = "Unit",
            Options = ""
        };

        table.New(setting);
    }
}
}