Features/html meta extractor (#4)
* Gave the API access to the DB project
* Added DB models
* Working on RSS extraction and meta extraction
* Test project to debug RssWatcherJob
* Added new configs for the project
* New interface to define collectors
* Basic RSS extraction and article details are now exposed
* Tests updated for RSS pull
* Starting to get Dapper working. Query works but insert seems to have a value issue
* Removed Dapper from services
* Added some basic tests for DB calls
This commit is contained in:
parent
9f5772551c
commit
9f3a6323a6
6
.vscode/settings.json
vendored
6
.vscode/settings.json
vendored
@ -1,6 +1,8 @@
|
||||
{
|
||||
"files.exclude": {
|
||||
"**/obj": true,
|
||||
"**/bin": true
|
||||
}
|
||||
"**/bin": false
|
||||
},
|
||||
"csharp.inlayHints.types.enabled": true,
|
||||
"omnisharp.enableImportCompletion": true
|
||||
}
|
@ -16,6 +16,7 @@
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\Newsbot.Collector.Domain\Newsbot.Collector.Domain.csproj" />
|
||||
<ProjectReference Include="..\Newsbot.Collector.Services\Newsbot.Collector.Services.csproj" />
|
||||
<ProjectReference Include="..\Newsbot.Collector.Database\Newsbot.Collector.Database.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
|
@ -1,6 +1,6 @@
|
||||
using Hangfire;
|
||||
using Hangfire.MemoryStorage;
|
||||
using Newsbot.Collector.Services;
|
||||
using Newsbot.Collector.Services.Jobs;
|
||||
using Newsbot.Collector.Domain.Models;
|
||||
|
||||
var builder = WebApplication.CreateBuilder(args);
|
||||
@ -35,7 +35,7 @@ if (app.Environment.IsDevelopment())
|
||||
app.UseHttpsRedirection();
|
||||
|
||||
app.UseHangfireDashboard();
|
||||
//RecurringJob.AddOrUpdate()
|
||||
RecurringJob.AddOrUpdate<HelloWorldJob>("Example", x => x.Execute(), "0/2 * * * *");
|
||||
|
||||
app.UseAuthorization();
|
||||
|
||||
|
@ -4,6 +4,11 @@
|
||||
<ProjectReference Include="..\Newsbot.Collector.Domain\Newsbot.Collector.Domain.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="dapper" Version="2.0.123" />
|
||||
<PackageReference Include="Npgsql" Version="7.0.2" />
|
||||
</ItemGroup>
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net7.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
|
79
Newsbot.Collector.Database/Repositories/ArticlesTable.cs
Normal file
79
Newsbot.Collector.Database/Repositories/ArticlesTable.cs
Normal file
@ -0,0 +1,79 @@
|
||||
using System.Data;
|
||||
using Dapper;
|
||||
using Newsbot.Collector.Domain.Models;
|
||||
using Npgsql;
|
||||
|
||||
namespace Newsbot.Collector.Database.Repositories;
|
||||
|
||||
/// <summary>
/// Dapper-based repository for the <c>articles</c> table.
/// Each call opens its own PostgreSQL connection and disposes it when done.
/// </summary>
public class ArticlesTable
{
    // Local development database, used only when the caller supplies a blank
    // connection string (the existing tests construct the repository with "").
    // NOTE(review): credentials in source should move to configuration.
    private const string DefaultConnectionString = "Host=localhost;Username=postgres;Password=postgres;Database=postgres;sslmode=disable";

    private readonly string _connectionString;

    /// <summary>Creates a repository bound to the given connection string.</summary>
    /// <param name="connectionString">Npgsql connection string; blank selects the local default.</param>
    public ArticlesTable(string connectionString)
    {
        _connectionString = connectionString;
    }

    /// <summary>
    /// Opens a new PostgreSQL connection. The caller owns the returned connection and must dispose it.
    /// </summary>
    /// <param name="connectionString">
    /// Connection string to use. Previously this argument was ignored; it is now honoured,
    /// and only a null/blank value falls back to the local development default.
    /// </param>
    /// <returns>An already-opened connection.</returns>
    public static IDbConnection OpenConnection(string connectionString)
    {
        var cs = string.IsNullOrWhiteSpace(connectionString)
            ? DefaultConnectionString
            : connectionString;

        var conn = new NpgsqlConnection(cs);
        conn.Open();
        return conn;
    }

    /// <summary>Returns one page of articles ordered by publication date, newest first.</summary>
    /// <param name="Page">Zero-based page index.</param>
    /// <param name="Count">Number of rows per page.</param>
    public List<ArticlesModel> List(int Page = 0, int Count = 25)
    {
        using var conn = OpenConnection(_connectionString);

        // The SQL parameter @Page carries the row offset (page index * page size).
        return conn.Query<ArticlesModel>(@"select * from articles
            Order By PubDate Desc
            Offset @Page
            Fetch Next @Count Rows Only", new { Page = Page * Count, Count }).ToList();
    }

    /// <summary>Fetches the article with the given ID.</summary>
    /// <exception cref="InvalidOperationException">No row has that ID.</exception>
    public ArticlesModel GetById(Guid ID)
    {
        using var conn = OpenConnection(_connectionString);
        return conn.Query<ArticlesModel>("select * from articles where ID = @ID", new { ID }).First();
    }

    /// <summary>Fetches the first article stored with the given URL.</summary>
    /// <exception cref="InvalidOperationException">No row has that URL.</exception>
    public ArticlesModel GetByUrl(string url)
    {
        using var conn = OpenConnection(_connectionString);
        return conn.Query<ArticlesModel>("select * from articles where Url = @Url Limit 1", new { Url = url }).First();
    }

    /// <summary>
    /// Inserts a new article. A fresh ID is assigned to <paramref name="model"/> before the insert,
    /// overwriting whatever ID it carried.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="model"/> is null.</exception>
    public void New(ArticlesModel model)
    {
        ArgumentNullException.ThrowIfNull(model);

        model.ID = Guid.NewGuid();

        using var conn = OpenConnection(_connectionString);
        var q = @"INSERT INTO Articles
            (ID, SourceId, Tags, Title, Url, PubDate, Video, VideoHeight, VideoWidth, Thumbnail, Description, AuthorName, AuthorImage)
            Values
            (@Id, @SourceId, @Tags, @Title, @Url, @PubDate, @Video, @VideoHeight, @VideoWidth, @Thumbnail, @Description, @AuthorName, @AuthorImage);
            ";

        // Dapper binds the SQL parameters from the model's properties.
        var res = conn.Execute(q, model);

        // Prints the number of affected rows, as before.
        Console.WriteLine(res);
    }
}
|
34
Newsbot.Collector.Database/Repositories/SettingsTable.cs
Normal file
34
Newsbot.Collector.Database/Repositories/SettingsTable.cs
Normal file
@ -0,0 +1,34 @@
|
||||
using System.Data;
|
||||
using Dapper;
|
||||
using Newsbot.Collector.Domain.Models;
|
||||
using Npgsql;
|
||||
|
||||
namespace Newsbot.Collector.Database.Repositories;
|
||||
|
||||
/// <summary>
/// Dapper-based repository for the <c>Settings</c> table.
/// Each call opens its own PostgreSQL connection and disposes it when done.
/// </summary>
public class SettingsTable
{
    // Local development database, used only when the caller supplies a blank
    // connection string (the existing tests construct the repository with "").
    // NOTE(review): credentials in source should move to configuration.
    private const string DefaultConnectionString = "Host=localhost;Username=postgres;Password=postgres;Database=postgres;sslmode=disable";

    private readonly string _connectionString;

    /// <summary>Creates a repository bound to the given connection string.</summary>
    /// <param name="connectionString">Npgsql connection string; blank selects the local default.</param>
    public SettingsTable(string connectionString)
    {
        _connectionString = connectionString;
    }

    /// <summary>
    /// Opens a new PostgreSQL connection. The caller owns the returned connection and must dispose it.
    /// </summary>
    /// <param name="connectionString">
    /// Connection string to use. Previously this argument was ignored; it is now honoured,
    /// and only a null/blank value falls back to the local development default.
    /// </param>
    /// <returns>An already-opened connection.</returns>
    public static IDbConnection OpenConnection(string connectionString)
    {
        var cs = string.IsNullOrWhiteSpace(connectionString)
            ? DefaultConnectionString
            : connectionString;

        var conn = new NpgsqlConnection(cs);
        conn.Open();
        return conn;
    }

    /// <summary>
    /// Inserts a new setting. A fresh ID is assigned to <paramref name="model"/> before the insert,
    /// overwriting whatever ID it carried.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="model"/> is null.</exception>
    public void New(SettingModel model)
    {
        ArgumentNullException.ThrowIfNull(model);

        model.ID = Guid.NewGuid();

        using var conn = OpenConnection(_connectionString);
        var q = @"Insert Into Settings (ID, Key, Value, OPTIONS) Values (@ID,@Key,@Value,@Options)";
        conn.Execute(q, model);
    }
}
|
@ -1,6 +1,8 @@
|
||||
using Newsbot.Collector.Domain.Models;
|
||||
|
||||
namespace Newsbot.Collector.Domain.Interfaces;
|
||||
|
||||
/// <summary>
/// Contract for a collector; it exposes a single parameterless <c>Collect</c> operation.
/// </summary>
// NOTE(review): two Collect() declarations appear below, differing only in return type.
// This looks like a diff rendering of the old (void) and new (List<ArticlesModel>)
// signatures rather than real file content - confirm which one the file contains.
public interface ICollector
|
||||
{
|
||||
// Declaration that returns nothing.
void Collect();
|
||||
// Declaration that returns the collected articles as a list.
List<ArticlesModel> Collect();
|
||||
}
|
@ -5,6 +5,7 @@ public class ArticlesModel
|
||||
public Guid ID { get; set; }
|
||||
public Guid SourceID { get; set; }
|
||||
public string Tags { get; set; } = "";
|
||||
public string Title { get; set; } = "";
|
||||
public string URL { get; set; } = "";
|
||||
public DateTime PubDate { get; set; }
|
||||
public string Video { get; set; } = "";
|
||||
|
@ -1,77 +0,0 @@
|
||||
using Newsbot.Collector.Domain.Models;
|
||||
|
||||
namespace Newsbot.Collector.Services;
|
||||
|
||||
/// <summary>
/// Builds the application configuration from process environment variables and can
/// preload those variables from a <c>.env</c> file in the current working directory.
/// </summary>
public static class EnvLoader
{
    /// <summary>
    /// Reads the known environment variables into a <see cref="ConfigModel"/>.
    /// Unset string values become "" and flags are true only when set to "true" (any casing).
    /// </summary>
    public static ConfigModel Load()
    {
        var reddit = new RedditConfigModel
        {
            IsEnabled = Bool("FEATURE_ENABLE_REDDIT_BACKEND"),
            PullHot = Bool("REDDIT_PULL_HOT"),
            PullNsfw = Bool("REDDIT_PULL_NSFW"),
            PullTop = Bool("REDDIT_PULL_TOP")
        };

        return new ConfigModel
        {
            ServerAddress = String("SERVER_ADDRESS"),
            SqlConnectionString = String("SQL_CONNECTION_STRING"),
            Reddit = reddit,
        };
    }

    /// <summary>
    /// Loads <c>KEY=value</c> pairs from <c>.env</c> in the current directory into the process
    /// environment. Does nothing when the file is missing. Lines without both a key and a
    /// value are skipped, and single quotes are stripped from values.
    /// </summary>
    public static void LoadEnvFile()
    {
        var filePath = Path.Combine(Directory.GetCurrentDirectory(), ".env");
        if (!File.Exists(filePath))
        {
            return;
        }

        foreach (var line in File.ReadAllLines(filePath))
        {
            // Split on the FIRST '=' only. Splitting on every '=' produced more than two
            // parts for values that themselves contain '=' (e.g. a connection string such
            // as "Host=localhost;Username=..."), and those lines were silently dropped.
            var parts = line.Split('=', 2, StringSplitOptions.RemoveEmptyEntries);
            if (parts.Length != 2)
            {
                continue;
            }

            var value = parts[1].Replace("'", "");
            Environment.SetEnvironmentVariable(parts[0], value);
        }
    }

    /// <summary>Returns the environment variable's value, or "" when it is not set.</summary>
    private static string String(string Key)
    {
        return Environment.GetEnvironmentVariable(Key) ?? "";
    }

    /// <summary>Returns true only when the variable is set to "true", ignoring case.</summary>
    private static bool Bool(string Key)
    {
        // Ordinal comparison avoids culture-dependent lower-casing.
        return string.Equals(String(Key), "true", StringComparison.OrdinalIgnoreCase);
    }
}
|
129
Newsbot.Collector.Services/HtmlMeta.cs
Normal file
129
Newsbot.Collector.Services/HtmlMeta.cs
Normal file
@ -0,0 +1,129 @@
|
||||
using System.Data;
|
||||
using System.Runtime.Serialization;
|
||||
using System.Xml;
|
||||
using HtmlAgilityPack;
|
||||
|
||||
namespace Newsbot.Collector.Services;
|
||||
|
||||
/// <summary>
/// Root container for the data extracted from an HTML page.
/// </summary>
public class HtmlData
{
    /// <summary>Data taken from the page header; starts as an empty instance.</summary>
    public HtmlHeaderData Header { get; set; } = new();
}
|
||||
|
||||
/// <summary>
/// Data extracted from the header section of an HTML page.
/// </summary>
public class HtmlHeaderData
{
    /// <summary>Values read from the page's meta tags; starts as an empty instance.</summary>
    public HtmlMetaData Meta { get; set; } = new();
}
|
||||
|
||||
/// <summary>
/// Meta values describing a page. Every property defaults to an empty string.
/// </summary>
public class HtmlMetaData
{
    /// <summary>Page title.</summary>
    public string Title { get; set; } = string.Empty;

    /// <summary>Page description.</summary>
    public string Description { get; set; } = string.Empty;

    /// <summary>Image associated with the page.</summary>
    public string Image { get; set; } = string.Empty;

    /// <summary>URL reported by the page.</summary>
    public string Url { get; set; } = string.Empty;

    /// <summary>Page type reported by the page.</summary>
    public string PageType { get; set; } = string.Empty;

    //public string Color { get; set; }
}
|
||||
|
||||
/// <summary>
/// Downloads a page and extracts title, description, image, URL and page type
/// from the <c>meta</c> tags in its <c>head</c>.
/// The download and parsing happen synchronously in the constructor.
/// </summary>
public class HtmlPageReader
{
    private const string XPathMetaTag = "//head/meta";

    // Attributes that can carry a meta tag's key, in lookup order.
    private static readonly string[] KeyAttributes = { "property", "name", "itemprop" };

    // One shared client instead of a new HttpClient per page.
    private static readonly HttpClient SharedClient = new();

    /// <summary>The metadata extracted from the page.</summary>
    public HtmlData Data { get; set; }

    private readonly string _siteContent;

    // Parsed once in the constructor and reused by every lookup.
    private readonly List<HtmlNode> _metaTags;

    /// <summary>Downloads <paramref name="pageUrl"/> and populates <see cref="Data"/>.</summary>
    /// <param name="pageUrl">Absolute URL of the page to read.</param>
    /// <exception cref="AggregateException">The download failed.</exception>
    public HtmlPageReader(string pageUrl)
    {
        _siteContent = ReadSiteContent(pageUrl);
        _metaTags = CollectMetaTags();

        Data = new HtmlData();
        Data.Header.Meta.Title = GetMetaTitle();
        Data.Header.Meta.Description = GetDescription();
        Data.Header.Meta.Image = GetImage();
        Data.Header.Meta.Url = GetUrl();
        Data.Header.Meta.PageType = GetPageType();
    }

    // Blocking download; .Result keeps the AggregateException wrapping callers saw before.
    private static string ReadSiteContent(string url)
    {
        return SharedClient.GetStringAsync(url).Result;
    }

    // Parses the downloaded HTML and returns its head meta tags (empty list when there are none).
    private List<HtmlNode> CollectMetaTags()
    {
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(_siteContent);

        // SelectNodes returns null, not an empty collection, when nothing matches.
        var nodes = htmlDoc.DocumentNode.SelectNodes(XPathMetaTag);
        return nodes is null ? new List<HtmlNode>() : nodes.ToList();
    }

    // Returns the tag's key (property/name/itemprop value), or null when it has none.
    private static string? FindKey(HtmlNode meta)
    {
        foreach (var attributeName in KeyAttributes)
        {
            var attribute = meta.Attributes[attributeName];
            if (attribute is not null)
            {
                return attribute.Value;
            }
        }

        return null;
    }

    /// <summary>
    /// Returns the <c>content</c> of the meta tag identified by <paramref name="Tag"/>.
    /// A tag whose key equals <paramref name="Tag"/> (ignoring case) wins; otherwise the first
    /// tag whose key merely contains it is used, which keeps the earlier substring behaviour
    /// without letting keys such as <c>og:image:width</c> shadow <c>og:image</c>.
    /// </summary>
    /// <param name="Tag">Key to look for, e.g. <c>og:title</c>.</param>
    /// <returns>The content value, or "" when no tag matches.</returns>
    public string GetTagValue(string Tag)
    {
        if (string.IsNullOrEmpty(Tag))
        {
            return "";
        }

        string? partialMatch = null;

        foreach (var meta in _metaTags)
        {
            // Attributes are looked up by name: their order is not guaranteed, and tags
            // such as <meta charset="..."> have no key/content pair at all.
            var key = FindKey(meta);
            if (key is null)
            {
                continue;
            }

            var content = meta.Attributes["content"]?.Value ?? "";

            if (key.Equals(Tag, StringComparison.OrdinalIgnoreCase))
            {
                return content;
            }

            if (partialMatch is null && key.Contains(Tag, StringComparison.Ordinal))
            {
                partialMatch = content;
            }
        }

        return partialMatch ?? "";
    }

    // Returns the first non-empty value among the given keys, or "" when all are empty.
    private string FindFirstResult(string[] tags)
    {
        foreach (var tag in tags)
        {
            var value = GetTagValue(tag);
            if (value != "")
            {
                return value;
            }
        }

        return "";
    }

    /// <summary>Title from <c>og:title</c>, <c>twitter:title</c> or <c>title</c>.</summary>
    public string GetMetaTitle()
    {
        return FindFirstResult(new[] { "og:title", "twitter:title", "title" });
    }

    /// <summary>Description from <c>description</c> or <c>og:description</c>.</summary>
    public string GetDescription()
    {
        return FindFirstResult(new[] { "description", "og:description" });
    }

    /// <summary>Image from <c>image</c>, <c>og:image</c> or <c>twitter:image</c>.</summary>
    public string GetImage()
    {
        return FindFirstResult(new[] { "image", "og:image", "twitter:image" });
    }

    /// <summary>URL from <c>url</c>, <c>og:url</c> or <c>twitter:url</c>.</summary>
    public string GetUrl()
    {
        return FindFirstResult(new[] { "url", "og:url", "twitter:url" });
    }

    /// <summary>Page type from <c>og:type</c> or <c>type</c>.</summary>
    public string GetPageType()
    {
        return FindFirstResult(new[] { "og:type", "type" });
    }
}
|
@ -4,13 +4,18 @@ namespace Newsbot.Collector.Services.Jobs;
|
||||
public class HelloWorldJob
|
||||
{
|
||||
|
||||
public readonly string _message;
|
||||
public string _message { get; set; }
|
||||
|
||||
public HelloWorldJob(string message)
|
||||
{
|
||||
_message = message;
|
||||
}
|
||||
|
||||
public void SetMessage(string message)
|
||||
{
|
||||
_message = message;
|
||||
}
|
||||
|
||||
public void Execute()
|
||||
{
|
||||
Console.WriteLine(_message);
|
||||
|
66
Newsbot.Collector.Services/Jobs/RssWatcherJob.cs
Normal file
66
Newsbot.Collector.Services/Jobs/RssWatcherJob.cs
Normal file
@ -0,0 +1,66 @@
|
||||
using System.ServiceModel.Syndication;
|
||||
using System.Xml;
|
||||
using Newsbot.Collector.Domain.Interfaces;
|
||||
using Newsbot.Collector.Domain.Models;
|
||||
|
||||
namespace Newsbot.Collector.Services.Jobs;
|
||||
|
||||
/// <summary>
/// Collector that reads an RSS feed and turns each entry into an <see cref="ArticlesModel"/>,
/// enriching it with the thumbnail and description found in the article page's meta tags.
/// </summary>
public class RssWatcherJob : ICollector
{
    // Pause between article page downloads so the source site is not hammered.
    private const int DelayBetweenRequestsMs = 3000;

    private readonly string? _url;

    /// <summary>Creates a job for the given feed.</summary>
    /// <param name="url">URL of the RSS feed.</param>
    public RssWatcherJob(string url)
    {
        _url = url;
    }

    /// <summary>
    /// Downloads the feed and then each linked article page, one at a time.
    /// Feed entries without a link are skipped because there is no page to read.
    /// </summary>
    /// <returns>One article per feed entry that has a link.</returns>
    public List<ArticlesModel> Collect()
    {
        var collectedPosts = new List<ArticlesModel>();

        // A null URL is treated as "" without mutating the job's state.
        using var reader = XmlReader.Create(_url ?? "");
        var feed = SyndicationFeed.Load(reader);
        var posts = feed.Items.ToList();

        var isFirstRequest = true;
        foreach (var post in posts)
        {
            // Indexing Links[0] threw on entries that carry no link at all.
            var link = post.Links.FirstOrDefault()?.Uri;
            if (link is null)
            {
                continue;
            }

            // Try to not be too greedy: wait between page requests,
            // but not before the first one or after the last one.
            if (!isFirstRequest)
            {
                Thread.Sleep(DelayBetweenRequestsMs);
            }
            isFirstRequest = false;

            // TODO: check if we have seen the url before; if so, skip and save the site bandwidth.
            var meta = new HtmlPageReader(link.AbsoluteUri);

            collectedPosts.Add(new ArticlesModel
            {
                // Feeds may omit the title element.
                Title = post.Title?.Text ?? "",
                Tags = FetchTags(post),
                URL = link.ToString(),
                PubDate = post.PublishDate.DateTime,
                Thumbnail = meta.Data.Header.Meta.Image,
                Description = meta.Data.Header.Meta.Description,
            });
        }

        return collectedPosts;
    }

    // Joins the entry's category names; every name is followed by a comma
    // (including the last one), matching the format produced so far.
    private static string FetchTags(SyndicationItem post)
    {
        return string.Concat(post.Categories.Select(tag => $"{tag.Name},"));
    }
}
|
@ -2,10 +2,12 @@
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\Newsbot.Collector.Domain\Newsbot.Collector.Domain.csproj" />
|
||||
<ProjectReference Include="..\Newsbot.Collector.Database\Newsbot.Collector.Database.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Npgsql" Version="7.0.1" />
|
||||
<PackageReference Include="HtmlAgilityPack" Version="1.11.46" />
|
||||
<PackageReference Include="System.ServiceModel.Syndication" Version="7.0.0" />
|
||||
</ItemGroup>
|
||||
|
||||
<PropertyGroup>
|
||||
|
14
Newsbot.Collector.Tests/Jobs/RssWatcherJobTest.cs
Normal file
14
Newsbot.Collector.Tests/Jobs/RssWatcherJobTest.cs
Normal file
@ -0,0 +1,14 @@
|
||||
using Newsbot.Collector.Services.Jobs;
|
||||
|
||||
namespace Newsbot.Collector.Tests.Jobs;
|
||||
|
||||
/// <summary>
/// Tests for <see cref="RssWatcherJob"/>. This test reads a live feed over the network.
/// </summary>
public class RssWatcherJobTest
{
    [Fact]
    public void CanFindItems()
    {
        var url = "https://www.engadget.com/rss.xml";
        var client = new RssWatcherJob(url);

        var items = client.Collect();

        // Previously the result was never checked, so the test could only fail by throwing.
        Assert.NotEmpty(items);
    }
}
|
@ -21,4 +21,9 @@
|
||||
</PackageReference>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\Newsbot.Collector.Services\Newsbot.Collector.Services.csproj" />
|
||||
<ProjectReference Include="..\Newsbot.Collector.Database\Newsbot.Collector.Database.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
|
40
Newsbot.Collector.Tests/Tables/ArticlesTableTests.cs
Normal file
40
Newsbot.Collector.Tests/Tables/ArticlesTableTests.cs
Normal file
@ -0,0 +1,40 @@
|
||||
using Newsbot.Collector.Database.Repositories;
|
||||
using Newsbot.Collector.Domain.Models;
|
||||
|
||||
namespace Newsbot.Collector.Tests.Tables;
|
||||
|
||||
/// <summary>
/// Tests for <see cref="ArticlesTable"/>. They run against the database the repository
/// connects to when given a blank connection string.
/// </summary>
public class ArticlesTableTests
{
    [Fact]
    public void ArticlesListTest()
    {
        var client = new ArticlesTable("");

        var articles = client.List();

        Assert.NotNull(articles);
    }

    [Fact]
    public void GetByIDTest()
    {
        // NOTE(review): presumably this row is expected to already exist in the target database - confirm.
        var uid = Guid.Parse("4ac46772-253c-4c3d-8a2c-29239abd2ad4");
        var client = new ArticlesTable("");

        var res = client.GetById(uid);

        // Assert.Equal reports expected and actual IDs on failure.
        Assert.Equal(uid, res.ID);
    }

    [Fact]
    public void NewRecordTest()
    {
        var client = new ArticlesTable("");

        client.New(new ArticlesModel
        {
            Title = "Unit Testing!",
            SourceID = Guid.NewGuid(),
            PubDate = DateTime.Now
        });
    }
}
|
19
Newsbot.Collector.Tests/Tables/SettingsTableTests.cs
Normal file
19
Newsbot.Collector.Tests/Tables/SettingsTableTests.cs
Normal file
@ -0,0 +1,19 @@
|
||||
using Newsbot.Collector.Database.Repositories;
|
||||
using Newsbot.Collector.Domain.Models;
|
||||
|
||||
namespace Newsbot.Collector.Tests.Tables;
|
||||
|
||||
/// <summary>
/// Tests for <see cref="SettingsTable"/>.
/// </summary>
public class SettingsTableTests
{
    /// <summary>Inserting a setting must complete without throwing.</summary>
    [Fact]
    public void New()
    {
        var table = new SettingsTable("");
        var setting = new SettingModel
        {
            Key = "Unit Testing",
            Value = "Unit",
            Options = ""
        };

        table.New(setting);
    }
}
|
Loading…
Reference in New Issue
Block a user