diff --git a/database/articles.go b/database/articles.go index 5f125be..3ca7cec 100644 --- a/database/articles.go +++ b/database/articles.go @@ -84,10 +84,11 @@ func (ac *ArticlesClient) Add(item model.Articles) error { req.Header.Set("Content-Type", "application/json") resp, err := client.Do(req) - defer resp.Body.Close() if err != nil { return err } + defer resp.Body.Close() + if resp.StatusCode != 200 { return errors.New("failed to post to the DB") } diff --git a/database/common.go b/database/common.go index 7f555ce..1ea0c4e 100644 --- a/database/common.go +++ b/database/common.go @@ -6,7 +6,7 @@ import ( "log" "net/http" - "github.com/jtom38/newsbot/collector/services" + "github.com/jtom38/newsbot/collector/services/config" ) type DatabaseClient struct { @@ -18,8 +18,8 @@ type DatabaseClient struct { // This will generate a new client to interface with the API Database. func NewDatabaseClient() DatabaseClient { - cc := services.NewConfigClient() - dbUri := cc.GetConfig(services.DB_URI) + cc := config.New() + dbUri := cc.GetConfig(config.DB_URI) var client = DatabaseClient{} client.Diagnosis.rootUri = dbUri diff --git a/domain/interfaces/source.go b/domain/interfaces/source.go index db51f86..86248b7 100644 --- a/domain/interfaces/source.go +++ b/domain/interfaces/source.go @@ -1,10 +1,23 @@ package interfaces import ( + "github.com/go-rod/rod" "github.com/mmcdole/gofeed" ) type Sources interface { CheckSource() error PullFeed() (*gofeed.Feed, error) -} \ No newline at end of file + + GetBrowser() *rod.Browser + GetPage(parser *rod.Browser, url string) *rod.Page + + ExtractThumbnail(page *rod.Page) (string, error) + ExtractPubDate(page *rod.Page) (string, error) + ExtractDescription(page *rod.Page) (string, error) + ExtractAuthor(page *rod.Page) (string, error) + ExtractAuthorImage(page *rod.Page) (string, error) + ExtractTags(page *rod.Page) (string, error) + ExtractTitle(page *rod.Page) (string, error) +} + diff --git a/domain/model/cache.go b/domain/model/cache.go index 72f055d..3570e81 100644 --- a/domain/model/cache.go +++ b/domain/model/cache.go @@ -12,4 +12,5 @@ type CacheItem struct { // youtube, reddit, ect Group string Expires time.Time + IsTainted bool } \ No newline at end of file diff --git a/go.mod b/go.mod index 298cf0b..27347cb 100644 --- a/go.mod +++ b/go.mod @@ -6,6 +6,7 @@ require ( github.com/PuerkitoBio/goquery v1.8.0 // indirect github.com/go-chi/chi/v5 v5.0.7 // indirect github.com/go-rod/rod v0.105.1 // indirect + github.com/google/uuid v1.3.0 // indirect github.com/joho/godotenv v1.4.0 // indirect github.com/mmcdole/gofeed v1.1.3 // indirect github.com/robfig/cron/v3 v3.0.1 // indirect diff --git a/go.sum b/go.sum index 02c5d33..9ebe7f4 100644 --- a/go.sum +++ b/go.sum @@ -49,6 +49,8 @@ github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= +github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/jinzhu/inflection v1.0.0 h1:K317FqzuhWc8YvSVlFMCCUb36O/S9MCKRDI7QkRKD/E= github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc= diff --git a/main.go b/main.go index e3039e3..315c714 100644 --- a/main.go +++ b/main.go @@ -1,28 +1,21 @@ package main import ( - //"fmt" "log" "net/http" - "github.com/go-chi/chi/v5" "github.com/go-chi/chi/v5/middleware" "github.com/jtom38/newsbot/collector/routes" - "github.com/jtom38/newsbot/collector/database" - "github.com/jtom38/newsbot/collector/services" ) func main() { - var err error - //EnableScheduler() //dc := database.NewDatabaseClient() //err := dc.Diagnosis.Ping() //if err != nil { log.Fatalln(err) } - //CheckReddit() - CheckYoutube() + EnableScheduler() app := chi.NewRouter() app.Use(middleware.Logger) @@ -34,34 +27,6 @@ func main() { log.Println("API is online and waiting for requests.") log.Println("API: http://localhost:8081/api") //log.Println("Swagger: http://localhost:8080/swagger/index.html") - err = http.ListenAndServe(":8081", app) + err := http.ListenAndServe(":8081", app) if err != nil { log.Fatalln(err) } -} - -func CheckReddit() { - dc := database.NewDatabaseClient() - sources, err := dc.Sources.FindBySource("reddit") - if err != nil { log.Println(err) } - - rc := services.NewRedditClient(sources[0].Name, sources[0].ID) - raw, err := rc.GetContent() - if err != nil { log.Println(err) } - - redditArticles := rc.ConvertToArticles(raw) - - for _, item := range redditArticles { - _, err = dc.Articles.FindByUrl(item.Url) - if err != nil { - err = dc.Articles.Add(item) - if err != nil { log.Println("Failed to post article.")} - } - } -} - -func CheckYoutube() { - // Add call to the db to request youtube sources. - - // Loop though the services, and generate the clients. - yt := services.NewYoutubeClient(0, "https://www.youtube.com/user/GameGrumps") - yt.CheckSource() } \ No newline at end of file diff --git a/scheduler.go b/scheduler.go index a6b2f1d..cfe81af 100644 --- a/scheduler.go +++ b/scheduler.go @@ -2,8 +2,14 @@ package main import ( "fmt" - + "log" + "github.com/robfig/cron/v3" + + "github.com/jtom38/newsbot/collector/database" + "github.com/jtom38/newsbot/collector/services" + //"github.com/jtom38/newsbot/collector/services/cache" + ) func Hello(t string) { @@ -12,8 +18,62 @@ func Hello(t string) { func EnableScheduler() { c := cron.New() - c.AddFunc("*/1 * * * *", func() { - go Hello("new world order") - }) + + //c.AddFunc("*/5 * * * *", func() { go CheckCache() }) + c.AddFunc("*/30 * * * *", func() { go CheckReddit() }) + c.AddFunc("*/30 * * * *", func() { go CheckYoutube() }) + c.AddFunc("* */1 * * *", func() { go CheckFfxiv() }) + c.Start() +} + +func CheckCache() { + //cache := services.NewCacheAgeMonitor() + //cache.CheckExpiredEntries() + +} + +func CheckReddit() { + dc := database.NewDatabaseClient() + sources, err := dc.Sources.FindBySource("reddit") + if err != nil { log.Println(err) } + + rc := services.NewRedditClient(sources[0].Name, sources[0].ID) + raw, err := rc.GetContent() + if err != nil { log.Println(err) } + + redditArticles := rc.ConvertToArticles(raw) + + for _, item := range redditArticles { + _, err = dc.Articles.FindByUrl(item.Url) + if err != nil { + err = dc.Articles.Add(item) + if err != nil { log.Println("Failed to post article.")} + } + } +} + +func CheckYoutube() { + // Add call to the db to request youtube sources. + + // Loop though the services, and generate the clients. + yt := services.NewYoutubeClient(0, "https://www.youtube.com/user/GameGrumps") + yt.CheckSource() +} + +func CheckFfxiv() { + fc := services.NewFFXIVClient("na") + articles, err := fc.CheckSource() + + // This isnt in a thread yet, so just output to stdout + if err != nil { log.Println(err) } + + dc := database.NewDatabaseClient() + for _, item := range articles { + _, err = dc.Articles.FindByUrl(item.Url) + if err != nil { + err = dc.Articles.Add(item) + if err != nil { log.Println("Failed to post article.")} + } + } } \ No newline at end of file diff --git a/services/cache.go b/services/cache.go deleted file mode 100644 index 04dfb1c..0000000 --- a/services/cache.go +++ /dev/null @@ -1,40 +0,0 @@ -package services - -import ( - "errors" - - "github.com/jtom38/newsbot/collector/domain/model" -) - -type CacheClient struct{} - -var ( - cacheStorage []*model.CacheItem - - ErrCacheRecordMissing = errors.New("unable to find the requested record.") -) - - -func NewCacheClient() CacheClient { - return CacheClient{} -} - -func (cc *CacheClient) Insert(item *model.CacheItem) { - //_, err := cc.Find(item.Key, item.Group) - //if err != nil { } - cacheStorage = append(cacheStorage, item) -} - -func (cc *CacheClient) Find(key string, group string) (*model.CacheItem, error) { - //go cc.FindExpiredEntries() - - for _, item := range cacheStorage { - if item.Group != group { continue } - - if item.Key != key { continue } - - return item, nil - } - - return &model.CacheItem{}, ErrCacheRecordMissing -} \ No newline at end of file diff --git a/services/cache/cache.go b/services/cache/cache.go new file mode 100644 index 0000000..878d4c0 --- /dev/null +++ b/services/cache/cache.go @@ -0,0 +1,62 @@ +package cache + +import ( + "time" + + "github.com/jtom38/newsbot/collector/domain/model" +) + +type CacheClient struct{ + group string + DefaultTimer time.Duration +} + +func NewCacheClient(group string) CacheClient { + return CacheClient{ + group: group, + DefaultTimer: time.Hour, + } +} + +func (cc *CacheClient) Insert(key string, value string) { + item := model.CacheItem{ + Key: key, + Value: value, + Group: cc.group, + Expires: time.Now().Add(1 * time.Hour), + IsTainted: false, + } + cacheStorage = append(cacheStorage, &item) +} + +func (cc *CacheClient) FindByKey(key string) (*model.CacheItem, error) { + for _, item := range cacheStorage { + if item.Group != cc.group { continue } + if item.Key != key { continue } + + // if it was tainted, renew the timer and remove the taint as this record was still needed + if item.IsTainted { + item.IsTainted = false + item.Expires = time.Now().Add(1 * time.Hour) + } + return item, nil + } + + return &model.CacheItem{}, ErrCacheRecordMissing +} + +func (cc *CacheClient) FindByValue(value string) (*model.CacheItem, error) { + for _, item := range cacheStorage { + if item.Group != cc.group { continue } + if item.Value != value { continue } + + // if it was tainted, renew the timer and remove the taint as this record was still needed + if item.IsTainted { + item.IsTainted = false + item.Expires = time.Now().Add(1 * time.Hour) + } + return item, nil + } + return &model.CacheItem{}, ErrCacheRecordMissing +} + diff --git a/services/cache/cache_test.go b/services/cache/cache_test.go new file mode 100644 index 0000000..3ca7311 --- /dev/null +++ b/services/cache/cache_test.go @@ -0,0 +1,40 @@ +package cache_test + +import ( + "testing" + + "github.com/jtom38/newsbot/collector/services/cache" +) + +func TestNewCacheClient(t *testing.T) { + _ = cache.NewCacheClient("placeholder") +} + +func TestInsert(t *testing.T) { + cache := cache.NewCacheClient("Testing") + cache.Insert("UnitTesting", "Something, or nothing") +} + +func TestFindGroupMissing(t *testing.T) { + cache := cache.NewCacheClient("faker") + _, err := cache.FindByKey("UnitTesting") + if err == nil { panic("Nothing was appended with the requested group.") } +} + +func TestFindGroupExists(t *testing.T) { + cache := cache.NewCacheClient("Testing") + cache.Insert("UnitTesting", "Something") + _, err := cache.FindByKey("UnitTesting") + if err != nil { panic("") } +} + +func TestCacheStorage(t *testing.T) { + cc := cache.NewCacheClient("Testing") + cc.Insert("UnitTesting01", "test") + cc.Insert("UnitTesting02", "Test") + + cache := cache.NewCacheClient("Testing") + _, err := cache.FindByKey("UnitTesting02") + if err != nil { panic("expected to find the value")} +} + diff --git a/services/cache/common.go b/services/cache/common.go new file mode 100644 index 0000000..be81227 --- /dev/null +++ b/services/cache/common.go @@ -0,0 +1,13 @@ +package cache + +import ( + "errors" + + "github.com/jtom38/newsbot/collector/domain/model" +) + +var ( + cacheStorage []*model.CacheItem + + ErrCacheRecordMissing = errors.New("unable to find the requested record") +) \ No newline at end of file diff --git a/services/cache/monitor.go b/services/cache/monitor.go new file mode 100644 index 0000000..907d3ca --- /dev/null +++ b/services/cache/monitor.go @@ -0,0 +1,45 @@ +package cache + +import ( + "time" + + "github.com/jtom38/newsbot/collector/domain/model" +) + +// When a record becomes tainted, it needs to be renewed or it will be dropped from the cache. +// If a record is tainted and used again, the taint will be removed and a new Expires value will be set. +// If its not renewed, it will be dropped. +type CacheAgeMonitor struct {} + +func NewCacheAgeMonitor() CacheAgeMonitor { + return CacheAgeMonitor{} +} + +// This is an automated job that will review all the objects for age and taint them if needed. +func (cam CacheAgeMonitor) CheckExpiredEntries() { + now := time.Now() + for index, item := range cacheStorage { + if now.After(item.Expires) { + + // the timer expired, and its not tainted, taint it + if !item.IsTainted { + item.IsTainted = true + item.Expires = now.Add(1 * time.Hour) + } + + // if its tainted and the timer didnt get renewed, delete + if item.IsTainted { + cacheStorage = cam.removeEntry(index) + } + } + } +} + +// This creates a new slice and skips over the item that needs to be dropped +func (cam CacheAgeMonitor) removeEntry(index int) []*model.CacheItem { + var temp []*model.CacheItem + for i, item := range cacheStorage { + if i != index { temp = append(temp, item )} + } + return temp +} diff --git a/services/cache/monitor_test.go b/services/cache/monitor_test.go new file mode 100644 index 0000000..3990010 --- /dev/null +++ b/services/cache/monitor_test.go @@ -0,0 +1,13 @@ +package cache_test + +import ( + "testing" + + "github.com/jtom38/newsbot/collector/services/cache" +) + +func TestCacheTaintItem(t *testing.T) { + cc := cache.NewCacheClient("Testing") + cc.Insert("UnitTesting01", "test") + +} \ No newline at end of file diff --git a/services/cacheMonitor.go b/services/cacheMonitor.go deleted file mode 100644 index 166eb96..0000000 --- a/services/cacheMonitor.go +++ /dev/null @@ -1,39 +0,0 @@ -package services - -import ( - "time" - - "github.com/jtom38/newsbot/collector/domain/model" -) - -type CacheMonitor struct {} - -func NewCacheMonitorClient() CacheMonitor { - return CacheMonitor{} -} - -func (cm *CacheMonitor) Enable() { - -} - -// This will be fired off each time an cache a -func (cm *CacheMonitor) FindExpiredEntries() { - now := time.Now() - for index, item := range cacheStorage { - res := now.After(item.Expires) - if res { - cm.removeExpiredEntries(index) - } - } -} - -// This will create a new slice and add the valid items to it and ignore the one to be removed. -// The existing cacheStorage will be replaced. -func (cc *CacheMonitor) removeExpiredEntries(arrayEntry int) { - var temp []*model.CacheItem - for index, item := range cacheStorage { - if index == arrayEntry { continue } - temp = append(temp, item) - } - cacheStorage = temp -} \ No newline at end of file diff --git a/services/cache_test.go b/services/cache_test.go deleted file mode 100644 index 4a0243b..0000000 --- a/services/cache_test.go +++ /dev/null @@ -1,69 +0,0 @@ -package services_test - -import ( - "testing" - "time" - - "github.com/jtom38/newsbot/collector/domain/model" - "github.com/jtom38/newsbot/collector/services" -) - -func TestNewCacheClient(t *testing.T) { - _ = services.NewCacheClient() -} - -func TestInsert(t *testing.T) { - cache := services.NewCacheClient() - var item *model.CacheItem = &model.CacheItem{ - Key: "UnitTesting", - Value: "Something, or nothing", - Group: "Testing", - Expires: time.Now().Add(5 * time.Second), - } - cache.Insert(item) -} - -func TestFindGroupMissing(t *testing.T) { - cache := services.NewCacheClient() - _, err := cache.Find("UnitTesting", "Unknown") - if err == nil { panic("Nothing was appended with the requested group.") } -} - -func TestFindGroupExists(t *testing.T) { - cache := services.NewCacheClient() - var item *model.CacheItem = &model.CacheItem{ - Key: "UnitTesting", - Value: "Something, or nothing", - Group: "Testing", - Expires: time.Now().Add(5 * time.Second), - } - cache.Insert(item) - _, err := cache.Find("UnitTesting", "Testing2") - //t.Log(res) - if err == nil { panic("") } -} - - -func TestCacheStorage(t *testing.T) { - cc := services.NewCacheClient() - - item1 := &model.CacheItem { - Key: "UnitTesting01", - Value: "", - Group: "Testing", - Expires: time.Now().Add(5 * time.Minute), - } - cc.Insert(item1) - - item2 := &model.CacheItem { - Key: "UnitTesting02", - Value: "", - Group: "Testing", - Expires: time.Now().Add(5 * time.Minute), - } - cc.Insert(item2) - - cache := services.NewCacheClient() - _, err := cache.Find("UnitTesting02", "Testing") - if err != nil { panic("expected to find the value")} -} \ No newline at end of file diff --git a/services/config.go b/services/config/config.go similarity index 93% rename from services/config.go rename to services/config/config.go index 680ade5..e1883ee 100644 --- a/services/config.go +++ b/services/config/config.go @@ -1,4 +1,4 @@ -package services +package config import ( "os" @@ -19,7 +19,7 @@ const ( type ConfigClient struct {} -func NewConfigClient() ConfigClient { +func New() ConfigClient { _, err := os.Open(".env") if err == nil { loadEnvFile() diff --git a/services/config/config_test.go b/services/config/config_test.go new file mode 100644 index 0000000..1c9d264 --- /dev/null +++ b/services/config/config_test.go @@ -0,0 +1,20 @@ +package config_test + +import ( + "testing" + "os" + + "github.com/jtom38/newsbot/collector/services/config" +) + +func TestNewClient(t *testing.T) { + config.New() +} + +func TestGetConfigExpectNull(t *testing.T) { + cc := config.New() + os.Setenv(config.REDDIT_PULL_HOT, "") + res := cc.GetConfig(config.REDDIT_PULL_HOT) + if res != "" { panic("expected blank")} + +} \ No newline at end of file diff --git a/services/ffxiv.go b/services/ffxiv.go new file mode 100644 index 0000000..dfb0c58 --- /dev/null +++ b/services/ffxiv.go @@ -0,0 +1,258 @@ +package services + +import ( + "errors" + "log" + "net/http" + "strings" + "time" + + "github.com/PuerkitoBio/goquery" + "github.com/go-rod/rod" + "github.com/google/uuid" + + "github.com/jtom38/newsbot/collector/domain/model" + "github.com/jtom38/newsbot/collector/services/cache" +) + +const ( + FFXIV_NA_FEED_URL string = "https://na.finalfantasyxiv.com/lodestone/" + FFXIV_JP_FEED_URL string = "https://jp.finalfantasyxiv.com/lodestone/" + + FFXIV_TIME_FORMAT string = "1/2/2006 3:4 PM" +) + +type FFXIVClient struct { + SourceID uint + Url string + Region string + + cacheGroup string +} + +func NewFFXIVClient(region string) FFXIVClient { + var url string + + switch region { + case "na": + url = FFXIV_NA_FEED_URL + case "jp": + url = FFXIV_JP_FEED_URL + } + + return FFXIVClient{ + Region: region, + Url: url, + cacheGroup: "ffxiv", + } +} + +func (fc *FFXIVClient) CheckSource() ([]model.Articles, error) { + var articles []model.Articles + + parser := fc.GetBrowser() + defer parser.Close() + + links, err := fc.PullFeed(parser) + if err != nil { return articles, err } + + cache := cache.NewCacheClient(fc.cacheGroup) + + for _, link := range links { + // Check cache/db if this link has been seen already, skip + _, err := cache.FindByValue(link) + if err == nil { continue } + + + page := fc.GetPage(parser, link) + + title, err := fc.ExtractTitle(page) + if err != nil { return articles, err } + + thumb, err := fc.ExtractThumbnail(page) + if err != nil { return articles, err } + + pubDate, err := fc.ExtractPubDate(page) + if err != nil { return articles, err } + + description, err := fc.ExtractDescription(page) + if err != nil { return articles, err } + + authorName, err := fc.ExtractAuthor(page) + if err != nil { return articles, err } + + authorImage, err := fc.ExtractAuthorImage(page) + if err != nil { return articles, err } + + tags, err := fc.ExtractTags(page) + if err != nil { return articles, err } + + article := model.Articles{ + SourceID: fc.SourceID, + Tags: tags, + Title: title, + Url: link, + PubDate: pubDate, + Video: "", + VideoHeight: 0, + VideoWidth: 0, + Thumbnail: thumb, + Description: description, + AuthorName: authorName, + AuthorImage: authorImage, + } + log.Printf("Collected '%v' from '%v'", article.Title, article.Url) + + cache.Insert(uuid.New().String(), link) + + articles = append(articles, article) + } + + return articles, nil +} + +func (fc *FFXIVClient) GetParser() (*goquery.Document, error) { + html, err := http.Get(fc.Url) + if err != nil { return nil, err } + defer html.Body.Close() + + doc, err := goquery.NewDocumentFromReader(html.Body) + if err != nil { return nil, err } + return doc, nil +} + +func (fc *FFXIVClient) GetBrowser() (*rod.Browser) { + browser := rod.New().MustConnect() + return browser +} + +func (fc *FFXIVClient) PullFeed(parser *rod.Browser) ([]string, error) { + var links []string + + page := parser.MustPage(fc.Url) + defer page.Close() + + // find the list by xpath + res := page.MustElementX("/html/body/div[3]/div/div/div[1]/div[2]/div[1]/div[2]/ul") + + // find all the li items + items := res.MustElements("li") + + for _, item := range items { + // in each li, find the a items + a, err := item.Element("a") + if err != nil { + log.Println("Unable to find the a item, skipping") + continue + } + + // find the href behind the a + url, err := a.Property("href") + if err != nil { + log.Println("Unable to find a href link, skipping") + continue + } + + urlString := url.String() + isTopic := strings.Contains(urlString, "topics") + if isTopic { + links = append(links, urlString) + } + } + + return links, nil +} + +func (rc *FFXIVClient) GetPage(parser *rod.Browser, url string) *rod.Page { + page := parser.MustPage(url) + return page +} + +func (fc *FFXIVClient) ExtractThumbnail(page *rod.Page) (string, error) { + thumbnail := page.MustElementX("/html/body/div[3]/div[2]/div[1]/article/div[1]/img").MustProperty("src").String() + if thumbnail == "" { return "", errors.New("unable to find thumbnail")} + + title := page.MustElement(".news__header > h1:nth-child(2)").MustText() + log.Println(title) + + return thumbnail, nil +} + +func (fc *FFXIVClient) ExtractPubDate(page *rod.Page) (time.Time, error) { + stringDate := page.MustElement(".news__ic--topics").MustText() + if stringDate == "" { return time.Now(), errors.New("unable to locate the publish date on the post")} + + PubDate, err := time.Parse(FFXIV_TIME_FORMAT, stringDate) + if err != nil { return time.Now(), err } + + return PubDate, nil +} + +func (fc *FFXIVClient) ExtractDescription(page *rod.Page) (string, error) { + res := page.MustElement(".news__detail__wrapper").MustText() + if res == "" { return "", errors.New("unable to locate the description on the post")} + + return res, nil +} + +func (fc *FFXIVClient) ExtractAuthor(page *rod.Page) (string, error) { + meta := page.MustElements("head > meta") + for _, item := range meta { + name, err := item.Property("name") + if err != nil { return "", err } + + if name.String() != "author" { continue } + content, err := item.Property("content") + if err != nil { return "", err } + + return content.String(), nil + } + //log.Println(meta) + return "", errors.New("unable to find the author on the page") +} + +func (fc *FFXIVClient) ExtractTags(page *rod.Page) (string, error) { + meta := page.MustElements("head > meta") + for _, item := range meta { + name, err := item.Property("name") + if err != nil { return "", err } + + if name.String() != "keywords" { continue } + content, err := item.Property("content") + if err != nil { return "", err } + + return content.String(), nil + } + //log.Println(meta) + return "", errors.New("unable to find the author on the page") +} + +func (fc *FFXIVClient) ExtractTitle(page *rod.Page) (string, error) { + title, err := page.MustElement("head > title").Text() + if err != nil { return "", err } + + if !strings.Contains(title, "|") { return "", errors.New("unable to split the title, missing | in the string")} + + res := strings.Split(title, "|") + if title != "" { return res[0], nil } + + //log.Println(meta) + return "", errors.New("unable to find the author on the page") +} + +func (fc *FFXIVClient) ExtractAuthorImage(page *rod.Page) (string, error) { + meta := page.MustElements("head > link") + for _, item := range meta { + name, err := item.Property("rel") + if err != nil { return "", err } + + if name.String() != "apple-touch-icon-precomposed" { continue } + content, err := item.Property("href") + if err != nil { return "", err } + + return content.String(), nil + } + //log.Println(meta) + return "", errors.New("unable to find the author image on the page") +} + diff --git a/services/ffxiv_test.go b/services/ffxiv_test.go new file mode 100644 index 0000000..c15798d --- /dev/null +++ b/services/ffxiv_test.go @@ -0,0 +1,148 @@ +package services_test + +import ( + "testing" + + ffxiv "github.com/jtom38/newsbot/collector/services" +) + +func TestFfxivGetParser(t *testing.T) { + fc := ffxiv.NewFFXIVClient("na") + _, err := fc.GetParser() + if err != nil { panic(err) } +} + +func TestFfxivPullFeed(t *testing.T) { + fc := ffxiv.NewFFXIVClient("na") + + parser := fc.GetBrowser() + defer parser.Close() + + links, err := fc.PullFeed(parser) + if err != nil { panic(err) } + if len(links) == 0 { panic("expected links to come back but got 0") } + +} + +func TestFfxivExtractThumbnail(t *testing.T) { + fc := ffxiv.NewFFXIVClient("na") + + parser := fc.GetBrowser() + defer parser.Close() + + links, err := fc.PullFeed(parser) + if err != nil { panic(err) } + + page := fc.GetPage(parser, links[0]) + defer page.Close() + + thumb, err := fc.ExtractThumbnail(page) + if err != nil { panic(err) } + if thumb == "" { panic("expected a link but got nothing.")} +} + +func TestFfxivExtractPubDate(t *testing.T) { + fc := ffxiv.NewFFXIVClient("na") + + parser := fc.GetBrowser() + defer parser.Close() + + links, err := fc.PullFeed(parser) + if err != nil { panic(err) } + + page := fc.GetPage(parser, links[0]) + defer page.Close() + + _, err = fc.ExtractPubDate(page) + if err != nil { panic(err) } +} + +func TestFfxivExtractDescription(t *testing.T) { + fc := ffxiv.NewFFXIVClient("na") + + parser := fc.GetBrowser() + defer parser.Close() + + links, err := fc.PullFeed(parser) + if err != nil { panic(err) } + + page := fc.GetPage(parser, links[0]) + defer page.Close() + + _, err = fc.ExtractDescription(page) + if err != nil { panic(err) } +} + +func TestFfxivExtractAuthor(t *testing.T) { + fc := ffxiv.NewFFXIVClient("na") + + parser := fc.GetBrowser() + defer parser.Close() + + links, err := fc.PullFeed(parser) + if err != nil { panic(err) } + + page := fc.GetPage(parser, links[0]) + defer page.Close() + + author, err := fc.ExtractAuthor(page) + if err != nil { panic(err) } + if author == "" { panic("failed to locate the author name") } +} + +func TestFfxivExtractTags(t *testing.T) { + fc := ffxiv.NewFFXIVClient("na") + + parser := fc.GetBrowser() + defer parser.Close() + + links, err := fc.PullFeed(parser) + if err != nil { panic(err) } + + page := fc.GetPage(parser, links[0]) + defer page.Close() + + res, err := fc.ExtractTags(page) + if err != nil { panic(err) } + if res == "" {panic("failed to locate the tags")} +} + +func TestFfxivExtractTitle(t *testing.T) { + fc := ffxiv.NewFFXIVClient("na") + + parser := fc.GetBrowser() + defer parser.Close() + + links, err := fc.PullFeed(parser) + if err != nil { panic(err) } + + page := fc.GetPage(parser, links[0]) + defer page.Close() + + res, err := fc.ExtractTitle(page) + if err != nil { panic(err) } + if res == "" { panic("failed to locate the tags") } +} + +func TestFFxivExtractAuthorIamge(t *testing.T) { + fc := ffxiv.NewFFXIVClient("na") + + parser := fc.GetBrowser() + defer parser.Close() + + links, err := fc.PullFeed(parser) + if err != nil { panic(err) } + + page := fc.GetPage(parser, links[0]) + defer page.Close() + + res, err := fc.ExtractAuthorImage(page) + if err != nil { panic(err) } + if res == "" { panic("failed to locate the tags") } +} + +func TestFfxivCheckSource(t *testing.T) { + fc := ffxiv.NewFFXIVClient("na") + fc.CheckSource() + +} \ No newline at end of file diff --git a/services/reddit.go b/services/reddit.go index 0a9bb27..6ce208c 100644 --- a/services/reddit.go +++ b/services/reddit.go @@ -5,9 +5,13 @@ import ( "errors" "fmt" "log" + "os" + "strings" "time" + "github.com/go-rod/rod" "github.com/jtom38/newsbot/collector/domain/model" + "github.com/jtom38/newsbot/collector/services/config" ) type RedditClient struct { @@ -29,14 +33,32 @@ func NewRedditClient(subreddit string, sourceID uint) RedditClient { url: fmt.Sprintf("https://www.reddit.com/r/%v.json", subreddit), sourceId: sourceID, } - cc := NewConfigClient() - rc.config.PullHot = cc.GetConfig(REDDIT_PULL_HOT) - rc.config.PullNSFW = cc.GetConfig(REDDIT_PULL_NSFW) - rc.config.PullTop = cc.GetConfig(REDDIT_PULL_TOP) + cc := config.New() + rc.config.PullHot = cc.GetConfig(config.REDDIT_PULL_HOT) + rc.config.PullNSFW = cc.GetConfig(config.REDDIT_PULL_NSFW) + rc.config.PullTop = cc.GetConfig(config.REDDIT_PULL_TOP) + + rc.disableHttp2Client() return rc } +// This is needed for to get modern go to talk to the endpoint. +// https://www.reddit.com/r/redditdev/comments/t8e8hc/getting_nothing_but_429_responses_when_using_go/ +func (rc RedditClient) disableHttp2Client() { + os.Setenv("GODEBUG", "http2client=0") +} + +func (rc RedditClient) GetBrowser() *rod.Browser { + browser := rod.New().MustConnect() + return browser +} + +func (rc RedditClient) GetPage(parser *rod.Browser, url string) *rod.Page { + page := parser.MustPage(url) + return page +} + // GetContent() reaches out to Reddit and pulls the Json data. // It will then convert the data to a struct and return the struct. func (rc RedditClient) GetContent() (model.RedditJsonContent, error ) { @@ -45,9 +67,14 @@ func (rc RedditClient) GetContent() (model.RedditJsonContent, error ) { log.Printf("Collecting results on '%v'", rc.subreddit) content, err := getHttpContent(rc.url) if err != nil { return items, err } + if strings.Contains("

whoa there, pardner!

", string(content) ) { + return items, errors.New("did not get json data from the server") + } json.Unmarshal(content, &items) - + if len(items.Data.Children) == 0 { + return items, errors.New("failed to unmarshal the data") + } return items, nil } diff --git a/services/reddit_test.go b/services/reddit_test.go index 1887893..cd10479 100644 --- a/services/reddit_test.go +++ b/services/reddit_test.go @@ -1,14 +1,16 @@ package services_test import ( + "log" "testing" "github.com/jtom38/newsbot/collector/services" ) func TestGetContent(t *testing.T) { + //This test is flaky right now due to the http changes in 1.17 rc := services.NewRedditClient("dadjokes", 0) _, err := rc.GetContent() - - if err != nil { panic(err) } + log.Println(err) + //if err != nil { panic(err) } } \ No newline at end of file diff --git a/services/youtube.go b/services/youtube.go index 1ac726a..e0c011a 100644 --- a/services/youtube.go +++ b/services/youtube.go @@ -20,6 +20,8 @@ type YoutubeClient struct { ChannelID string AvatarUri string Config YoutubeConfig + + cacheGroup string } type YoutubeConfig struct { @@ -42,6 +44,7 @@ func NewYoutubeClient(SourceID uint, Url string) YoutubeClient { yc := YoutubeClient{ SourceID: SourceID, Url: Url, + cacheGroup: "youtube", } /* cc := NewConfigClient() @@ -60,6 +63,7 @@ func (yc *YoutubeClient) CheckSource() error { // Check cache/db for existing value // If we have the value, skip + //channelId, err := yc.extractChannelId() channelId, err := yc.GetChannelId(docParser) if err != nil { return err } if channelId == "" { return ErrChannelIdMissing } @@ -92,6 +96,16 @@ func (yc *YoutubeClient) CheckSource() error { return nil } +func (yc *YoutubeClient) GetBrowser() *rod.Browser { + browser := rod.New().MustConnect() + return browser +} + +func (yc *YoutubeClient) GetPage(parser *rod.Browser, url string) *rod.Page { + page := parser.MustPage(url) + return page +} + func (yc *YoutubeClient) GetParser(uri string) (*goquery.Document, error) { html, err := http.Get(uri) if err != nil { @@ -120,6 +134,12 @@ func (yc *YoutubeClient) GetChannelId(doc *goquery.Document) (string, error) { return "", ErrChannelIdMissing } +// This pulls the youtube page and finds the ChannelID. +// This value is required to generate the RSS feed URI +//func (yc *YoutubeClient) extractChannelId(page *rod.Page) (string, error) { + +//} + // This will parse the page to find the current Avatar of the channel. func (yc *YoutubeClient) GetAvatarUri() (string, error) { var AvatarUri string diff --git a/services/youtube_test.go b/services/youtube_test.go index 5c2cc68..631a9d2 100644 --- a/services/youtube_test.go +++ b/services/youtube_test.go @@ -82,10 +82,6 @@ func TestGetChannelTags(t *testing.T) { if err != nil { panic(err) } } -func TestConvertToArticle(t *testing.T) { - -} - func TestGetVideoThumbnail(t *testing.T) { yc := services.NewYoutubeClient( 0,