James Tombleson
11892b9a7b
* starting the ffxiv reader * working on getting the standard interface for sources based on the work for ffxiv * got more of ffxiv working and updated tests * Author and Description can be extracted and validated with tests * added uuid package * ffxiv core logic is working and tests updated to reflect it. * Updated the scheduler with the current sources and moved them from main * updated reddit to allow modern go to talk to the endpoint with a debug flag * gave the func a better name * cleaned up main * Moved cache to its own package and updated tests * moved config to its own package and added basic tests * updated imports * minor update * interface update and cache model update * updated the scheduler for basic services. No DB calls yet * updated db calls * bypassed the reddit test as it's flaky in github
259 lines
6.4 KiB
Go
259 lines
6.4 KiB
Go
package services
|
|
|
|
import (
|
|
"errors"
|
|
"log"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
"github.com/go-rod/rod"
|
|
"github.com/google/uuid"
|
|
|
|
"github.com/jtom38/newsbot/collector/domain/model"
|
|
"github.com/jtom38/newsbot/collector/services/cache"
|
|
)
|
|
|
|
// Lodestone landing pages that are scraped for topic links, keyed by the
// region codes accepted by NewFFXIVClient.
const (
	// FFXIV_NA_FEED_URL is the feed page used for the "na" region.
	FFXIV_NA_FEED_URL string = "https://na.finalfantasyxiv.com/lodestone/"
	// FFXIV_JP_FEED_URL is the feed page used for the "jp" region.
	FFXIV_JP_FEED_URL string = "https://jp.finalfantasyxiv.com/lodestone/"
)

// FFXIV_TIME_FORMAT is the time.Parse layout applied to the publish date text
// read from a topic page.
const FFXIV_TIME_FORMAT string = "1/2/2006 3:4 PM"
|
|
|
|
// FFXIVClient scrapes Final Fantasy XIV Lodestone topic posts and turns them
// into articles.
type FFXIVClient struct {
	// SourceID is copied onto every article produced by CheckSource.
	SourceID uint
	// Url is the feed page to scrape; Region is the region code the client
	// was created with.
	Url, Region string

	// cacheGroup is the group name handed to the cache client that tracks
	// links which were already collected.
	cacheGroup string
}
|
|
|
|
func NewFFXIVClient(region string) FFXIVClient {
|
|
var url string
|
|
|
|
switch region {
|
|
case "na":
|
|
url = FFXIV_NA_FEED_URL
|
|
case "jp":
|
|
url = FFXIV_JP_FEED_URL
|
|
}
|
|
|
|
return FFXIVClient{
|
|
Region: region,
|
|
Url: url,
|
|
cacheGroup: "ffxiv",
|
|
}
|
|
}
|
|
|
|
func (fc *FFXIVClient) CheckSource() ([]model.Articles, error) {
|
|
var articles []model.Articles
|
|
|
|
parser := fc.GetBrowser()
|
|
defer parser.Close()
|
|
|
|
links, err := fc.PullFeed(parser)
|
|
if err != nil { return articles, err }
|
|
|
|
cache := cache.NewCacheClient(fc.cacheGroup)
|
|
|
|
for _, link := range links {
|
|
// Check cache/db if this link has been seen already, skip
|
|
_, err := cache.FindByValue(link)
|
|
if err == nil { continue }
|
|
|
|
|
|
page := fc.GetPage(parser, link)
|
|
|
|
title, err := fc.ExtractTitle(page)
|
|
if err != nil { return articles, err }
|
|
|
|
thumb, err := fc.ExtractThumbnail(page)
|
|
if err != nil { return articles, err }
|
|
|
|
pubDate, err := fc.ExtractPubDate(page)
|
|
if err != nil { return articles, err }
|
|
|
|
description, err := fc.ExtractDescription(page)
|
|
if err != nil { return articles, err }
|
|
|
|
authorName, err := fc.ExtractAuthor(page)
|
|
if err != nil { return articles, err }
|
|
|
|
authorImage, err := fc.ExtractAuthorImage(page)
|
|
if err != nil { return articles, err }
|
|
|
|
tags, err := fc.ExtractTags(page)
|
|
if err != nil { return articles, err }
|
|
|
|
article := model.Articles{
|
|
SourceID: fc.SourceID,
|
|
Tags: tags,
|
|
Title: title,
|
|
Url: link,
|
|
PubDate: pubDate,
|
|
Video: "",
|
|
VideoHeight: 0,
|
|
VideoWidth: 0,
|
|
Thumbnail: thumb,
|
|
Description: description,
|
|
AuthorName: authorName,
|
|
AuthorImage: authorImage,
|
|
}
|
|
log.Printf("Collected '%v' from '%v'", article.Title, article.Url)
|
|
|
|
cache.Insert(uuid.New().String(), link)
|
|
|
|
articles = append(articles, article)
|
|
}
|
|
|
|
return articles, nil
|
|
}
|
|
|
|
func (fc *FFXIVClient) GetParser() (*goquery.Document, error) {
|
|
html, err := http.Get(fc.Url)
|
|
if err != nil { return nil, err }
|
|
defer html.Body.Close()
|
|
|
|
doc, err := goquery.NewDocumentFromReader(html.Body)
|
|
if err != nil { return nil, err }
|
|
return doc, nil
|
|
}
|
|
|
|
func (fc *FFXIVClient) GetBrowser() (*rod.Browser) {
|
|
browser := rod.New().MustConnect()
|
|
return browser
|
|
}
|
|
|
|
func (fc *FFXIVClient) PullFeed(parser *rod.Browser) ([]string, error) {
|
|
var links []string
|
|
|
|
page := parser.MustPage(fc.Url)
|
|
defer page.Close()
|
|
|
|
// find the list by xpath
|
|
res := page.MustElementX("/html/body/div[3]/div/div/div[1]/div[2]/div[1]/div[2]/ul")
|
|
|
|
// find all the li items
|
|
items := res.MustElements("li")
|
|
|
|
for _, item := range items {
|
|
// in each li, find the a items
|
|
a, err := item.Element("a")
|
|
if err != nil {
|
|
log.Println("Unable to find the a item, skipping")
|
|
continue
|
|
}
|
|
|
|
// find the href behind the a
|
|
url, err := a.Property("href")
|
|
if err != nil {
|
|
log.Println("Unable to find a href link, skipping")
|
|
continue
|
|
}
|
|
|
|
urlString := url.String()
|
|
isTopic := strings.Contains(urlString, "topics")
|
|
if isTopic {
|
|
links = append(links, urlString)
|
|
}
|
|
}
|
|
|
|
return links, nil
|
|
}
|
|
|
|
func (rc *FFXIVClient) GetPage(parser *rod.Browser, url string) *rod.Page {
|
|
page := parser.MustPage(url)
|
|
return page
|
|
}
|
|
|
|
func (fc *FFXIVClient) ExtractThumbnail(page *rod.Page) (string, error) {
|
|
thumbnail := page.MustElementX("/html/body/div[3]/div[2]/div[1]/article/div[1]/img").MustProperty("src").String()
|
|
if thumbnail == "" { return "", errors.New("unable to find thumbnail")}
|
|
|
|
title := page.MustElement(".news__header > h1:nth-child(2)").MustText()
|
|
log.Println(title)
|
|
|
|
return thumbnail, nil
|
|
}
|
|
|
|
func (fc *FFXIVClient) ExtractPubDate(page *rod.Page) (time.Time, error) {
|
|
stringDate := page.MustElement(".news__ic--topics").MustText()
|
|
if stringDate == "" { return time.Now(), errors.New("unable to locate the publish date on the post")}
|
|
|
|
PubDate, err := time.Parse(FFXIV_TIME_FORMAT, stringDate)
|
|
if err != nil { return time.Now(), err }
|
|
|
|
return PubDate, nil
|
|
}
|
|
|
|
func (fc *FFXIVClient) ExtractDescription(page *rod.Page) (string, error) {
|
|
res := page.MustElement(".news__detail__wrapper").MustText()
|
|
if res == "" { return "", errors.New("unable to locate the description on the post")}
|
|
|
|
return res, nil
|
|
}
|
|
|
|
func (fc *FFXIVClient) ExtractAuthor(page *rod.Page) (string, error) {
|
|
meta := page.MustElements("head > meta")
|
|
for _, item := range meta {
|
|
name, err := item.Property("name")
|
|
if err != nil { return "", err }
|
|
|
|
if name.String() != "author" { continue }
|
|
content, err := item.Property("content")
|
|
if err != nil { return "", err }
|
|
|
|
return content.String(), nil
|
|
}
|
|
//log.Println(meta)
|
|
return "", errors.New("unable to find the author on the page")
|
|
}
|
|
|
|
func (fc *FFXIVClient) ExtractTags(page *rod.Page) (string, error) {
|
|
meta := page.MustElements("head > meta")
|
|
for _, item := range meta {
|
|
name, err := item.Property("name")
|
|
if err != nil { return "", err }
|
|
|
|
if name.String() != "keywords" { continue }
|
|
content, err := item.Property("content")
|
|
if err != nil { return "", err }
|
|
|
|
return content.String(), nil
|
|
}
|
|
//log.Println(meta)
|
|
return "", errors.New("unable to find the author on the page")
|
|
}
|
|
|
|
func (fc *FFXIVClient) ExtractTitle(page *rod.Page) (string, error) {
|
|
title, err := page.MustElement("head > title").Text()
|
|
if err != nil { return "", err }
|
|
|
|
if !strings.Contains(title, "|") { return "", errors.New("unable to split the title, missing | in the string")}
|
|
|
|
res := strings.Split(title, "|")
|
|
if title != "" { return res[0], nil }
|
|
|
|
//log.Println(meta)
|
|
return "", errors.New("unable to find the author on the page")
|
|
}
|
|
|
|
func (fc *FFXIVClient) ExtractAuthorImage(page *rod.Page) (string, error) {
|
|
meta := page.MustElements("head > link")
|
|
for _, item := range meta {
|
|
name, err := item.Property("rel")
|
|
if err != nil { return "", err }
|
|
|
|
if name.String() != "apple-touch-icon-precomposed" { continue }
|
|
content, err := item.Property("href")
|
|
if err != nil { return "", err }
|
|
|
|
return content.String(), nil
|
|
}
|
|
//log.Println(meta)
|
|
return "", errors.New("unable to find the author image on the page")
|
|
}
|
|
|