2022-06-30 14:54:58 -07:00
|
|
|
package input
|
2022-04-29 13:02:25 -07:00
|
|
|
|
|
|
|
import (
|
2022-06-08 21:17:08 -07:00
|
|
|
"database/sql"
|
2022-04-29 13:02:25 -07:00
|
|
|
"errors"
|
|
|
|
"log"
|
|
|
|
"net/http"
|
|
|
|
"strings"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
|
|
"github.com/go-rod/rod"
|
2022-07-14 14:27:40 -07:00
|
|
|
"github.com/go-rod/rod/lib/launcher"
|
2022-04-29 13:02:25 -07:00
|
|
|
"github.com/google/uuid"
|
|
|
|
|
2022-06-08 21:17:08 -07:00
|
|
|
"github.com/jtom38/newsbot/collector/database"
|
2022-04-29 13:02:25 -07:00
|
|
|
"github.com/jtom38/newsbot/collector/services/cache"
|
|
|
|
)
|
|
|
|
|
|
|
|
// Lodestone endpoints and the timestamp layout used by the FFXIV collector.
const (
	// FFXIV_NA_FEED_URL is the Lodestone landing page on the "na" host.
	FFXIV_NA_FEED_URL string = "https://na.finalfantasyxiv.com/lodestone/"

	// FFXIV_JP_FEED_URL is the Lodestone landing page on the "jp" host.
	FFXIV_JP_FEED_URL string = "https://jp.finalfantasyxiv.com/lodestone/"

	// FFXIV_TIME_FORMAT is the time.Parse layout applied to a post's publish
	// date: month/day/year followed by a 12-hour clock with an AM/PM marker,
	// none of the fields zero-padded.
	FFXIV_TIME_FORMAT string = "1/2/2006 3:4 PM"
)
|
|
|
|
|
|
|
|
type FFXIVClient struct {
|
2022-06-08 21:17:08 -07:00
|
|
|
record database.Source
|
|
|
|
//SourceID uint
|
|
|
|
//Url string
|
|
|
|
//Region string
|
2022-04-29 13:02:25 -07:00
|
|
|
|
|
|
|
cacheGroup string
|
|
|
|
}
|
|
|
|
|
2022-06-08 21:17:08 -07:00
|
|
|
func NewFFXIVClient(Record database.Source) FFXIVClient {
|
2022-04-29 13:02:25 -07:00
|
|
|
return FFXIVClient{
|
2022-06-08 21:17:08 -07:00
|
|
|
record: Record,
|
2022-04-29 13:02:25 -07:00
|
|
|
cacheGroup: "ffxiv",
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-06-08 21:17:08 -07:00
|
|
|
func (fc *FFXIVClient) CheckSource() ([]database.Article, error) {
|
|
|
|
var articles []database.Article
|
2022-04-29 13:02:25 -07:00
|
|
|
|
|
|
|
parser := fc.GetBrowser()
|
|
|
|
defer parser.Close()
|
|
|
|
|
|
|
|
links, err := fc.PullFeed(parser)
|
|
|
|
if err != nil { return articles, err }
|
|
|
|
|
|
|
|
cache := cache.NewCacheClient(fc.cacheGroup)
|
|
|
|
|
|
|
|
for _, link := range links {
|
|
|
|
// Check cache/db if this link has been seen already, skip
|
|
|
|
_, err := cache.FindByValue(link)
|
|
|
|
if err == nil { continue }
|
|
|
|
|
|
|
|
|
|
|
|
page := fc.GetPage(parser, link)
|
|
|
|
|
|
|
|
title, err := fc.ExtractTitle(page)
|
|
|
|
if err != nil { return articles, err }
|
|
|
|
|
|
|
|
thumb, err := fc.ExtractThumbnail(page)
|
|
|
|
if err != nil { return articles, err }
|
|
|
|
|
|
|
|
pubDate, err := fc.ExtractPubDate(page)
|
|
|
|
if err != nil { return articles, err }
|
|
|
|
|
|
|
|
description, err := fc.ExtractDescription(page)
|
|
|
|
if err != nil { return articles, err }
|
|
|
|
|
|
|
|
authorName, err := fc.ExtractAuthor(page)
|
|
|
|
if err != nil { return articles, err }
|
|
|
|
|
|
|
|
authorImage, err := fc.ExtractAuthorImage(page)
|
|
|
|
if err != nil { return articles, err }
|
|
|
|
|
|
|
|
tags, err := fc.ExtractTags(page)
|
|
|
|
if err != nil { return articles, err }
|
|
|
|
|
2022-06-08 21:17:08 -07:00
|
|
|
article := database.Article{
|
|
|
|
Sourceid: fc.record.ID,
|
2022-04-29 13:02:25 -07:00
|
|
|
Tags: tags,
|
|
|
|
Title: title,
|
|
|
|
Url: link,
|
2022-06-08 21:17:08 -07:00
|
|
|
Pubdate: pubDate,
|
|
|
|
Videoheight: 0,
|
|
|
|
Videowidth: 0,
|
2022-04-29 13:02:25 -07:00
|
|
|
Thumbnail: thumb,
|
|
|
|
Description: description,
|
2022-06-08 21:17:08 -07:00
|
|
|
Authorname: sql.NullString{String: authorName},
|
|
|
|
Authorimage: sql.NullString{String: authorImage},
|
2022-04-29 13:02:25 -07:00
|
|
|
}
|
|
|
|
log.Printf("Collected '%v' from '%v'", article.Title, article.Url)
|
|
|
|
|
|
|
|
cache.Insert(uuid.New().String(), link)
|
|
|
|
|
|
|
|
articles = append(articles, article)
|
|
|
|
}
|
|
|
|
|
|
|
|
return articles, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (fc *FFXIVClient) GetParser() (*goquery.Document, error) {
|
2022-06-08 21:17:08 -07:00
|
|
|
html, err := http.Get(fc.record.Url)
|
2022-04-29 13:02:25 -07:00
|
|
|
if err != nil { return nil, err }
|
|
|
|
defer html.Body.Close()
|
|
|
|
|
|
|
|
doc, err := goquery.NewDocumentFromReader(html.Body)
|
|
|
|
if err != nil { return nil, err }
|
|
|
|
return doc, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (fc *FFXIVClient) GetBrowser() (*rod.Browser) {
|
2022-07-14 14:27:40 -07:00
|
|
|
var browser *rod.Browser
|
|
|
|
if path, exists := launcher.LookPath(); exists {
|
|
|
|
u := launcher.New().Bin(path).MustLaunch()
|
|
|
|
browser = rod.New().ControlURL(u).MustConnect()
|
|
|
|
}
|
2022-04-29 13:02:25 -07:00
|
|
|
return browser
|
|
|
|
}
|
|
|
|
|
|
|
|
func (fc *FFXIVClient) PullFeed(parser *rod.Browser) ([]string, error) {
|
|
|
|
var links []string
|
|
|
|
|
2022-06-08 21:17:08 -07:00
|
|
|
page := parser.MustPage(fc.record.Url)
|
2022-04-29 13:02:25 -07:00
|
|
|
defer page.Close()
|
|
|
|
|
|
|
|
// find the list by xpath
|
|
|
|
res := page.MustElementX("/html/body/div[3]/div/div/div[1]/div[2]/div[1]/div[2]/ul")
|
|
|
|
|
|
|
|
// find all the li items
|
|
|
|
items := res.MustElements("li")
|
|
|
|
|
|
|
|
for _, item := range items {
|
|
|
|
// in each li, find the a items
|
|
|
|
a, err := item.Element("a")
|
|
|
|
if err != nil {
|
|
|
|
log.Println("Unable to find the a item, skipping")
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
// find the href behind the a
|
|
|
|
url, err := a.Property("href")
|
|
|
|
if err != nil {
|
|
|
|
log.Println("Unable to find a href link, skipping")
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
urlString := url.String()
|
|
|
|
isTopic := strings.Contains(urlString, "topics")
|
|
|
|
if isTopic {
|
|
|
|
links = append(links, urlString)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return links, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (rc *FFXIVClient) GetPage(parser *rod.Browser, url string) *rod.Page {
|
|
|
|
page := parser.MustPage(url)
|
|
|
|
return page
|
|
|
|
}
|
|
|
|
|
|
|
|
func (fc *FFXIVClient) ExtractThumbnail(page *rod.Page) (string, error) {
|
|
|
|
thumbnail := page.MustElementX("/html/body/div[3]/div[2]/div[1]/article/div[1]/img").MustProperty("src").String()
|
|
|
|
if thumbnail == "" { return "", errors.New("unable to find thumbnail")}
|
|
|
|
|
|
|
|
title := page.MustElement(".news__header > h1:nth-child(2)").MustText()
|
|
|
|
log.Println(title)
|
|
|
|
|
|
|
|
return thumbnail, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (fc *FFXIVClient) ExtractPubDate(page *rod.Page) (time.Time, error) {
|
|
|
|
stringDate := page.MustElement(".news__ic--topics").MustText()
|
|
|
|
if stringDate == "" { return time.Now(), errors.New("unable to locate the publish date on the post")}
|
|
|
|
|
|
|
|
PubDate, err := time.Parse(FFXIV_TIME_FORMAT, stringDate)
|
|
|
|
if err != nil { return time.Now(), err }
|
|
|
|
|
|
|
|
return PubDate, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (fc *FFXIVClient) ExtractDescription(page *rod.Page) (string, error) {
|
|
|
|
res := page.MustElement(".news__detail__wrapper").MustText()
|
|
|
|
if res == "" { return "", errors.New("unable to locate the description on the post")}
|
|
|
|
|
|
|
|
return res, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (fc *FFXIVClient) ExtractAuthor(page *rod.Page) (string, error) {
|
|
|
|
meta := page.MustElements("head > meta")
|
|
|
|
for _, item := range meta {
|
|
|
|
name, err := item.Property("name")
|
|
|
|
if err != nil { return "", err }
|
|
|
|
|
|
|
|
if name.String() != "author" { continue }
|
|
|
|
content, err := item.Property("content")
|
|
|
|
if err != nil { return "", err }
|
|
|
|
|
|
|
|
return content.String(), nil
|
|
|
|
}
|
|
|
|
//log.Println(meta)
|
|
|
|
return "", errors.New("unable to find the author on the page")
|
|
|
|
}
|
|
|
|
|
|
|
|
func (fc *FFXIVClient) ExtractTags(page *rod.Page) (string, error) {
|
|
|
|
meta := page.MustElements("head > meta")
|
|
|
|
for _, item := range meta {
|
|
|
|
name, err := item.Property("name")
|
|
|
|
if err != nil { return "", err }
|
|
|
|
|
|
|
|
if name.String() != "keywords" { continue }
|
|
|
|
content, err := item.Property("content")
|
|
|
|
if err != nil { return "", err }
|
|
|
|
|
|
|
|
return content.String(), nil
|
|
|
|
}
|
|
|
|
//log.Println(meta)
|
|
|
|
return "", errors.New("unable to find the author on the page")
|
|
|
|
}
|
|
|
|
|
|
|
|
func (fc *FFXIVClient) ExtractTitle(page *rod.Page) (string, error) {
|
|
|
|
title, err := page.MustElement("head > title").Text()
|
|
|
|
if err != nil { return "", err }
|
|
|
|
|
|
|
|
if !strings.Contains(title, "|") { return "", errors.New("unable to split the title, missing | in the string")}
|
|
|
|
|
|
|
|
res := strings.Split(title, "|")
|
|
|
|
if title != "" { return res[0], nil }
|
|
|
|
|
|
|
|
//log.Println(meta)
|
|
|
|
return "", errors.New("unable to find the author on the page")
|
|
|
|
}
|
|
|
|
|
|
|
|
func (fc *FFXIVClient) ExtractAuthorImage(page *rod.Page) (string, error) {
|
|
|
|
meta := page.MustElements("head > link")
|
|
|
|
for _, item := range meta {
|
|
|
|
name, err := item.Property("rel")
|
|
|
|
if err != nil { return "", err }
|
|
|
|
|
|
|
|
if name.String() != "apple-touch-icon-precomposed" { continue }
|
|
|
|
content, err := item.Property("href")
|
|
|
|
if err != nil { return "", err }
|
|
|
|
|
|
|
|
return content.String(), nil
|
|
|
|
}
|
|
|
|
//log.Println(meta)
|
|
|
|
return "", errors.New("unable to find the author image on the page")
|
|
|
|
}
|
|
|
|
|