newsbot-api/internal/services/input/ffxiv.go

304 lines
6.6 KiB
Go

package input
import (
"errors"
"log"
"net/http"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/go-rod/rod"
"github.com/go-rod/rod/lib/launcher"
"github.com/google/uuid"
"git.jamestombleson.com/jtom38/newsbot-api/internal/entity"
"git.jamestombleson.com/jtom38/newsbot-api/internal/services/cache"
)
// Lodestone endpoints and the date layout used when collecting FFXIV news.
const (
// FFXIV_NA_FEED_URL is the Lodestone landing page on the "na" subdomain.
FFXIV_NA_FEED_URL string = "https://na.finalfantasyxiv.com/lodestone/"
// FFXIV_JP_FEED_URL is the Lodestone landing page on the "jp" subdomain.
FFXIV_JP_FEED_URL string = "https://jp.finalfantasyxiv.com/lodestone/"
// FFXIV_TIME_FORMAT is the time.Parse layout ExtractPubDate applies to the
// publish date text: month/day/year hour:minute AM/PM, without zero padding.
FFXIV_TIME_FORMAT string = "1/2/2006 3:4 PM"
)
// FFXIVClient collects articles for a single FFXIV Lodestone source record.
type FFXIVClient struct {
// record is the source being checked: its Url is the page that gets loaded
// and its ID is stamped on every article that is collected.
record entity.SourceEntity
//SourceID uint
//Url string
//Region string
// cacheGroup is the group name passed to cache.NewCacheClient when looking
// up links that were already seen.
cacheGroup string
}
// NewFFXIVClient builds a client bound to the given source record. The client
// uses the "ffxiv" cache group.
func NewFFXIVClient(Record entity.SourceEntity) FFXIVClient {
	var client FFXIVClient
	client.record = Record
	client.cacheGroup = "ffxiv"
	return client
}
// CheckSource opens the source page in a browser, gathers the topic links
// that are not in the cache yet, and converts each of them into an article.
//
// It returns the articles collected so far together with the first error that
// was encountered. A link is only written to the cache after its article has
// been extracted successfully.
func (fc *FFXIVClient) CheckSource() ([]entity.ArticleEntity, error) {
	var articles []entity.ArticleEntity

	// GetBrowser returns nil when no browser binary can be located; stop here
	// instead of dereferencing a nil browser further down.
	browser := fc.GetBrowser()
	if browser == nil {
		return articles, errors.New("unable to find a browser to load the ffxiv feed")
	}
	defer browser.Close()

	links, err := fc.PullFeed(browser)
	if err != nil {
		return articles, err
	}

	// Named seen so the local does not shadow the imported cache package.
	seen := cache.NewCacheClient(fc.cacheGroup)
	for _, link := range links {
		// A cache hit means the link was already processed; skip it.
		if _, err := seen.FindByValue(link); err == nil {
			continue
		}

		article, err := fc.collectArticle(browser, link)
		if err != nil {
			return articles, err
		}

		log.Printf("Collected '%v' from '%v'", article.Title, article.Url)
		seen.Insert(uuid.New().String(), link)
		articles = append(articles, article)
	}

	return articles, nil
}

// collectArticle loads one topic page and extracts every article field from
// it. The page is closed before returning so that open pages do not pile up
// while CheckSource walks the list of links.
func (fc *FFXIVClient) collectArticle(browser *rod.Browser, link string) (entity.ArticleEntity, error) {
	page := fc.GetPage(browser, link)
	defer func() {
		if err := page.Close(); err != nil {
			log.Printf("Unable to close the page for '%v': %v", link, err)
		}
	}()

	title, err := fc.ExtractTitle(page)
	if err != nil {
		return entity.ArticleEntity{}, err
	}

	thumb, err := fc.ExtractThumbnail(page)
	if err != nil {
		return entity.ArticleEntity{}, err
	}

	pubDate, err := fc.ExtractPubDate(page)
	if err != nil {
		return entity.ArticleEntity{}, err
	}

	description, err := fc.ExtractDescription(page)
	if err != nil {
		return entity.ArticleEntity{}, err
	}

	authorName, err := fc.ExtractAuthor(page)
	if err != nil {
		return entity.ArticleEntity{}, err
	}

	authorImage, err := fc.ExtractAuthorImage(page)
	if err != nil {
		return entity.ArticleEntity{}, err
	}

	tags, err := fc.ExtractTags(page)
	if err != nil {
		return entity.ArticleEntity{}, err
	}

	return entity.ArticleEntity{
		SourceID:       fc.record.ID,
		Tags:           tags,
		Title:          title,
		Url:            link,
		PubDate:        pubDate,
		Thumbnail:      thumb,
		Description:    description,
		AuthorName:     authorName,
		AuthorImageUrl: authorImage,
	}, nil
}
// GetParser downloads the source URL with a plain HTTP request and parses the
// response body into a goquery document.
//
// An error is returned when the request fails, when the server answers with a
// status outside the 2xx range, or when the body cannot be parsed.
func (fc *FFXIVClient) GetParser() (*goquery.Document, error) {
	// http.Get uses a client without a timeout; bound the request so that a
	// stalled server cannot block the caller indefinitely.
	client := &http.Client{Timeout: 30 * time.Second}

	resp, err := client.Get(fc.record.Url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// Do not feed an error page to the parser as if it were the real content.
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		return nil, errors.New("unexpected response status: " + resp.Status)
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return nil, err
	}
	return doc, nil
}
// GetBrowser launches the browser binary found by launcher.LookPath and
// returns a connection to it. It returns nil when no binary is found.
// MustLaunch and MustConnect panic if launching or connecting fails.
func (fc *FFXIVClient) GetBrowser() *rod.Browser {
	binPath, found := launcher.LookPath()
	if !found {
		return nil
	}

	controlURL := launcher.New().Bin(binPath).MustLaunch()
	return rod.New().ControlURL(controlURL).MustConnect()
}
// PullFeed loads the source page and returns the href of every list entry
// whose URL contains "topics".
//
// Entries without an anchor or without a readable href are logged and
// skipped. A missing browser, a page that cannot be opened, or a news list
// that cannot be located is reported through the returned error.
func (fc *FFXIVClient) PullFeed(parser *rod.Browser) ([]string, error) {
	if parser == nil {
		return nil, errors.New("no browser available to pull the ffxiv feed")
	}

	// MustPage panics on failure; rod.Try converts that panic into an error
	// so it can be returned like every other failure in this method.
	var page *rod.Page
	if err := rod.Try(func() {
		page = parser.MustPage(fc.record.Url)
	}); err != nil {
		return nil, err
	}
	defer page.Close()

	// find the list by xpath
	list, err := page.ElementX("/html/body/div[3]/div/div/div[1]/div[2]/div[1]/div[2]/ul")
	if err != nil {
		return nil, err
	}

	// find all the li items
	items, err := list.Elements("li")
	if err != nil {
		return nil, err
	}

	var links []string
	for _, item := range items {
		// in each li, find the a items
		anchor, err := item.Element("a")
		if err != nil {
			log.Println("Unable to find the a item, skipping")
			continue
		}

		// find the href behind the a
		href, err := anchor.Property("href")
		if err != nil {
			log.Println("Unable to find a href link, skipping")
			continue
		}

		// Only topic posts are collected; other list entries are ignored.
		if target := href.String(); strings.Contains(target, "topics") {
			links = append(links, target)
		}
	}

	return links, nil
}
// GetPage opens url in a new page of the given browser and returns that page.
// The page is not closed here, so the caller has to close it. MustPage panics
// when the page cannot be opened.
func (fc *FFXIVClient) GetPage(parser *rod.Browser, url string) *rod.Page {
	return parser.MustPage(url)
}
// ExtractThumbnail returns the src of the article's header image.
//
// An error is returned when the image element or its src property cannot be
// read, or when the src is empty.
func (fc *FFXIVClient) ExtractThumbnail(page *rod.Page) (string, error) {
	img, err := page.ElementX("/html/body/div[3]/div[2]/div[1]/article/div[1]/img")
	if err != nil {
		return "", err
	}

	src, err := img.Property("src")
	if err != nil {
		return "", err
	}

	thumbnail := src.String()
	if thumbnail == "" {
		return "", errors.New("unable to find thumbnail")
	}
	return thumbnail, nil
}
// ExtractPubDate reads the text of the ".news__ic--topics" element and parses
// it with FFXIV_TIME_FORMAT.
//
// On failure the current time is returned together with the error, matching
// what callers of this method have always received.
func (fc *FFXIVClient) ExtractPubDate(page *rod.Page) (time.Time, error) {
	el, err := page.Element(".news__ic--topics")
	if err != nil {
		return time.Now(), err
	}

	text, err := el.Text()
	if err != nil {
		return time.Now(), err
	}

	// Surrounding whitespace would make time.Parse reject an otherwise valid
	// date, so strip it before parsing.
	stringDate := strings.TrimSpace(text)
	if stringDate == "" {
		return time.Now(), errors.New("unable to locate the publish date on the post")
	}

	pubDate, err := time.Parse(FFXIV_TIME_FORMAT, stringDate)
	if err != nil {
		return time.Now(), err
	}
	return pubDate, nil
}
// ExtractDescription returns the text of the ".news__detail__wrapper" element.
//
// An error is returned when the element or its text cannot be read, or when
// the text is empty.
func (fc *FFXIVClient) ExtractDescription(page *rod.Page) (string, error) {
	wrapper, err := page.Element(".news__detail__wrapper")
	if err != nil {
		return "", err
	}

	res, err := wrapper.Text()
	if err != nil {
		return "", err
	}

	if res == "" {
		return "", errors.New("unable to locate the description on the post")
	}
	return res, nil
}
// ExtractAuthor returns the content of the first <meta> tag in <head> whose
// name is "author".
//
// An error is returned when the meta tags or their properties cannot be read,
// or when no such tag exists.
func (fc *FFXIVClient) ExtractAuthor(page *rod.Page) (string, error) {
	metas, err := page.Elements("head > meta")
	if err != nil {
		return "", err
	}

	for _, item := range metas {
		name, err := item.Property("name")
		if err != nil {
			return "", err
		}
		if name.String() != "author" {
			continue
		}

		content, err := item.Property("content")
		if err != nil {
			return "", err
		}
		return content.String(), nil
	}

	return "", errors.New("unable to find the author on the page")
}
// ExtractTags returns the content of the first <meta> tag in <head> whose
// name is "keywords".
//
// An error is returned when the meta tags or their properties cannot be read,
// or when no such tag exists.
func (fc *FFXIVClient) ExtractTags(page *rod.Page) (string, error) {
	metas, err := page.Elements("head > meta")
	if err != nil {
		return "", err
	}

	for _, item := range metas {
		name, err := item.Property("name")
		if err != nil {
			return "", err
		}
		if name.String() != "keywords" {
			continue
		}

		content, err := item.Property("content")
		if err != nil {
			return "", err
		}
		return content.String(), nil
	}

	// The message used to mention the author, copied from ExtractAuthor.
	return "", errors.New("unable to find the tags on the page")
}
// ExtractTitle returns the part of the document <title> that precedes the
// first "|", with surrounding whitespace removed.
//
// An error is returned when the title element or its text cannot be read,
// when the text contains no "|", or when nothing but whitespace precedes it.
func (fc *FFXIVClient) ExtractTitle(page *rod.Page) (string, error) {
	el, err := page.Element("head > title")
	if err != nil {
		return "", err
	}

	title, err := el.Text()
	if err != nil {
		return "", err
	}

	sep := strings.Index(title, "|")
	if sep < 0 {
		return "", errors.New("unable to split the title, missing | in the string")
	}

	// Drop the padding that sits between the article title and the separator.
	name := strings.TrimSpace(title[:sep])
	if name == "" {
		return "", errors.New("unable to find the title on the page")
	}
	return name, nil
}
// ExtractAuthorImage returns the href of the first <link> tag in <head> whose
// rel is "apple-touch-icon-precomposed".
//
// An error is returned when the link tags or their properties cannot be read,
// or when no such tag exists.
func (fc *FFXIVClient) ExtractAuthorImage(page *rod.Page) (string, error) {
	links, err := page.Elements("head > link")
	if err != nil {
		return "", err
	}

	for _, item := range links {
		rel, err := item.Property("rel")
		if err != nil {
			return "", err
		}
		if rel.String() != "apple-touch-icon-precomposed" {
			continue
		}

		href, err := item.Property("href")
		if err != nil {
			return "", err
		}
		return href.String(), nil
	}

	return "", errors.New("unable to find the author image on the page")
}