304 lines
6.6 KiB
Go
304 lines
6.6 KiB
Go
package input
|
|
|
|
import (
|
|
"errors"
|
|
"log"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
"github.com/go-rod/rod"
|
|
"github.com/go-rod/rod/lib/launcher"
|
|
"github.com/google/uuid"
|
|
|
|
"git.jamestombleson.com/jtom38/newsbot-api/internal/entity"
|
|
"git.jamestombleson.com/jtom38/newsbot-api/internal/services/cache"
|
|
)
|
|
|
|
// FFXIV_NA_FEED_URL is the Lodestone landing page on the "na" host.
const FFXIV_NA_FEED_URL string = "https://na.finalfantasyxiv.com/lodestone/"

// FFXIV_JP_FEED_URL is the Lodestone landing page on the "jp" host.
const FFXIV_JP_FEED_URL string = "https://jp.finalfantasyxiv.com/lodestone/"

// FFXIV_TIME_FORMAT is the time.Parse layout used for publish dates:
// month/day/year followed by a 12-hour clock with an AM/PM marker, all
// written without zero padding.
const FFXIV_TIME_FORMAT string = "1/2/2006 3:4 PM"
|
|
|
|
type FFXIVClient struct {
|
|
record entity.SourceEntity
|
|
//SourceID uint
|
|
//Url string
|
|
//Region string
|
|
|
|
cacheGroup string
|
|
}
|
|
|
|
func NewFFXIVClient(Record entity.SourceEntity) FFXIVClient {
|
|
return FFXIVClient{
|
|
record: Record,
|
|
cacheGroup: "ffxiv",
|
|
}
|
|
}
|
|
|
|
func (fc *FFXIVClient) CheckSource() ([]entity.ArticleEntity, error) {
|
|
var articles []entity.ArticleEntity
|
|
|
|
parser := fc.GetBrowser()
|
|
defer parser.Close()
|
|
|
|
links, err := fc.PullFeed(parser)
|
|
if err != nil {
|
|
return articles, err
|
|
}
|
|
|
|
cache := cache.NewCacheClient(fc.cacheGroup)
|
|
|
|
for _, link := range links {
|
|
// Check cache/db if this link has been seen already, skip
|
|
_, err := cache.FindByValue(link)
|
|
if err == nil {
|
|
continue
|
|
}
|
|
|
|
page := fc.GetPage(parser, link)
|
|
|
|
title, err := fc.ExtractTitle(page)
|
|
if err != nil {
|
|
return articles, err
|
|
}
|
|
|
|
thumb, err := fc.ExtractThumbnail(page)
|
|
if err != nil {
|
|
return articles, err
|
|
}
|
|
|
|
pubDate, err := fc.ExtractPubDate(page)
|
|
if err != nil {
|
|
return articles, err
|
|
}
|
|
|
|
description, err := fc.ExtractDescription(page)
|
|
if err != nil {
|
|
return articles, err
|
|
}
|
|
|
|
authorName, err := fc.ExtractAuthor(page)
|
|
if err != nil {
|
|
return articles, err
|
|
}
|
|
|
|
authorImage, err := fc.ExtractAuthorImage(page)
|
|
if err != nil {
|
|
return articles, err
|
|
}
|
|
|
|
tags, err := fc.ExtractTags(page)
|
|
if err != nil {
|
|
return articles, err
|
|
}
|
|
|
|
article := entity.ArticleEntity{
|
|
SourceID: fc.record.ID,
|
|
Tags: tags,
|
|
Title: title,
|
|
Url: link,
|
|
PubDate: pubDate,
|
|
Thumbnail: thumb,
|
|
Description: description,
|
|
AuthorName: authorName,
|
|
AuthorImageUrl: authorImage,
|
|
}
|
|
log.Printf("Collected '%v' from '%v'", article.Title, article.Url)
|
|
|
|
cache.Insert(uuid.New().String(), link)
|
|
|
|
articles = append(articles, article)
|
|
}
|
|
|
|
return articles, nil
|
|
}
|
|
|
|
func (fc *FFXIVClient) GetParser() (*goquery.Document, error) {
|
|
html, err := http.Get(fc.record.Url)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer html.Body.Close()
|
|
|
|
doc, err := goquery.NewDocumentFromReader(html.Body)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return doc, nil
|
|
}
|
|
|
|
func (fc *FFXIVClient) GetBrowser() *rod.Browser {
|
|
var browser *rod.Browser
|
|
if path, exists := launcher.LookPath(); exists {
|
|
u := launcher.New().Bin(path).MustLaunch()
|
|
browser = rod.New().ControlURL(u).MustConnect()
|
|
}
|
|
return browser
|
|
}
|
|
|
|
func (fc *FFXIVClient) PullFeed(parser *rod.Browser) ([]string, error) {
|
|
var links []string
|
|
|
|
page := parser.MustPage(fc.record.Url)
|
|
defer page.Close()
|
|
|
|
// find the list by xpath
|
|
res := page.MustElementX("/html/body/div[3]/div/div/div[1]/div[2]/div[1]/div[2]/ul")
|
|
|
|
// find all the li items
|
|
items := res.MustElements("li")
|
|
|
|
for _, item := range items {
|
|
// in each li, find the a items
|
|
a, err := item.Element("a")
|
|
if err != nil {
|
|
log.Println("Unable to find the a item, skipping")
|
|
continue
|
|
}
|
|
|
|
// find the href behind the a
|
|
url, err := a.Property("href")
|
|
if err != nil {
|
|
log.Println("Unable to find a href link, skipping")
|
|
continue
|
|
}
|
|
|
|
urlString := url.String()
|
|
isTopic := strings.Contains(urlString, "topics")
|
|
if isTopic {
|
|
links = append(links, urlString)
|
|
}
|
|
}
|
|
|
|
return links, nil
|
|
}
|
|
|
|
func (rc *FFXIVClient) GetPage(parser *rod.Browser, url string) *rod.Page {
|
|
page := parser.MustPage(url)
|
|
return page
|
|
}
|
|
|
|
func (fc *FFXIVClient) ExtractThumbnail(page *rod.Page) (string, error) {
|
|
thumbnail := page.MustElementX("/html/body/div[3]/div[2]/div[1]/article/div[1]/img").MustProperty("src").String()
|
|
if thumbnail == "" {
|
|
return "", errors.New("unable to find thumbnail")
|
|
}
|
|
|
|
title := page.MustElement(".news__header > h1:nth-child(2)").MustText()
|
|
log.Println(title)
|
|
|
|
return thumbnail, nil
|
|
}
|
|
|
|
func (fc *FFXIVClient) ExtractPubDate(page *rod.Page) (time.Time, error) {
|
|
stringDate := page.MustElement(".news__ic--topics").MustText()
|
|
if stringDate == "" {
|
|
return time.Now(), errors.New("unable to locate the publish date on the post")
|
|
}
|
|
|
|
PubDate, err := time.Parse(FFXIV_TIME_FORMAT, stringDate)
|
|
if err != nil {
|
|
return time.Now(), err
|
|
}
|
|
|
|
return PubDate, nil
|
|
}
|
|
|
|
func (fc *FFXIVClient) ExtractDescription(page *rod.Page) (string, error) {
|
|
res := page.MustElement(".news__detail__wrapper").MustText()
|
|
if res == "" {
|
|
return "", errors.New("unable to locate the description on the post")
|
|
}
|
|
|
|
return res, nil
|
|
}
|
|
|
|
func (fc *FFXIVClient) ExtractAuthor(page *rod.Page) (string, error) {
|
|
meta := page.MustElements("head > meta")
|
|
for _, item := range meta {
|
|
name, err := item.Property("name")
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
if name.String() != "author" {
|
|
continue
|
|
}
|
|
content, err := item.Property("content")
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
return content.String(), nil
|
|
}
|
|
//log.Println(meta)
|
|
return "", errors.New("unable to find the author on the page")
|
|
}
|
|
|
|
func (fc *FFXIVClient) ExtractTags(page *rod.Page) (string, error) {
|
|
meta := page.MustElements("head > meta")
|
|
for _, item := range meta {
|
|
name, err := item.Property("name")
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
if name.String() != "keywords" {
|
|
continue
|
|
}
|
|
content, err := item.Property("content")
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
return content.String(), nil
|
|
}
|
|
//log.Println(meta)
|
|
return "", errors.New("unable to find the author on the page")
|
|
}
|
|
|
|
func (fc *FFXIVClient) ExtractTitle(page *rod.Page) (string, error) {
|
|
title, err := page.MustElement("head > title").Text()
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
if !strings.Contains(title, "|") {
|
|
return "", errors.New("unable to split the title, missing | in the string")
|
|
}
|
|
|
|
res := strings.Split(title, "|")
|
|
if title != "" {
|
|
return res[0], nil
|
|
}
|
|
|
|
//log.Println(meta)
|
|
return "", errors.New("unable to find the author on the page")
|
|
}
|
|
|
|
func (fc *FFXIVClient) ExtractAuthorImage(page *rod.Page) (string, error) {
|
|
meta := page.MustElements("head > link")
|
|
for _, item := range meta {
|
|
name, err := item.Property("rel")
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
if name.String() != "apple-touch-icon-precomposed" {
|
|
continue
|
|
}
|
|
content, err := item.Property("href")
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
return content.String(), nil
|
|
}
|
|
//log.Println(meta)
|
|
return "", errors.New("unable to find the author image on the page")
|
|
}
|