newsbot-api/services/ffxiv.go

package services

import (
	"errors"
	"log"
	"net/http"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/go-rod/rod"
	"github.com/google/uuid"

	"github.com/jtom38/newsbot/collector/domain/model"
	"github.com/jtom38/newsbot/collector/services/cache"
)

// Lodestone front pages, one per region understood by NewFFXIVClient.
const (
	// FFXIV_NA_FEED_URL is the feed page selected for the "na" region.
	FFXIV_NA_FEED_URL string = "https://na.finalfantasyxiv.com/lodestone/"
	// FFXIV_JP_FEED_URL is the feed page selected for the "jp" region.
	FFXIV_JP_FEED_URL string = "https://jp.finalfantasyxiv.com/lodestone/"
)

// FFXIV_TIME_FORMAT is the time.Parse layout ExtractPubDate applies to the
// publish date text of a post (month/day/year, 12-hour clock, no zero padding
// required).
const FFXIV_TIME_FORMAT string = "1/2/2006 3:4 PM"

// FFXIVClient scrapes news topics from a Final Fantasy XIV Lodestone page.
type FFXIVClient struct {
	// SourceID is copied into every article built by CheckSource.
	SourceID uint
	// Url is the feed page opened by PullFeed and fetched by GetParser.
	Url string
	// Region is the region key that was passed to NewFFXIVClient.
	Region string

	// cacheGroup is the group name CheckSource hands to the cache client
	// when looking up and recording already-seen links.
	cacheGroup string
}

// NewFFXIVClient builds a client for the given Lodestone region.
//
// The recognized regions are "na" and "jp". Any other value still yields a
// client, because the signature has no error result, but its Url is left
// empty; a warning is logged so the misconfiguration does not go unnoticed.
func NewFFXIVClient(region string) FFXIVClient {
	client := FFXIVClient{
		Region:     region,
		cacheGroup: "ffxiv",
	}

	switch region {
	case "na":
		client.Url = FFXIV_NA_FEED_URL
	case "jp":
		client.Url = FFXIV_JP_FEED_URL
	default:
		log.Printf("unknown ffxiv region %q, feed url left empty", region)
	}

	return client
}

// CheckSource scrapes the feed page and returns an article for every topic
// link that is not yet recorded in the client's cache group.
//
// It starts a browser with GetBrowser, gathers topic links with PullFeed,
// skips links the cache already knows, and records each newly collected link
// in the cache. The browser is closed before returning.
//
// The first extraction failure stops the run; the error is returned together
// with the articles collected up to that point.
func (fc *FFXIVClient) CheckSource() ([]model.Articles, error) {
	var articles []model.Articles

	parser := fc.GetBrowser()
	defer parser.Close()

	links, err := fc.PullFeed(parser)
	if err != nil {
		return articles, err
	}

	// Named seen rather than cache so the imported cache package stays visible.
	seen := cache.NewCacheClient(fc.cacheGroup)

	for _, link := range links {
		// A successful lookup means the link was handled on an earlier run.
		if _, err := seen.FindByValue(link); err == nil {
			continue
		}

		article, err := fc.collectArticle(parser, link)
		if err != nil {
			return articles, err
		}
		log.Printf("Collected '%v' from '%v'", article.Title, article.Url)

		seen.Insert(uuid.New().String(), link)

		articles = append(articles, article)
	}

	return articles, nil
}

// collectArticle opens link in its own page, extracts every article field
// from it and closes the page again before returning.
func (fc *FFXIVClient) collectArticle(parser *rod.Browser, link string) (model.Articles, error) {
	page := fc.GetPage(parser, link)
	// Release the page as soon as this article is done instead of keeping
	// one open page per link until the browser shuts down. The close error
	// is ignored on purpose: cleanup is best effort.
	defer page.Close()

	title, err := fc.ExtractTitle(page)
	if err != nil {
		return model.Articles{}, err
	}

	thumb, err := fc.ExtractThumbnail(page)
	if err != nil {
		return model.Articles{}, err
	}

	pubDate, err := fc.ExtractPubDate(page)
	if err != nil {
		return model.Articles{}, err
	}

	description, err := fc.ExtractDescription(page)
	if err != nil {
		return model.Articles{}, err
	}

	authorName, err := fc.ExtractAuthor(page)
	if err != nil {
		return model.Articles{}, err
	}

	authorImage, err := fc.ExtractAuthorImage(page)
	if err != nil {
		return model.Articles{}, err
	}

	tags, err := fc.ExtractTags(page)
	if err != nil {
		return model.Articles{}, err
	}

	return model.Articles{
		SourceID:    fc.SourceID,
		Tags:        tags,
		Title:       title,
		Url:         link,
		PubDate:     pubDate,
		Video:       "",
		VideoHeight: 0,
		VideoWidth:  0,
		Thumbnail:   thumb,
		Description: description,
		AuthorName:  authorName,
		AuthorImage: authorImage,
	}, nil
}

// GetParser fetches fc.Url with a plain HTTP GET and parses the response body
// into a goquery document.
//
// It returns an error when the request fails, when the server answers with a
// status outside the 2xx range, or when the body cannot be parsed.
func (fc *FFXIVClient) GetParser() (*goquery.Document, error) {
	resp, err := http.Get(fc.Url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// Without this check an error page would be parsed and returned as if
	// it were the feed.
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		return nil, errors.New("unexpected status fetching feed: " + resp.Status)
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return nil, err
	}
	return doc, nil
}

// GetBrowser creates a rod browser and connects to it. The caller owns the
// returned browser and is expected to close it.
//
// NOTE(review): MustConnect follows rod's Must convention, so a failed
// connection presumably panics rather than returning an error — confirm.
func (fc *FFXIVClient) GetBrowser() *rod.Browser {
	return rod.New().MustConnect()
}

// PullFeed opens fc.Url in the given browser and returns the href of every
// list entry whose link contains "topics".
//
// The list is located by a fixed XPath. Failing to find the list or its items
// is reported through the error result; an individual item without a usable
// link is logged and skipped. The page is closed before returning.
func (fc *FFXIVClient) PullFeed(parser *rod.Browser) ([]string, error) {
	var links []string

	page := parser.MustPage(fc.Url)
	defer page.Close()

	// Locate the news list by XPath.
	list, err := page.ElementX("/html/body/div[3]/div/div/div[1]/div[2]/div[1]/div[2]/ul")
	if err != nil {
		return nil, err
	}

	// Every entry of the list is an li element.
	items, err := list.Elements("li")
	if err != nil {
		return nil, err
	}

	for _, item := range items {
		// Each entry is expected to wrap an anchor.
		anchor, err := item.Element("a")
		if err != nil {
			log.Println("Unable to find the a item, skipping")
			continue
		}

		href, err := anchor.Property("href")
		if err != nil {
			log.Println("Unable to find a href link, skipping")
			continue
		}

		// Only links that contain "topics" are kept.
		if target := href.String(); strings.Contains(target, "topics") {
			links = append(links, target)
		}
	}

	return links, nil
}

// GetPage opens url in a new page of the given browser and returns that page.
// Closing the page is left to the caller.
func (fc *FFXIVClient) GetPage(parser *rod.Browser, url string) *rod.Page {
	return parser.MustPage(url)
}

// ExtractThumbnail returns the src of the article's header image, which is
// located by a fixed XPath.
//
// It returns an error when the image or its src property cannot be read, or
// when the src is empty.
func (fc *FFXIVClient) ExtractThumbnail(page *rod.Page) (string, error) {
	img, err := page.ElementX("/html/body/div[3]/div[2]/div[1]/article/div[1]/img")
	if err != nil {
		return "", err
	}

	src, err := img.Property("src")
	if err != nil {
		return "", err
	}

	thumbnail := src.String()
	if thumbnail == "" {
		return "", errors.New("unable to find thumbnail")
	}

	return thumbnail, nil
}

// ExtractPubDate reads the text of the ".news__ic--topics" element and parses
// it with FFXIV_TIME_FORMAT.
//
// Surrounding whitespace is trimmed before parsing. On any failure the current
// time is returned alongside the error; callers must check the error before
// trusting the value.
func (fc *FFXIVClient) ExtractPubDate(page *rod.Page) (time.Time, error) {
	el, err := page.Element(".news__ic--topics")
	if err != nil {
		return time.Now(), err
	}

	text, err := el.Text()
	if err != nil {
		return time.Now(), err
	}

	stringDate := strings.TrimSpace(text)
	if stringDate == "" {
		return time.Now(), errors.New("unable to locate the publish date on the post")
	}

	pubDate, err := time.Parse(FFXIV_TIME_FORMAT, stringDate)
	if err != nil {
		return time.Now(), err
	}

	return pubDate, nil
}

// ExtractDescription returns the text of the ".news__detail__wrapper" element.
//
// It returns an error when the element or its text cannot be read, or when
// the text is empty.
func (fc *FFXIVClient) ExtractDescription(page *rod.Page) (string, error) {
	el, err := page.Element(".news__detail__wrapper")
	if err != nil {
		return "", err
	}

	res, err := el.Text()
	if err != nil {
		return "", err
	}
	if res == "" {
		return "", errors.New("unable to locate the description on the post")
	}

	return res, nil
}

// ExtractAuthor returns the content of the first "head > meta" element whose
// name property is "author".
//
// It returns an error when the meta elements or their properties cannot be
// read, or when no such element exists.
func (fc *FFXIVClient) ExtractAuthor(page *rod.Page) (string, error) {
	metas, err := page.Elements("head > meta")
	if err != nil {
		return "", err
	}

	for _, item := range metas {
		name, err := item.Property("name")
		if err != nil {
			return "", err
		}
		if name.String() != "author" {
			continue
		}

		content, err := item.Property("content")
		if err != nil {
			return "", err
		}
		return content.String(), nil
	}

	return "", errors.New("unable to find the author on the page")
}

// ExtractTags returns the content of the first "head > meta" element whose
// name property is "keywords". The content is returned as one unsplit string.
//
// It returns an error when the meta elements or their properties cannot be
// read, or when no such element exists.
func (fc *FFXIVClient) ExtractTags(page *rod.Page) (string, error) {
	metas, err := page.Elements("head > meta")
	if err != nil {
		return "", err
	}

	for _, item := range metas {
		name, err := item.Property("name")
		if err != nil {
			return "", err
		}
		if name.String() != "keywords" {
			continue
		}

		content, err := item.Property("content")
		if err != nil {
			return "", err
		}
		return content.String(), nil
	}

	// The previous message was copied from ExtractAuthor and named the
	// wrong field.
	return "", errors.New("unable to find the tags on the page")
}

// ExtractTitle returns the part of the document title that precedes the first
// "|" character. The prefix is returned as-is, without trimming.
//
// It returns an error when the title element or its text cannot be read, or
// when the title contains no "|".
func (fc *FFXIVClient) ExtractTitle(page *rod.Page) (string, error) {
	el, err := page.Element("head > title")
	if err != nil {
		return "", err
	}

	title, err := el.Text()
	if err != nil {
		return "", err
	}

	sep := strings.Index(title, "|")
	if sep < 0 {
		return "", errors.New("unable to split the title, missing | in the string")
	}

	// A title that contains "|" is never empty, so no further check is needed.
	return title[:sep], nil
}

// ExtractAuthorImage returns the href of the first "head > link" element whose
// rel property is "apple-touch-icon-precomposed".
//
// It returns an error when the link elements or their properties cannot be
// read, or when no such element exists.
func (fc *FFXIVClient) ExtractAuthorImage(page *rod.Page) (string, error) {
	links, err := page.Elements("head > link")
	if err != nil {
		return "", err
	}

	for _, item := range links {
		rel, err := item.Property("rel")
		if err != nil {
			return "", err
		}
		if rel.String() != "apple-touch-icon-precomposed" {
			continue
		}

		href, err := item.Property("href")
		if err != nil {
			return "", err
		}
		return href.String(), nil
	}

	return "", errors.New("unable to find the author image on the page")
}