package services import ( "database/sql" "errors" "fmt" "log" "net/http" //"strconv" "github.com/PuerkitoBio/goquery" "github.com/go-rod/rod" "github.com/mmcdole/gofeed" "github.com/jtom38/newsbot/collector/database" ) type YoutubeClient struct { record database.Source // internal variables at time of collection channelID string avatarUri string // config //debug bool // cache config cacheGroup string } var ( // This is a local slice to store what URI's have been seen to remove extra calls to the DB YoutubeUriCache []*string ErrYoutubeChannelIdMissing = errors.New("unable to find the channelId on the requested page") ) const YOUTUBE_FEED_URL string = "https://www.youtube.com/feeds/videos.xml?channel_id=" func NewYoutubeClient(Record database.Source) YoutubeClient { yc := YoutubeClient{ record: Record, cacheGroup: "youtube", } /* cc := NewConfigClient() debug, err := strconv.ParseBool(cc.GetConfig(YOUTUBE_DEBUG)) if err != nil { panic("'YOUTUBE_DEBUG' was not a bool value")} yc.Config.Debug = debug */ return yc } // CheckSource will go and run all the commands needed to process a source. func (yc *YoutubeClient) GetContent() ([]database.Article, error) { var items []database.Article docParser, err := yc.GetParser(yc.record.Url) if err != nil { return items, err } // Check cache/db for existing value // If we have the value, skip //channelId, err := yc.extractChannelId() channelId, err := yc.GetChannelId(docParser) if err != nil { return items, err } if channelId == "" { return items, ErrYoutubeChannelIdMissing } yc.channelID = channelId // Check the cache/db forthe value. // if we have the value, skip avatar, err := yc.GetAvatarUri() if err != nil { return items, err } if avatar == "" { return items, ErrMissingAuthorImage } yc.avatarUri = avatar feed, err := yc.PullFeed() if err != nil { return items, err } newPosts, err := yc.CheckForNewPosts(feed) if err != nil { return items, err } for _, item := range newPosts { article := yc.ConvertToArticle(item) items = append(items, article) YoutubeUriCache = append(YoutubeUriCache, &item.Link) // Add the post to local cache log.Println(article) } return items, nil } func (yc *YoutubeClient) GetBrowser() *rod.Browser { browser := rod.New().MustConnect() return browser } func (yc *YoutubeClient) GetPage(parser *rod.Browser, url string) *rod.Page { page := parser.MustPage(url) return page } func (yc *YoutubeClient) GetParser(uri string) (*goquery.Document, error) { html, err := http.Get(uri) if err != nil { log.Println(err) } defer html.Body.Close() doc, err := goquery.NewDocumentFromReader(html.Body) if err != nil { return nil, err } return doc, nil } // This pulls the youtube page and finds the ChannelID. // This value is required to generate the RSS feed URI func (yc *YoutubeClient) GetChannelId(doc *goquery.Document) (string, error) { meta := doc.Find("meta") for _, item := range meta.Nodes { if item.Attr[0].Val == "channelId" { yc.channelID = item.Attr[1].Val return yc.channelID, nil } } return "", ErrYoutubeChannelIdMissing } // This pulls the youtube page and finds the ChannelID. // This value is required to generate the RSS feed URI //func (yc *YoutubeClient) extractChannelId(page *rod.Page) (string, error) { //} // This will parse the page to find the current Avatar of the channel. func (yc *YoutubeClient) GetAvatarUri() (string, error) { var AvatarUri string browser := rod.New().MustConnect() page := browser.MustPage(yc.record.Url) res := page.MustElement("#channel-header-container > yt-img-shadow:nth-child(1) > img:nth-child(1)").MustAttribute("src") if *res == "" || res == nil { return AvatarUri, ErrMissingAuthorImage } AvatarUri = *res defer browser.Close() defer page.Close() return AvatarUri, nil } // This will parse and look for the tags that has been defined by the user. func (yc *YoutubeClient) GetTags(parser *goquery.Document) (string, error) { meta := parser.Find("meta") for _, item := range meta.Nodes { if item.Attr[0].Val == "keywords" { res := item.Attr[1].Val return res, nil } } return "", ErrMissingTags } func (yc *YoutubeClient) GetVideoThumbnail(parser *goquery.Document) (string, error) { meta := parser.Find("meta") for _, item := range meta.Nodes { if item.Attr[0].Val == "og:image" { res := item.Attr[1].Val return res, nil } } return "", ErrMissingThumbnail } // This will pull the RSS feed items and return the results func (yc *YoutubeClient) PullFeed() (*gofeed.Feed, error) { feedUri := fmt.Sprintf("%v%v", YOUTUBE_FEED_URL, yc.channelID) fp := gofeed.NewParser() feed, err := fp.ParseURL(feedUri) if err != nil { return nil, err } return feed, nil } // CheckForNewPosts will talk to the Database to see if it has a record for the posts that have been extracted. // If the post does not exist, it will be added. func (yc *YoutubeClient) CheckForNewPosts(feed *gofeed.Feed) ([]*gofeed.Item, error) { var newPosts []*gofeed.Item for _, item := range feed.Items { // Check the cache/db to see if this URI has been seen already uriExists := yc.CheckUriCache(&item.Link) if uriExists { continue } //TODO Check the DB if the cache is not aware //TODO If the db knew about it, append it to the local cache // if its new, append it. newPosts = append(newPosts, item) } return newPosts, nil } func (yc *YoutubeClient) CheckUriCache(uri *string) bool { for _, item := range YoutubeUriCache { if item == uri { return true } } return false } func (yc *YoutubeClient) ConvertToArticle(item *gofeed.Item) database.Article { parser, err := yc.GetParser(item.Link) if err != nil { log.Printf("Unable to process %v, submit this link as an issue.\n", item.Link) } tags, err := yc.GetTags(parser) if err != nil { msg := fmt.Sprintf("%v. %v", err, item.Link) log.Println(msg) } thumb, err := yc.GetVideoThumbnail(parser) if err != nil { msg := fmt.Sprintf("%v. %v", err, item.Link) log.Println(msg) } var article = database.Article{ Sourceid: yc.record.ID, Tags: tags, Title: item.Title, Url: item.Link, Pubdate: *item.PublishedParsed, Thumbnail: thumb, Description: item.Description, Authorname: sql.NullString{String: item.Author.Name}, Authorimage: sql.NullString{String: yc.avatarUri}, } return article }