// Source: meifamily/ptt-alertor (View on GitHub)
// File: ptt/web/crawler.go
// Code Climate summary — Maintainability: C (est. 1 day); Test Coverage: n/a
package web

import (
    "errors"
    "net/http"
    "net/url"
    "time"

    log "github.com/Ptt-Alertor/logrus"

    "github.com/Ptt-Alertor/ptt-alertor/models/article"
    pttHttp "github.com/Ptt-Alertor/ptt-alertor/ptt/http"

    "regexp"

    "strings"

    "strconv"

    "github.com/Ptt-Alertor/ptt-alertor/models/pushsum"
    "golang.org/x/net/html"
)

// pttHostURL is the scheme+host prefix used to build every PTT URL in this package.
const pttHostURL = "https://www.ptt.cc"

// CurrentPage find Board Last Page Number
func CurrentPage(board string) (int, error) {
    url := makeBoardURL(board, -1)
    htmlNodes, err := fetchHTML(url)
    if err != nil {
        return 0, err
    }
    paging := findNodes(htmlNodes, findPagingBlock)
    for _, page := range paging {
        anchors := findNodes(page, findAnchor)
        for _, a := range anchors {
            if strings.Contains(a.FirstChild.Data, "上頁") {
                link := getAnchorLink(a)
                if link == "" {
                    return 1, nil
                }
                re := regexp.MustCompile("\\d+")
                page, err := strconv.Atoi(re.FindString(link))
                if err != nil {
                    return 0, err
                }
                return page + 1, nil
            }
        }
    }
    return 0, errors.New("Parse Current Page Error")
}

// FetchArticles makes board's index articles to a article slice
func FetchArticles(board string, page int) (articles article.Articles, err error) {
    reqURL := makeBoardURL(board, page)
    htmlNodes, err := fetchHTML(reqURL)
    if err != nil {
        return nil, err
    }
    articleBlocks := findNodes(htmlNodes, findArticleBlocks)
    for _, articleBlock := range articleBlocks {
        article := article.Article{}
        for _, pushCountDiv := range findNodes(articleBlock, findPushCountDiv) {
            if child := pushCountDiv.FirstChild; child != nil {
                if child := child.FirstChild; child != nil {
                    if err == nil {
                        article.PushSum = pushsum.ConvertPushCount(child.Data)
                    }
                }
            }
        }
        for _, titleDiv := range findNodes(articleBlock, findTitleDiv) {

            anchors := findNodes(titleDiv, findAnchor)

            if len(anchors) == 0 {
                article.Title = titleDiv.FirstChild.Data
                article.Link = ""
                continue
            }

            for _, anchor := range findNodes(titleDiv, findAnchor) {
                article.Title = anchor.FirstChild.Data
                link := pttHostURL + getAnchorLink(anchor)
                article.Link = link
                article.ID = article.ParseID(link)
            }
        }
        for _, metaDiv := range findNodes(articleBlock, findMetaDiv) {
            for _, date := range findNodes(metaDiv, findDateDiv) {
                article.Date = strings.TrimSpace(date.FirstChild.Data)
            }
            for _, author := range findNodes(metaDiv, findAuthorDiv) {
                article.Author = author.FirstChild.Data
            }
        }
        articles = append(articles, article)
        if isLastArticleBlock(articleBlock) {
            break
        }
    }
    return articles, nil
}

// isLastArticleBlock reports whether articleBlock is the last regular
// article row, i.e. whether the next element sibling carries the
// "r-list-sep" attribute value that PTT uses to separate the article
// list from pinned posts.
func isLastArticleBlock(articleBlock *html.Node) bool {
    for next := articleBlock.NextSibling; next != nil; next = next.NextSibling {
        if next.Type != html.ElementNode {
            continue
        }
        // Scan ALL attributes; the original returned false after looking
        // at only the first one, missing "r-list-sep" in later positions.
        for _, attr := range next.Attr {
            if attr.Val == "r-list-sep" {
                return true
            }
        }
        if len(next.Attr) > 0 {
            // First attributed element sibling decides, as before.
            return false
        }
    }
    return false
}

// FetchArticle build article object from html
func FetchArticle(board, articleCode string) (article.Article, error) {
    reqURL := makeArticleURL(board, articleCode)
    htmlNodes, err := fetchHTML(reqURL)
    if err != nil {
        return article.Article{}, err
    }
    atcl := article.Article{
        Link:  reqURL,
        Code:  articleCode,
        Board: board,
    }
    nodes := findNodes(htmlNodes, findOgTitleMeta)
    if len(nodes) > 0 {
        atcl.Title = getMetaContent(nodes[0])
    } else {
        atcl.Title = "[內文標題已被刪除]"
    }
    atcl.ID = atcl.ParseID(reqURL)
    pushBlocks := findNodes(htmlNodes, findPushBlocks)
    pushes := []article.Comment{}

PushBlocks:
    for _, pushBlock := range pushBlocks {
        push := article.Comment{}
        for _, pushIPDateTime := range findNodes(pushBlock, findPushIPDateTime) {
            ipdatetime := strings.TrimSpace(pushIPDateTime.FirstChild.Data)
            if ipdatetime == "" {
                continue PushBlocks
            }
            dateTime, err := parseDateTime(ipdatetime)
            if err != nil {
                log.WithFields(log.Fields{
                    "ipdatetime": ipdatetime,
                    "board":      board,
                    "code":       articleCode,
                }).WithError(err).Warn("Parse DateTime Error")
                continue PushBlocks
            }
            push.DateTime = dateTime
        }
        for _, pushTag := range findNodes(pushBlock, findPushTag) {
            push.Tag = pushTag.FirstChild.Data
        }
        for _, pushUserID := range findNodes(pushBlock, findPushUserID) {
            push.UserID = pushUserID.FirstChild.Data
        }
        for _, pushContent := range findNodes(pushBlock, findPushContent) {
            content := pushContent.FirstChild.Data
            for n := pushContent.FirstChild.NextSibling; n != nil; n = n.NextSibling {
                if findEmailProtected(n) != nil {
                    break
                }
                if n.FirstChild != nil {
                    content += n.FirstChild.Data
                }
                if n.NextSibling != nil {
                    content += n.NextSibling.Data
                }
            }
            push.Content = content
        }
        pushes = append(pushes, push)
    }
    if len(pushes) > 0 {
        atcl.LastPushDateTime = pushes[len(pushes)-1].DateTime
    }
    atcl.Comments = pushes
    return atcl, nil
}

// parseDateTime parses a push's "ip MM/DD hh:mm" (or bare "MM/DD hh:mm")
// string into a time.Time in Taiwan time (UTC+8). Because PTT omits the
// year, the year is inferred by getYear: the most recent year that does
// not place the timestamp in the future.
func parseDateTime(ipdatetime string) (time.Time, error) {
    // Group 1 optionally matches an IPv4 prefix; group 2 is the
    // date-time remainder. MustCompile replaces the original
    // Compile-with-ignored-error: the pattern is a constant, so a
    // compile failure is a programmer bug.
    re := regexp.MustCompile(`(\d+\.\d+\.\d+\.\d+)?\s*(.*)`)
    subMatches := re.FindStringSubmatch(ipdatetime)
    dateTime := strings.TrimSpace(subMatches[len(subMatches)-1])
    loc := time.FixedZone("CST", 8*60*60)
    // Reference layout "01/02 15:04" parses into year 0.
    t, err := time.ParseInLocation("01/02 15:04", dateTime, loc)
    if err != nil {
        return time.Time{}, err
    }
    // Shift the year-0 timestamp into the inferred calendar year.
    t = t.AddDate(getYear(t), 0, 0)
    return t, nil
}

// getYear returns the year offset to add to a year-0 commentTime:
// the current year, unless that would place the comment in the
// future, in which case the previous year (e.g. a December push seen
// in January).
func getYear(commentTime time.Time) int {
    t := time.Now()
    commentTime = commentTime.AddDate(t.Year(), 0, 0)
    if commentTime.After(t) {
        return t.Year() - 1
    }
    return t.Year()
}

// CheckBoardExist use for checking board exist or not
func CheckBoardExist(board string) bool {
    return checkURLExist(makeBoardURL(board, -1))
}

// CheckArticleExist user for checking article exist or not
func CheckArticleExist(board, articleCode string) bool {
    return checkURLExist(makeArticleURL(board, articleCode))
}

// checkURLExist reports whether a GET of url answers HTTP 200 OK.
// Any transport error counts as "does not exist". Note: this uses the
// default http client (no redirect interception), unlike fetchHTML.
func checkURLExist(url string) bool {
    resp, err := http.Get(url)
    if err != nil {
        return false
    }
    defer resp.Body.Close()
    // Direct boolean return replaces the if-true/return-false pattern.
    return resp.StatusCode == http.StatusOK
}

// makeBoardURL builds the URL of a board index page. A negative page
// selects the board's latest index ("index.html"); otherwise the page
// number is embedded ("index<page>.html").
func makeBoardURL(board string, page int) string {
    base := pttHostURL + "/bbs/" + board + "/index"
    if page >= 0 {
        return base + strconv.Itoa(page) + ".html"
    }
    return base + ".html"
}

// makeArticleURL builds the canonical URL of a single article page.
func makeArticleURL(board, articleCode string) string {
    return strings.Join([]string{pttHostURL, "bbs", board, articleCode + ".html"}, "/")
}

// URLNotFoundError is an error type present 404 Not Found.
type URLNotFoundError struct {
    URL string // the URL that answered 404
}

// Error includes the offending URL so log lines identify which fetch
// 404'd (the original stored URL but never surfaced it).
func (u URLNotFoundError) Error() string {
    return "Fetched URL Not Found: " + u.URL
}

// errRedirect is the sentinel returned by client's CheckRedirect hook.
// fetchHTML looks for it (wrapped in *url.Error) to detect PTT's
// over-18 interstitial redirect and retry with the over18 cookie.
var errRedirect = errors.New("Redirect")

// client is the shared HTTP client for all crawling. It never follows
// redirects (CheckRedirect always errors with errRedirect) and caps
// each request at 30 seconds.
var client = &http.Client{
    CheckRedirect: func(req *http.Request, via []*http.Request) error {
        return errRedirect
    },
    Timeout: 30 * time.Second,
}

// fetchHTML fetches reqURL and parses the response body into an HTML
// node tree. PTT answers R18 boards with a redirect to an age-check
// page; that case (surfaced as errRedirect via the client's
// CheckRedirect hook) is retried once with the over18 cookie. A 404
// response yields URLNotFoundError.
func fetchHTML(reqURL string) (doc *html.Node, err error) {
    req, err := pttHttp.HttpRequest(reqURL)
    if err != nil {
        return nil, err
    }
    resp, err := client.Do(req)
    if err != nil && resp == nil {
        // Transport-level failure: no response to inspect or close.
        log.WithField("url", reqURL).WithError(err).Error("Fetch URL Failed")
        return nil, err
    }

    if uerr, ok := err.(*url.Error); ok && uerr.Err == errRedirect {
        // Redirected to the over-18 interstitial: retry with the cookie.
        // The redirect response's body was already closed by net/http.
        resp, err = client.Do(passR18(reqURL))
        if err != nil {
            return nil, err
        }
    }
    // Defer AFTER the retry so the response actually being parsed gets
    // closed. The original deferred before reassigning resp, which
    // evaluated the receiver early and leaked the retried body.
    defer resp.Body.Close()

    if resp.StatusCode == http.StatusNotFound {
        return nil, URLNotFoundError{reqURL}
    }

    doc, err = html.Parse(resp.Body)
    if err != nil {
        log.WithError(err).Error("Crawler Fetch HTML Failed")
        return nil, err
    }
    return doc, nil
}

// passR18 builds a GET request for reqURL carrying PTT's "over18"
// age-confirmation cookie, so R18-gated boards can be fetched without
// passing through the interstitial page.
func passR18(reqURL string) *http.Request {
    // reqURL is always built internally from pttHostURL, so the
    // NewRequest error is not reachable in practice.
    req, _ := http.NewRequest(http.MethodGet, reqURL, nil)
    req.AddCookie(&http.Cookie{
        Name:       "over18",
        Value:      "1",
        Domain:     "www.ptt.cc",
        Path:       "/",
        RawExpires: "Session",
        MaxAge:     0,
        HttpOnly:   false,
    })
    return req
}