ReanGD/go-web-search

View on GitHub
crawler/hosts_manager.go

Summary

Maintainability
A
0 mins
Test Coverage
package crawler

// status: ok
import (
    "database/sql"
    "io/ioutil"
    "net/http"
    "net/url"

    "github.com/ReanGD/go-web-search/proxy"
    "github.com/ReanGD/go-web-search/werrors"
    "github.com/temoto/robotstxt-go"
    "github.com/uber-go/zap"
)

// hostsManager keeps the set of hosts known to the crawler together with the
// robots.txt rules that apply to each of them.
type hostsManager struct {
    // robotsTxt maps a host id to the robots.txt group selected for that host
    // (the code in this file always selects the "Googlebot" group).
    robotsTxt map[int64]*robotstxt.Group
    // hosts maps a host name to its host id.
    hosts     map[string]int64
}

// ResolveHost looks up the id registered for hostName.
//
// The name is passed through NormalizeHostName before the lookup. The result
// is Valid only when the host is known; otherwise Int64 is zero.
func (m *hostsManager) ResolveHost(hostName string) sql.NullInt64 {
    var resolved sql.NullInt64
    resolved.Int64, resolved.Valid = m.hosts[NormalizeHostName(hostName)]
    return resolved
}

// CheckURL reports whether u may be fetched according to the robots.txt rules
// stored for its host.
//
// The first result is the host id as returned by ResolveHost. When it is not
// Valid the host is unknown to the manager and the second result is false.
// Otherwise the second result is the verdict of the host's robots.txt group
// for the URL reduced to its path and query.
func (m *hostsManager) CheckURL(u *url.URL) (sql.NullInt64, bool) {
    hostID := m.ResolveHost(u.Host)
    if !hostID.Valid {
        return hostID, false
    }

    // Work on a copy so the caller's URL is left untouched.
    target := *u
    target.Scheme = ""
    target.Host = ""
    // Userinfo would make String() emit "//user@/path", and a fragment is
    // never part of the request sent to the server; neither belongs in the
    // string matched against robots.txt rules.
    target.User = nil
    target.Fragment = ""
    return hostID, m.robotsTxt[hostID.Int64].Test(target.String())
}

// GetHosts returns the manager's host name to host id map.
//
// The returned map is the manager's own map, not a copy.
func (m *hostsManager) GetHosts() map[string]int64 {
    known := m.hosts
    return known
}

// initByDb fills the manager from the hosts already stored in db.
//
// For every stored host the saved robots.txt data is parsed and its
// "Googlebot" group is registered under the host id. A failure to load the
// hosts is returned as is; a robots.txt parse failure is reported as
// ErrCreateRobotsTxtFromDb with the host name and the parser's message.
func (m *hostsManager) initByDb(db proxy.DbHost) error {
    stored, err := db.GetHosts()
    if err != nil {
        return err
    }

    for hostID, record := range stored {
        name := record.GetName()
        rules, parseErr := robotstxt.FromStatusAndBytes(record.GetRobotsTxt())
        if parseErr != nil {
            return werrors.NewFields(ErrCreateRobotsTxtFromDb,
                zap.String("host", name),
                zap.String("details", parseErr.Error()))
        }
        m.robotsTxt[hostID] = rules.FindGroup("Googlebot")
        m.hosts[name] = hostID
    }

    return nil
}

// resolveURL requests the root page of hostName over http and returns the URL
// of the final request that produced the response.
//
// A response with a status other than 200 yields ErrResolveBaseURL. A failed
// request, or a failure to close the response body, yields ErrGetRequest.
func (m *hostsManager) resolveURL(hostName string) (string, error) {
    target := NormalizeURL(&url.URL{Scheme: "http", Host: hostName})

    // requestFailed builds the error shared by the transport and close paths.
    requestFailed := func(cause error) error {
        return werrors.NewFields(ErrGetRequest,
            zap.String("details", cause.Error()),
            zap.String("url", target))
    }

    resp, err := http.Get(target)
    if err != nil {
        return "", requestFailed(err)
    }

    // The body is not needed; close it right away but report an unexpected
    // status before a close failure.
    closeErr := resp.Body.Close()
    if resp.StatusCode != http.StatusOK {
        return "", werrors.NewFields(ErrResolveBaseURL,
            zap.Int("status_code", resp.StatusCode),
            zap.String("url", target))
    }
    if closeErr != nil {
        return "", requestFailed(closeErr)
    }

    return resp.Request.URL.String(), nil
}

// readRobotTxt downloads robots.txt of hostName over http.
//
// On success it returns the response status code and the full body. When the
// request, the read or the close fails it returns status 0, whatever part of
// the body was read, and an ErrGetRequest error carrying the failure text and
// the requested URL.
func (m *hostsManager) readRobotTxt(hostName string) (int, []byte, error) {
    target := NormalizeURL(&url.URL{Scheme: "http", Host: hostName, Path: "robots.txt"})

    // requestFailed builds the error shared by all failure paths.
    requestFailed := func(cause error) error {
        return werrors.NewFields(ErrGetRequest,
            zap.String("details", cause.Error()),
            zap.String("url", target))
    }

    resp, err := http.Get(target)
    if err != nil {
        return 0, nil, requestFailed(err)
    }

    content, err := ioutil.ReadAll(resp.Body)
    // Always close the body; a read error takes precedence over a close error.
    if closeErr := resp.Body.Close(); err == nil {
        err = closeErr
    }
    if err != nil {
        return 0, content, requestFailed(err)
    }

    return resp.StatusCode, content, nil
}

// initByHostName registers a host that is not yet stored in db.
//
// It resolves the host's base URL, downloads and parses its robots.txt,
// stores the host through db.AddHost and, once that succeeds, records the
// new host id and the "Googlebot" robots.txt group in the manager. The first
// failing step aborts the registration and its error is returned; a parse
// failure is wrapped as ErrCreateRobotsTxtFromURL.
func (m *hostsManager) initByHostName(db proxy.DbHost, hostName string) error {
    baseURL, err := m.resolveURL(hostName)
    if err != nil {
        return err
    }

    status, content, err := m.readRobotTxt(hostName)
    if err != nil {
        return err
    }

    rules, err := robotstxt.FromStatusAndBytes(status, content)
    if err != nil {
        return werrors.NewDetails(ErrCreateRobotsTxtFromURL, err)
    }

    hostID, err := db.AddHost(proxy.NewHost(hostName, status, content), baseURL)
    if err != nil {
        return err
    }

    // Only touch the in-memory state after the host has been persisted.
    m.hosts[hostName] = hostID
    m.robotsTxt[hostID] = rules.FindGroup("Googlebot")
    return nil
}

// Init prepares the manager for use.
//
// It resets both internal maps, loads every host already stored in db and
// then registers each entry of baseHosts that is not known yet. Names in
// baseHosts are normalized with NormalizeHostName first. The first error
// stops initialization; errors from registering a base host are annotated
// with the host name.
func (m *hostsManager) Init(db proxy.DbHost, baseHosts []string) error {
    m.hosts = make(map[string]int64, len(baseHosts))
    m.robotsTxt = make(map[int64]*robotstxt.Group, len(baseHosts))

    if err := m.initByDb(db); err != nil {
        return err
    }

    for _, raw := range baseHosts {
        name := NormalizeHostName(raw)
        if _, known := m.hosts[name]; known {
            continue
        }
        if err := m.initByHostName(db, name); err != nil {
            return werrors.AddFields(err, zap.String("host", name))
        }
    }

    return nil
}