VIPnytt/RobotsTxtParser

View on GitHub
src/RobotsTxtInterface.php

Summary

Maintainability
A
0 mins
Test Coverage
F
0%
<?php
/**
 * vipnytt/RobotsTxtParser
 *
 * @link https://github.com/VIPnytt/RobotsTxtParser
 * @license https://github.com/VIPnytt/RobotsTxtParser/blob/master/LICENSE The MIT License (MIT)
 */

namespace vipnytt\RobotsTxtParser;

/**
 * Interface RobotsTxtInterface
 *
 * Central collection of the constants shared across the robots.txt parser:
 * fetch/parse limits, the default user-agent, the canonical (lower-case)
 * directive names, and a table of tolerated alternative spellings.
 *
 * @package vipnytt\RobotsTxtParser
 */
interface RobotsTxtInterface
{
    /**
     * URL path at which the robots.txt file is located
     *
     * @link https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt#file-location--range-of-validity
     * @link https://tools.ietf.org/html/rfc3986
     * @link https://tools.ietf.org/html/rfc1808
     */
    const PATH = '/robots.txt';

    /**
     * Cache time
     *
     * Evaluates to 86400; presumably seconds, i.e. 24 hours (unit to be confirmed against the consumers).
     *
     * @link https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt#handling-http-result-codes
     */
    const CACHE_TIME = 24 * 60 * 60;

    /**
     * Maximum number of redirects
     *
     * @link https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt#handling-http-result-codes
     * @link https://tools.ietf.org/html/rfc1945
     */
    const MAX_REDIRECTS = 5;

    /**
     * Character encoding a robots.txt file is expected to use
     *
     * @link https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt#file-format
     * @link https://tools.ietf.org/html/rfc3986
     */
    const ENCODING = 'UTF-8';

    /**
     * Maximum robots.txt length, in bytes
     *
     * Evaluates to 524288 bytes (512 kilobytes / 0.5 megabytes).
     *
     * @link https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt#file-format
     * @link https://yandex.com/support/webmaster/controlling-robot/robots-txt.xml#additional-info
     */
    const BYTE_LIMIT = 512 * 1024;

    /**
     * Maximum length of a single rule
     *
     * The unit (bytes or characters) is not stated here; verify against the code that enforces it.
     *
     * @link https://yandex.com/support/webmaster/controlling-robot/robots-txt.xml#clean-param
     */
    const MAX_LENGTH_RULE = 500;

    /**
     * Default User-Agent (the wildcard group)
     *
     * @link https://yandex.com/support/webmaster/controlling-robot/robots-txt.xml#user-agent
     */
    const USER_AGENT = '*';

    /**
     * Directive name: Allow
     *
     * @link https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt#allow
     * @link https://yandex.com/support/webmaster/controlling-robot/robots-txt.xml#allow-disallow
     * @link http://www.conman.org/people/spc/robots2.html#format.directives.allow
     * @link http://www.robotstxt.org/norobots-rfc.txt
     */
    const DIRECTIVE_ALLOW = 'allow';

    /**
     * Directive name: Cache-delay
     *
     * Unofficial directive.
     * Used as a crawl-delay alternative specifically for caching purposes.
     */
    const DIRECTIVE_CACHE_DELAY = 'cache-delay';

    /**
     * Directive name: Clean-param
     *
     * @link https://yandex.com/support/webmaster/controlling-robot/robots-txt.xml#clean-param
     */
    const DIRECTIVE_CLEAN_PARAM = 'clean-param';

    /**
     * Directive name: Comment
     *
     * @link http://www.conman.org/people/spc/robots2.html#format.directives.comment
     */
    const DIRECTIVE_COMMENT = 'comment';

    /**
     * Directive name: Crawl-delay
     *
     * @link https://yandex.com/support/webmaster/controlling-robot/robots-txt.xml#crawl-delay
     */
    const DIRECTIVE_CRAWL_DELAY = 'crawl-delay';

    /**
     * Directive name: Disallow
     *
     * @link https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt#disallow
     * @link https://yandex.com/support/webmaster/controlling-robot/robots-txt.xml#allow-disallow
     * @link https://www.w3.org/TR/html4/appendix/notes.html#h-B.4.1.1
     * @link http://www.conman.org/people/spc/robots2.html#format.directives.disallow
     * @link http://www.robotstxt.org/norobots-rfc.txt
     * @link http://www.robotstxt.org/orig.html
     */
    const DIRECTIVE_DISALLOW = 'disallow';

    /**
     * Directive name: Host
     *
     * @link https://yandex.com/support/webmaster/controlling-robot/robots-txt.xml#host
     * @link https://tools.ietf.org/html/rfc952
     */
    const DIRECTIVE_HOST = 'host';

    /**
     * Directive name: NoIndex
     */
    const DIRECTIVE_NO_INDEX = 'noindex';

    /**
     * Directive name: Request-rate
     *
     * @link http://www.conman.org/people/spc/robots2.html#format.directives.request-rate
     */
    const DIRECTIVE_REQUEST_RATE = 'request-rate';

    /**
     * Directive name: Robot-version
     *
     * @link http://www.conman.org/people/spc/robots2.html#format.directives.robot-version
     */
    const DIRECTIVE_ROBOT_VERSION = 'robot-version';

    /**
     * Directive name: Sitemap
     *
     * @link https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt#sitemap
     * @link https://yandex.com/support/webmaster/controlling-robot/robots-txt.xml#sitemap
     * @link http://www.sitemaps.org/protocol.html#submit_robots
     */
    const DIRECTIVE_SITEMAP = 'sitemap';

    /**
     * Directive name: User-Agent
     *
     * @link https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt#order-of-precedence-for-user-agents
     * @link https://yandex.com/support/webmaster/controlling-robot/robots-txt.xml#user-agent
     * @link https://www.w3.org/TR/html4/appendix/notes.html#h-B.4.1.1
     * @link http://www.conman.org/people/spc/robots2.html#format.directives.user-agent
     * @link http://www.robotstxt.org/norobots-rfc.txt
     * @link http://www.robotstxt.org/orig.html
     */
    const DIRECTIVE_USER_AGENT = 'user-agent';

    /**
     * Directive name: Visit-time
     *
     * @link http://www.conman.org/people/spc/robots2.html#format.directives.visit-time
     */
    const DIRECTIVE_VISIT_TIME = 'visit-time';

    /**
     * Directive aliases
     *
     * Alternative spellings (simple errors / typos) of directive names.
     * Each key is the alias as written; each value is the canonical directive it stands for.
     *
     * @link https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt#file-format
     */
    const ALIAS_DIRECTIVES = [
        'cachedelay'   => self::DIRECTIVE_CACHE_DELAY,
        'cleanparam'   => self::DIRECTIVE_CLEAN_PARAM,
        'crawldelay'   => self::DIRECTIVE_CRAWL_DELAY,
        'no-index'     => self::DIRECTIVE_NO_INDEX,
        'requestrate'  => self::DIRECTIVE_REQUEST_RATE,
        'robotversion' => self::DIRECTIVE_ROBOT_VERSION,
        'useragent'    => self::DIRECTIVE_USER_AGENT,
        'visittime'    => self::DIRECTIVE_VISIT_TIME,
    ];
}