jidaikobo-shibata/a11yc

View on GitHub
classes/Crawl.php

Summary

Maintainability
A
0 mins
Test Coverage
<?php
/**
 * A11yc\Crawl
 *
 * @package    part of A11yc
 * @author     Jidaikobo Inc.
 * @license    The MIT License (MIT)
 * @copyright  Jidaikobo Inc.
 * @link       http://www.jidaikobo.com
 */
namespace A11yc;

class Crawl
{
    /**
     * is target webpage
     *
     * @param String  $url
     * @return Bool
     */
    public static function isTargetMime($url)
    {
        Guzzle::forge($url);

        if (isset(Guzzle::instance($url)->headers['Content-Type'][0]))
        {
            $mime = Guzzle::instance($url)->headers['Content-Type'][0];
            if (strpos($mime, ';') !== false)
            {
                $mime = substr($mime, 0 ,strrpos($mime, ';'));
            }

            return in_array($mime, Values::targetMimes());
        }
        return false;
    }

    /**
     * is page exist
     *
     * @param String $url
     * @return Bool
     */
    public static function isPageExist($url)
    {
        if (Guzzle::envCheck() === false) return false;
        Guzzle::forge($url);
        return Guzzle::instance($url)->is_exists;
    }

    protected static $target_path = '';
    protected static $target_path_raw = '';
    protected static $real_urls = array();

    /**
     * set target path
     *
     * @param String $target_path
     * @return Void
     */
    public static function setTargetPath($target_path)
    {
        static::$target_path = static::removeFilename($target_path);
        static::$target_path_raw = $target_path;
    }

    /**
     * get target path
     *
     * @param Bool $is_raw
     * @return  string
     */
    public static function getTargetPath($is_raw = false)
    {
        if ($is_raw) return static::$target_path_raw;
        return static::$target_path;
    }

    /**
     * is valid scheme
     *
     * @param String $url
     * @return Bool
     */
    public static function isValidScheme($url)
    {
        $url = Util::urldec($url);

        // scheme
        $scheme = substr($url, 0, strpos($url, ':'));

        // do not contain "mailto" because mailto doesn't return header.
        $schemes = array(
            'http', 'https', 'file', 'ftp', 'gopher', 'news',
            'nntp', 'telnet', 'wais', 'prospero'
        );

        return in_array($scheme, $schemes);
    }

    /**
     * remove_filename
     *
     * @param String $url
     * @return String
     */
    public static function removeFilename($url)
    {
        $url = trim($url);

        // is end with slash?
        $is_not_file = false;
        if (mb_substr($url, -1) == '/')
        {
            $is_not_file = true;
        }

        // is end with file?
        if ( ! $is_not_file)
        {
            $ext = mb_substr($url, strrpos($url, '.') + 1);
            if ($ext && preg_match('/htm|php|pl|cgi/i', $ext))
            {
                $url = dirname($url);
            }
        }

        return Util::urldec($url);
    }

    /**
     * upperPath
     *
     * @param String $str
     * @return String
     */
    public static function upperPath($str)
    {
        if (empty(static::$target_path)) Util::error('set set_target_path()');

        // count ups
        $num = substr_count($str, '../');
        $ret = '';
        for ($n = 1; $n <= $num; $n++)
        {
            $ret = dirname(static::$target_path);
        }

        // is page exists?
        $headers = @get_headers($ret);
        if (
            $headers !== false &&
            (
                strpos($headers[0], ' 20') !== false ||
                strpos($headers[0], ' 30') !== false
            )
        )
        {
            return $ret;
        }

        return static::$target_path;
    }

    /**
     * get host from url
     *
     * @param String $url
     * @return String|Bool
     */
    public static function getHostFromUrl($url)
    {
        // cannot get host
        if (substr($url, 0, 4) != 'http')
        {
            return false;
        }
        // plural /
        else if (substr_count($url, '/') >= 3)
        {
            $hosts = explode('/', $url);
            return $hosts[0].$hosts[1].'//'.$hosts[2];
        }

        // maybe host
        return rtrim($url, '/');
    }

    /**
     * real_url
     * if this function returns false, real url is not exists.
     * therefore if this returns sting (url), then target url returned 200.
     *
     * @param String $url
     * @return String
     */
    public static function real_url($url)
    {
        if (isset(static::$real_urls[$url])) return static::$real_urls[$url];

        Guzzle::forge($url);
        $real_url = Guzzle::instance($url)->real_url;

        if ($real_url)
        {
            static::$real_urls[$url] = $real_url;
        }
        // it maybe guzzle's trouble
        elseif ( ! empty($url))
        {
            static::$real_urls[$url] = $url;
        }

        return isset(static::$real_urls[$url]) ? static::$real_urls[$url] : $url;
    }
}