spotweb/spotweb

View on GitHub
lib/services/Providers/Services_Providers_HttpNzb.php

Summary

Maintainability
D
2 days
Test Coverage
<?php
/*
 * FIXME
 * XXX
 * TODO
 *
 * Need to use the standard Services_Providers_Http class
 *
 */
/**
 * This class is used to find alternate download URLs for NZB files.
 */
class Services_Providers_HttpNzb
{
    // Spot record handed to the constructor; its 'website' and 'description'
    // keys are searched for alternate download urls.
    protected $spot = null;
    // Resolved alternate download url: null until a lookup ran, false when
    // resolveUrl() reported a failed request.
    protected $alternateDownloadUrl = null;
    // Retrieved nzb xml: null = nothing attempted yet, false = an earlier
    // attempt found no usable nzb, otherwise the xml string.
    protected $nzb = null;
    // Dao_Cache instance passed to every Services_Providers_Http created here.
    protected $_cacheDao = null;

    /**
     * Remembers the spot to inspect and the cache DAO used for http requests.
     *
     * @param array     $spot     spot record; the 'website' and 'description' keys are read later on
     * @param Dao_Cache $cacheDao cache DAO handed to each Services_Providers_Http instance
     */
    public function __construct(array $spot, Dao_Cache $cacheDao)
    {
        $this->_cacheDao = $cacheDao;
        $this->spot = $spot;
    }

    /**
     * Tells whether an nzb can be retrieved for this spot.
     *
     * @return bool true when getNzb() yields a non-empty result, false otherwise
     */
    public function hasNzb()
    {
        // getNzb() returns the xml string on success and a falsy value otherwise.
        return (bool) $this->getNzb();
    }

    /**
     * Returns the nzb file for this spot as an xml string.
     *
     * The outcome is remembered in $this->nzb: the xml string once an nzb was
     * found, or false when the lookup produced nothing usable, in which case
     * later calls return null without a new request. A failed http request is
     * NOT remembered, so a later call tries again.
     *
     * @return string|null the nzb xml, or null when no nzb could be retrieved
     */
    public function getNzb()
    {
        if ($this->nzb) {
            // An nzb was already found earlier, hand out the stored xml.
            return $this->nzb;
        }

        if ($this->nzb === false) {
            // An earlier attempt already established there is nothing to get.
            return null;
        }

        // Get the alternate url; without one there is nothing to download.
        $url = $this->getUrlForSpot();
        if (!$url) {
            SpotDebug::msg(SpotDebug::DEBUG, __CLASS__.'->getnzb(), failed to find url for spot');

            $this->nzb = false;

            return null;
        }

        SpotDebug::msg(SpotDebug::DEBUG, __CLASS__.'->getnzb(), url: '.$url);

        $svcHttp = new Services_Providers_Http($this->_cacheDao);
        $result = $svcHttp->perform($url);

        if (!$result['successful']) {
            // Http error: report it, but leave $this->nzb untouched so a retry stays possible.
            trigger_error($result['errorstr']);

            return null;
        }

        // A body that parses as xml is taken to be the nzb itself.
        // Errors are suppressed because the parse is only used as a well-formedness test.
        $body = $result['data'];
        if (@simplexml_load_string($body)) {
            SpotDebug::msg(SpotDebug::DEBUG, __CLASS__.'->getnzb(), found nzb');

            $this->nzb = $body;

            return $this->nzb;
        }

        if (!$body) {
            // Empty response, nothing left to parse.
            $this->nzb = false;

            return null;
        }

        SpotDebug::msg(SpotDebug::DEBUG, __CLASS__.'->getnzb(), found body but not a valid XML');

        // The url was not a direct link to an nzb file; let the site specific
        // handlers try to locate the real download.
        $this->nzb = $this->downloadNzbFrom($url, $body);

        // downloadNzbFrom() signals failure with false. Translate that to null so
        // the first call returns the same value as every later call (see the
        // "$this->nzb === false" guard above) and matches the documented contract.
        return $this->nzb ? $this->nzb : null;
    }

    /**
     * Returns the alternate download url of the spot, when one could be resolved.
     *
     * When no url is available the nzb is marked as not retrievable
     * ($this->nzb becomes false).
     *
     * @return string|null the resolved url, or null when there is none
     */
    public function getUrlForSpot()
    {
        if ($this->hasUrlForSpot()) {
            return $this->alternateDownloadUrl;
        }

        $this->nzb = false;

        return null;
    }

    /**
     * Checks whether the spot links to one of the known url shortener / dereferer
     * services and, if so, resolves that link to the alternate download url.
     *
     * The spot's 'website' value is inspected first, then its 'description'.
     * The result of resolveUrl() is stored in $this->alternateDownloadUrl.
     * When neither field yields a usable link, $this->nzb is set to false.
     *
     * @return bool true when an alternate download url was resolved, false otherwise
     */
    public function hasUrlForSpot()
    {
        if ($this->alternateDownloadUrl) {
            return true;
        }

        // Url prefixes of the supported services. Must contain the first part of the url.
        $needles = [
            'http://base64.derefer.me',
            'http://derefer.me',
            'http://alturl.com',
            'http://tiny.cc',
            'http://bit.ly',
            'http://goo.gl',
            'http://hideref.org',
            'http://www.dereferer.org',
        ];

        // Search in the website url
        if (isset($this->spot['website'])) {
            $website = $this->spot['website'];
            SpotDebug::msg(SpotDebug::DEBUG, __CLASS__.'->hasUrlForSpot(), website = '.$website);

            foreach ($needles as $needle) {
                SpotDebug::msg(SpotDebug::DEBUG, __CLASS__.'->hasUrlForSpot(), website needle = '.$needle);

                if (strpos($website, $needle) === false) {
                    continue;
                }

                SpotDebug::msg(SpotDebug::DEBUG, __CLASS__.'->hasUrlForSpot(), website match');

                // Stop searching, we have a match. resolveUrl() yields false on failure.
                $this->alternateDownloadUrl = $this->resolveUrl($website);

                return $this->alternateDownloadUrl !== false;
            }
        }

        // We have no alternate yet, let's spider the description.
        if (isset($this->spot['description'])) {
            $description = $this->spot['description'];

            foreach ($needles as $needle) {
                SpotDebug::msg(SpotDebug::DEBUG, __CLASS__.'->hasUrlForSpot(), content looking for '.$needle);

                if (strpos($description, $needle) === false) {
                    continue;
                }

                SpotDebug::msg(SpotDebug::DEBUG, __CLASS__.'->hasUrlForSpot(), needle match');

                // The needle occurs somewhere in the text; dig out the complete url.
                $url = $this->extractUrlFromDescription($description, $needle);

                SpotDebug::msg(SpotDebug::DEBUG, __CLASS__.'->hasUrlForSpot(), needle match, url: '.$url);

                if ($url) {
                    $this->alternateDownloadUrl = $this->resolveUrl($url);

                    return $this->alternateDownloadUrl !== false;
                }
            }
        }

        $this->nzb = false;

        return false;
    }

    /**
     * Extracts the complete url belonging to $needle from a spot description.
     *
     * Uses its own match variables so the list of needles iterated by the
     * caller is never overwritten.
     *
     * @param string $description spot description, either as HTML or as the raw (UBB) text
     * @param string $needle      url prefix that is known to occur in $description
     *
     * @return string|false the url, or false when no url could be extracted
     */
    protected function extractUrlFromDescription($description, $needle)
    {
        // HTML form: the url is the text of an element, i.e. enclosed by '>' and '<'.
        // The quantifier is lazy so the capture ends at the first '<' after the url
        // instead of swallowing everything up to the last '<' on that line.
        if (preg_match('/\>('.preg_quote($needle, '/').'.*?)\</', $description, $htmlMatch)) {
            return $htmlMatch[1];
        }

        /*
         * We get the body before it has been converted
         * to HTML from the UBB code. We try to extract
         * the URL from the body anyway
         */
        $urlPattern = "(([^=])((https?|ftp|gopher|telnet|file|notes|ms-help):((//)|(\\\\))+[\w\d:#@%/;$()~_?\+-=\\\.&]*))";
        if (!preg_match_all($urlPattern, $description, $ubbMatches)) {
            return false;
        }

        // Group 2 holds the urls. Prefer the one that actually contains the needle ...
        foreach ($ubbMatches[2] as $candidate) {
            if (strpos($candidate, $needle) !== false) {
                return $candidate;
            }
        }

        // ... and otherwise keep the historic behaviour of using the first url found.
        return $ubbMatches[2][0];
    }

    /**
     * Requests the given url and reports where that request ended up.
     *
     * @param string $url url to resolve
     *
     * @return string|false the 'finalurl' reported by the http service, or false when the request failed
     */
    protected function resolveUrl($url)
    {
        SpotDebug::msg(SpotDebug::DEBUG, __CLASS__.'->resolveUrl(), url='.$url);

        $http = new Services_Providers_Http($this->_cacheDao);
        $response = $http->perform($url);

        if ($response['successful']) {
            // The http service reports the url it ended on.
            return $response['finalurl'];
        }

        SpotDebug::msg(SpotDebug::DEBUG, __CLASS__.'->resolveUrl(), not succesful='.$response['errorstr']);

        return false;
    }

    /**
     * Dispatches to the site specific download routine that matches the url.
     *
     * Supported sites: binsearch.info, nzbindex.nl / nzbindex.com, nzbclub.com
     * and nzbsearch.net.
     *
     * @param string $url  url of the page that was retrieved
     * @param string $body body of that page
     *
     * @return bool|mixed result of the site specific routine, or false when the site is not supported
     */
    protected function downloadNzbFrom($url, $body)
    {
        SpotDebug::msg(SpotDebug::DEBUG, __CLASS__.'->downloadNzbfrom() '.$url);

        // Binsearch
        if (strpos($url, 'binsearch.info') !== false) {
            return $this->downloadNzbFromBinsearch($url, $body);
        }

        // NZB Index
        if ((strpos($url, 'nzbindex.nl') !== false) || (strpos($url, 'nzbindex.com') !== false)) {
            // This function does not use the $body var.
            return $this->downloadNzbFromNzbindex($url);
        }

        // NZB Club
        if (strpos($url, 'nzbclub.com') !== false) {
            // This function does not use the $body var.
            return $this->downloadNzbFromNzbclub($url);
        }

        // NZB search
        // Strict comparison: strpos() returns 0 for a match at the very start of
        // the string, which a loose "!= false" would mistake for "not found".
        if (strpos($url, 'nzbsearch.net') !== false) {
            return $this->downloadNzbFromNzbSearch($url, $body);
        }

        // No support found return ;(
        return false;
    }

    /**
     * Tries to download the actual nzb from binsearch.
     *
     * The search term is taken from the "q=" part of the url. The result page
     * is then scanned for checkboxes of rows mentioning that term, and the
     * last one found is posted back to the binsearch nzb endpoint.
     *
     * @param string $url  url of the binsearch result page
     * @param string $body HTML of that result page
     *
     * @return bool|mixed the nzb xml, or false when no nzb could be retrieved
     */
    protected function downloadNzbFromBinsearch($url, $body)
    {
        // Match to get the nzb id.
        preg_match('/\q\=([a-z0-9]*)&*/i', $url, $matches);

        // This match is essential for the download. Bail out when the url has no
        // "q=" part or when its value is empty; continuing would read an
        // undefined $matches[1] or search the page for an empty string.
        if (!isset($matches[1]) || $matches[1] === '') {
            return false;
        }
        $query = $matches[1];

        // Hardcoded download url.
        $downloadUrl = 'http://www.binsearch.info/fcgi/nzb.fcgi?q='.$query;

        $dom = new DOMDocument();

        // Suppress errors, html does not have to be well formed to function.
        @$dom->loadHTML($body);
        $ids = [];

        // Fetch table rows from the result page.
        foreach ($dom->getElementsByTagName('tr') as $tr) {
            // Only continue parsing if the search query is found in the tr.
            if (strpos((string) $tr->nodeValue, $query) === false) {
                continue;
            }

            foreach ($tr->getElementsByTagName('input') as $input) {
                // Check type, we need the checkbox :)
                if ($input->getAttribute('type') != 'checkbox') {
                    continue;
                }

                // Walk up the DOM tree and check if the element following the
                // checkbox's parent mentions the query; this way only the
                // download rows are left.
                $cell = $input->parentNode;
                if (
                    $cell
                    && $cell->nextSibling
                    && strpos((string) $cell->nextSibling->nodeValue, $query) !== false
                ) {
                    // Push name to array. This name is needed to fetch the download.
                    $ids[] = $input->getAttribute('name');
                }
            }
        }

        // Fetch the last id, assuming our download was the first to be posted.
        // The original author added this step to avoid picking up unrelated downloads.
        $id = count($ids) ? array_pop($ids) : null;

        // Without an id we are not going to be able to get the nzb.
        if (!$id) {
            return false;
        }

        $postdata = [
            'action' => 'nzb',
            $id      => $id,
        ];

        return $this->postAndDownloadNzb($downloadUrl, $postdata);
    }

    /**
     * Sends a POST request to the given url and returns the response body
     * when that body is well formed xml.
     *
     * @param string $url      url to post to
     * @param array  $postdata fields to send as post content
     *
     * @return bool|mixed the response body, or false on a failed request or a body that is not valid xml
     */
    protected function postAndDownloadNzb($url, array $postdata)
    {
        $http = new Services_Providers_Http($this->_cacheDao);
        $http->setPostContent($postdata);
        $http->setMethod('POST');
        $response = $http->perform($url);

        if (!$response['successful']) {
            SpotDebug::msg(SpotDebug::DEBUG, __CLASS__.'->postAndDownloadNzb(), not succesful='.$response['errorstr']);

            return false;
        }

        // Parsing is only a well-formedness test; errors of a malformed body
        // are suppressed on purpose.
        return @simplexml_load_string($response['data']) ? $response['data'] : false;
    }

    /**
     * Tries to download the actual nzb from nzbindex.
     *
     * The page is requested again, this time with the cookie that accepts the
     * disclaimer agreement, and is then scanned for a "download" link whose
     * address contains the search term taken from the "q=" part of the url.
     *
     * @param string $url url of the nzbindex page
     *
     * @return bool|mixed the nzb xml, or false when no nzb could be retrieved
     */
    protected function downloadNzbFromNzbindex($url)
    {
        // Repeat the request with the disclaimer cookie set.
        $http = new Services_Providers_Http($this->_cacheDao);
        $http->setCookie('agreed=true');
        $response = $http->perform($url);

        if (!$response['successful']) {
            trigger_error($response['errorstr']);

            return false;
        }

        // The search term is essential for recognising the right link.
        preg_match('/\q\=([a-z0-9]*)/i', $url, $idMatch);
        if (count($idMatch) === 0) {
            return false;
        }

        // Errors are suppressed, the html does not have to be well formed.
        $page = new DOMDocument();
        $html = $response['data'];
        @$page->loadHTML($html);

        foreach ($page->getElementsByTagName('a') as $anchor) {
            // Only links labelled "download" are candidates.
            if (trim(strtolower($anchor->nodeValue)) != 'download') {
                continue;
            }

            $href = $anchor->getAttribute('href');

            // A link mentioning our search term is the direct download;
            // a plain GET is enough to fetch it.
            if (!empty($idMatch[1]) && strpos($href, $idMatch[1]) !== false) {
                return $this->getAndDownloadNzb($href);
            }
        }

        // Could not find the right download link.
        return false;
    }

    /**
     * Requests the given url and returns the response body when that body
     * is well formed xml.
     *
     * @param string $url url to request
     *
     * @return bool|mixed the response body, or false on a failed request or a body that is not valid xml
     */
    protected function getAndDownloadNzb($url)
    {
        $http = new Services_Providers_Http($this->_cacheDao);
        $response = $http->perform($url);

        if (!$response['successful']) {
            trigger_error($response['errorstr']);

            return false;
        }

        // Parsing is only a well-formedness test; errors of a malformed body
        // are suppressed on purpose.
        $xml = $response['data'];

        return @simplexml_load_string($xml) ? $xml : false;
    }

    /**
     * Tries to download the actual nzb from nzbclub.
     *
     * The download address is derived from the page url by replacing
     * 'nzb_view' with 'nzb_get' and appending 'nzb'.
     *
     * @param string $url url of the nzbclub page
     *
     * @return bool|mixed the nzb xml, or false when no nzb could be retrieved
     */
    protected function downloadNzbFromNzbclub($url)
    {
        $getUrl = str_replace('nzb_view', 'nzb_get', $url);

        return $this->getAndDownloadNzb($getUrl.'nzb');
    }

    /**
     * Tries to download the actual nzb from nzbsearch.
     *
     * The page body is searched for a "nzb_get.aspx?mid=..." reference, which
     * is then requested relative to http://www.nzbsearch.net/.
     *
     * @param string $url  url of the nzbsearch page (not used by this routine)
     * @param string $body HTML of that page
     *
     * @return bool|mixed the nzb xml, or false when no nzb could be retrieved
     */
    protected function downloadNzbFromNzbSearch($url, $body)
    {
        // Match to get the download reference. The dot is escaped so it only
        // matches a literal '.', not any character.
        // This match is essential for the download.
        if (preg_match('/nzb_get\.aspx\?mid=([a-zA-Z0-9]*)/i', $body, $matches) !== 1) {
            return false;
        }

        // $matches[0] is the complete "nzb_get.aspx?mid=..." reference.
        return $this->getAndDownloadNzb('http://www.nzbsearch.net/'.$matches[0]);
    }
}