spotweb/spotweb

View on GitHub
lib/services/Format/Services_Format_Parsing.php

Summary

Maintainability
F
3 days
Test Coverage
<?php

/**
 * Parses Spotnet spots, either from the full XML document (parseFull)
 * or from the NNTP overview headers alone (parseHeader).
 *
 * Both entry points return a plain associative array describing the spot.
 */
class Services_Format_Parsing
{
    /* Signing service used to verify spot headers and full spots */
    private $_spotSigning = null;
    /* Helper for base64 unpreparing, message-id validation and spotter ids */
    private $_util;

    /**
     * Wire up the signing service and the format utility helper.
     */
    public function __construct()
    {
        $this->_spotSigning = Services_Signing_Base::factory();
        $this->_util = new Services_Format_Util();
    }

    // ctor

    /**
     * Wrap the contents of the given elements in a CDATA section when they
     * are not already wrapped.
     *
     * Some Spotnet clients create invalid XML - see
     * messageid ZOB4WPyqQfcHqykUAES8q@spot.net for example, because
     * it uses an unescaped & not in a CDATA block.
     *
     * Only tags without attributes ("<Name>") are recognised. The lookup is
     * case-insensitive, the replacement itself is case-sensitive and touches
     * every occurrence of the tag in the document.
     *
     * @param string   $xmlStr raw XML of the spot
     * @param string[] $elems  element names whose content should be CDATA-wrapped
     *
     * @return string the corrected XML, with all bytes 0x00-0x1F removed
     */
    private function correctElmContents($xmlStr, $elems)
    {
        $cdataStart = '<![CDATA[';
        $cdataEnd = ']]>';

        /*
         * replace low-ascii characters, see messageid KNCuzvnxJJErJibUAAxQJ@spot.net
         */
        $xmlStr = preg_replace('/[\x00-\x1F]/', '', $xmlStr);

        /* and loop through all elements and fix them up */
        foreach ($elems as $elementName) {
            $openTag = '<'.$elementName.'>';
            $closeTag = '</'.$elementName.'>';

            // both the opening and the closing tag must be present
            $startElem = stripos($xmlStr, $openTag);
            $endElem = stripos($xmlStr, $closeTag);
            if (($startElem === false) || ($endElem === false)) {
                continue;
            }

            /*
             * Only add the CDATA wrapper when the element's content does not
             * already start with one
             */
            $contentStart = substr($xmlStr, $startElem + strlen($openTag), strlen($cdataStart));
            if ($contentStart !== $cdataStart) {
                $xmlStr = str_replace(
                    [$openTag, $closeTag],
                    [$openTag.$cdataStart, $cdataEnd.$closeTag],
                    $xmlStr
                );
            } // if
        } // foreach

        return $xmlStr;
    }

    // correctElmContents

    /**
     * Make string utf8mb3 for mysql (only 3 byte utf codes) by replacing
     * every 4-byte UTF-8 sequence.
     *
     * @param string $string      text to clean
     * @param string $replacement text substituted for each 4-byte sequence
     *
     * @return string|null the result of preg_replace()
     */
    private function replace4Byte($string, $replacement = '')
    {
        return preg_replace('%(?:
          \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
        | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
        | \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
    )%xs', $replacement, $string);
    }

    // replace4Byte

    /**
     * Parse a full Spot according to the XML structure.
     *
     * @param string $xmlStr the spot's XML document (root element Spotnet)
     *
     * @return array the parsed spot; every key of the template below is always present
     */
    public function parseFull($xmlStr)
    {
        // Create a template array so we always have the full fields to prevent ugly notices
        $tpl_spot = ['category' => '', 'website' => '', 'image' => '', 'sabnzbdurl' => '', 'messageid' => '', 'searchurl' => '', 'description' => '',
            'sub'               => '', 'filesize' => '', 'poster' => '', 'tag' => '', 'nzb' => [], 'title' => '',
            'filename'          => '', 'newsgroup' => '', 'subcata' => '', 'subcatb' => '',
            'subcatc'           => '', 'subcatd' => '', 'subcatz' => '', 'created' => '', 'key' => '', 'prevMsgids' => [], 'newsreader' => '', ];

        /*
         * Some legacy spotNet clients create incorrect/invalid multiple segments,
         * we use this crude way to workaround this. GH issue #1608
         */
        if (strpos($xmlStr, 'spot.net></Segment') !== false) {
            $xmlStr = str_replace(
                ['spot.net></Segment>', 'spot.ne</Segment>'],
                ['spot.net</Segment>', 'spot.net</Segment>'],
                $xmlStr
            );
        } // if

        /*
         * Fix up some forgotten entity encoding / cdata sections in the XML
         */
        $xmlStr = $this->correctElmContents($xmlStr, ['Title', 'Description', 'Image', 'Tag', 'Website']);

        /*
         * Suppress warnings for corrupt messageids, eg: <evoCgYpLlLkWe97TQAmnV@spot.net>
         *
         * NOTE(review): '@' only silences warnings; the SimpleXMLElement
         * constructor still throws on unparsable XML, so callers presumably
         * handle that exception - confirm.
         */
        $xmltop = @(new SimpleXMLElement($xmlStr));
        $xml = $xmltop->Posting;
        $tpl_spot['created'] = (string) $xml->Created;
        $tpl_spot['key'] = (string) $xml->Key;
        $tpl_spot['category'] = (string) $xml->Category;
        $tpl_spot['website'] = (string) $xml->Website;
        $tpl_spot['description'] = (string) $xml->Description;
        $tpl_spot['filesize'] = (string) $xml->Size;
        // cast to string first, mb_convert_encoding() expects a string and not a SimpleXMLElement
        $tpl_spot['poster'] = (string) mb_convert_encoding((string) $xml->Poster, 'UTF-8', 'ISO-8859-1');
        $tpl_spot['tag'] = (string) mb_convert_encoding((string) $xml->Tag, 'UTF-8', 'ISO-8859-1');
        $tpl_spot['title'] = (string) $xml->Title;

        // Decode HTML special characters, title otherwise search will be broken, description as body in newsgroup
        $tpl_spot['title'] = html_entity_decode($tpl_spot['title'], ENT_QUOTES, 'UTF-8');
        $tpl_spot['title'] = $this->replace4Byte($tpl_spot['title'], '??');
        $tpl_spot['description'] = html_entity_decode($tpl_spot['description'], ENT_QUOTES, 'UTF-8');
        $tpl_spot['description'] = $this->replace4Byte($tpl_spot['description'], '??');

        // FTD spots have the filename
        if (!empty($xml->Filename)) {
            $tpl_spot['filename'] = (string) $xml->Filename;
        } // if

        // FTD spots have the newsgroup (element names are case-sensitive in SimpleXML)
        if (!empty($xml->Newsgroup)) {
            $tpl_spot['newsgroup'] = (string) $xml->Newsgroup;
        } // if

        /*
         * Images available can be in the XML in two different ways.
         *
         * Some older spots just have an URL we can use, newer spots
         * have an height/width/messageid(s) pair we use to retrieve the image
         * from
         */
        if (empty($xml->Image->Segment)) {
            $tpl_spot['image'] = (string) $xml->Image;
        } else {
            $tpl_spot['image'] = [
                'height' => (string) $xml->Image['Height'],
                'width'  => (string) $xml->Image['Width'],
            ];

            foreach ($xml->xpath('/Spotnet/Posting/Image/Segment') as $seg) {
                // Make sure the messageid's are valid so we do not throw an NNTP error
                if (!$this->_util->validMessageId((string) $seg)) {
                    // one invalid segment invalidates the whole list
                    $tpl_spot['image']['segment'] = [];
                    break;
                } else {
                    $tpl_spot['image']['segment'][] = (string) $seg;
                } // if
            } // foreach
        } // else

        // Just stitch together the NZB segments, one invalid segment invalidates the whole list
        foreach ($xml->xpath('/Spotnet/Posting/NZB/Segment') as $seg) {
            if (!$this->_util->validMessageId((string) $seg)) {
                $tpl_spot['nzb'] = [];
                break;
            } else {
                $tpl_spot['nzb'][] = (string) $seg;
            } // else
        } // foreach

        // PREVSPOTS, invalid messageid's are skipped individually
        if (!empty($xml->PREVSPOTS->Spot)) {
            foreach ($xml->xpath('/Spotnet/Posting/PREVSPOTS/Spot') as $seg) {
                // Make sure the messageid's are valid so we do not throw an NNTP error
                if ($this->_util->validMessageId((string) $seg)) {
                    $tpl_spot['prevMsgids'][] = (string) $seg;
                } // if
            } // foreach
        } // if

        // Extra / newsreader
        if (!empty($xmltop->Extra->Newsreader)) {
            $tpl_spot['newsreader'] = (string) $xmltop->Extra->Newsreader;
        }

        // fix the category in the XML array but only for new spots
        if ((int) $xml->Key != 1) {
            $tpl_spot['category'] = ((int) $tpl_spot['category']) - 1;
        } // if

        /*
         * For FTD spots an array of subcategories is created. This array is not
         * compatible with that of newer spots so we need two separate codepaths
         */
        $subcatList = [];

        /*
         * We fix up the category list later in the system, so we just extract the
         * list of subcategories
         *
         * NOTE(review): the condition looks at SubCat directly below Posting while
         * the XPath reads it below Category - confirm against a real FTD spot
         * before changing either side.
         */
        if (!empty($xml->SubCat)) {
            foreach ($xml->xpath('/Spotnet/Posting/Category/SubCat') as $sub) {
                $subcatList[] = (string) $sub;
            } // foreach
        } else {
            foreach ($xml->xpath('/Spotnet/Posting/Category/Sub') as $sub) {
                $subcatList[] = (string) $sub;
            } // foreach
        } // if

        /*
         * Mangle the several types of subcategory listing to make sure we only
         * have to use one type in the rest of Spotweb. "01a09" becomes "a9|" and
         * is appended to the 'subcata' field.
         */
        foreach ($subcatList as $subcat) {
            if (preg_match('/(\d+)([aAbBcCdDzZ])(\d+)/', $subcat, $tmpMatches)) {
                $subCatVal = strtolower($tmpMatches[2]).((int) $tmpMatches[3]);
                $tpl_spot['subcat'.$subCatVal[0]] .= $subCatVal.'|';
            } // if
        } // foreach

        /*
         * subcatz is a subcategory introduced in later Spotnet formats, we prefer to
         * always have this subcategory so we just fake it if it's not listed.
         */
        if (empty($tpl_spot['subcatz'])) {
            $tpl_spot['subcatz'] = SpotCategories::createSubcatZ($tpl_spot['category'], $tpl_spot['subcata'].$tpl_spot['subcatb'].$tpl_spot['subcatd']);
        } // if

        // map deprecated genre categories to their new genre category
        $tpl_spot['subcatd'] = SpotCategories::mapDeprecatedGenreSubCategories($tpl_spot['category'], $tpl_spot['subcatd'], $tpl_spot['subcatz']);
        $tpl_spot['subcatc'] = SpotCategories::mapLanguageSubCategories($tpl_spot['category'], $tpl_spot['subcatc'], $tpl_spot['subcatz']);

        // and return the parsed XML
        return $tpl_spot;
    }

    // parseFull()

    /**
     * Verify a spot which is signed by the poster itself: the sha1 of the
     * bracketed messageid must start with '0000' (hashcash), after which the
     * header signature is checked against the public key from the From header.
     *
     * 'verified' reflects the hashcash only; a valid header signature
     * additionally fills 'spotterid'.
     *
     * @param array  $spot      spot parsed so far, needs messageid, keyid and selfsignedpubkey
     * @param string $signature the unprepared header signature
     *
     * @return array the spot with 'verified' (and possibly 'spotterid') updated
     */
    private function verifySelfSignedHeader($spot, $signature)
    {
        $userSignedHash = sha1('<'.$spot['messageid'].'>', false);
        $spot['verified'] = (substr($userSignedHash, 0, 4) === '0000');

        if ($spot['verified']) {
            /*
             * Create a fake RSA keyarray so we can validate it using our standard
             * infrastructure
             */
            $userRsaKey = [$spot['keyid'] => ['modulo' => $spot['selfsignedpubkey'], 'exponent' => 'AQAB']];

            /*
             * We cannot use this as a full measure to check the spot's validness yet,
             * because at least one Spotnet client feeds us invalid data for now
             */
            if ($this->_spotSigning->verifySpotHeader($spot, $signature, $userRsaKey)) {
                $spot['spotterid'] = $this->_util->calculateSpotterId($spot['selfsignedpubkey']);
            } // if
        } // if

        return $spot;
    }

    // verifySelfSignedHeader

    /**
     * Parse a Spot using only the header information.
     *
     * @param string $subj      Subject header, holds the title and the tag separated by '|'
     * @param string $from      From header, see the layout description below
     * @param string $date      Date header, converted with strtotime()
     * @param string $messageid messageid of the spot, without the surrounding < and >
     * @param array  $rsaKeys   known RSA keys, indexed by key id
     *
     * @return array|false false when the From header cannot be parsed, else the
     *                     parsed spot. 'verified' tells whether the spot passed
     *                     verification; a spot without title or poster is
     *                     returned unverified.
     */
    public function parseHeader($subj, $from, $date, $messageid, $rsaKeys)
    {
        // Initialize an empty array, we create a basic template in a few
        $spot = [];

        /*
         * The "From" header is created using the following system:
         *
         *   From: [Nickname] <[RANDOM or PUBLICKEY]@[CAT][KEY-ID][SUBCAT].[SIZE].[RANDOM].[DATE].[CUSTOM-ID].[CUSTOM-VALUE].[SIGNATURE]>
         *        or
         *   From: [Nickname] <[PUBLICKEY-MODULO.USERSIGNATURE]@[CAT][KEY-ID][SUBCAT].[SIZE].[RANDOM].[DATE].[CUSTOM-ID].[CUSTOM-VALUE].[SIGNATURE]>
         *
         *
         * First we want to extract everything after the @ but because a nickname could contain an @, we have to mangle it a bit
         */
        $fromInfoPos = strpos($from, '<');
        if ($fromInfoPos === false) {
            return false;
        } // if

        // Remove the posters' name and the <> characters
        $fromAddress = explode('@', substr($from, $fromInfoPos + 1, -1));
        if (count($fromAddress) < 2) {
            return false;
        } // if
        $spot['header'] = $fromAddress[1];

        /*
         * It is possible the part before the @ contains both the
         * users' signature as the spots signature as signed by the user
         */
        $headerSignatureTemp = explode('.', $fromAddress[0]);
        $spot['selfsignedpubkey'] = $this->_util->spotUnprepareBase64($headerSignatureTemp[0]);
        if (isset($headerSignatureTemp[1])) {
            $spot['user-signature'] = $this->_util->spotUnprepareBase64($headerSignatureTemp[1]);
        } // if

        /*
         * Initialize some basic variables. We set 'verified' to false so we can
         * exit this function at any time and the gathered data for this spot up til
         * then is still ignored.
         */
        $spot['verified'] = false;
        $spot['filesize'] = 0;
        $spot['messageid'] = $messageid;
        $spot['stamp'] = strtotime($date);

        /*
         * Split the .-delimited fields into an array so we can mangle it. We require
         * at least six fields, if any less we can safely assume the spot is invalid
         */
        $fields = explode('.', $spot['header']);
        if (count($fields) < 6) {
            return false;
        } // if

        /*
         * Extract the fixed fields from the header.
         *
         * The nickname is everything before " <". When the header starts with '<'
         * there is no nickname at all; a negative substr() length would wrongly
         * return most of the header instead.
         *
         * NOTE(review): subtracting 1.0 makes 'category' a float here while
         * parseFull() produces an int; left as is because callers may rely on it.
         */
        $spot['poster'] = ($fromInfoPos > 0) ? substr($from, 0, $fromInfoPos - 1) : '';
        $spot['category'] = (int) substr($fields[0], 0, 1) - 1.0;
        $spot['keyid'] = (int) substr($fields[0], 1, 1);
        $spot['filesize'] = $fields[1];
        $spot['subcata'] = '';
        $spot['subcatb'] = '';
        $spot['subcatc'] = '';
        $spot['subcatd'] = '';
        $spot['subcatz'] = '';
        $spot['wassigned'] = false;
        $spot['spotterid'] = '';
        $isRecentKey = $spot['keyid'] != 1;

        /*
         * If the keyid is invalid, abort trying to parse it
         */
        if ($spot['keyid'] < 0) {
            return false;
        } // if

        /*
         * Listings of subcategories is dependent on the age of the spot.
         *
         * FTD spots just list all subcategories like: a9b4c0d5d15d11
         * Newer spots always use three characters for each subcategory like: a09b04c00d05d15d11.
         *
         * We really do not care for this, we just parse them using the same code as the
         * first one.
         *
         * We pad $strCatList with an extra set of tokens so we always parse the last category,
         * we make sure any sanitycheck is passed by adding 3 tokens.
         */
        $strCatList = strtolower(substr($fields[0], 2)).'!!!';
        $strCatListLen = strlen($strCatList);

        /*
         * Initialize some basic variables to use for sanitychecking (eg: valid subcats)
         */
        $validSubcats = ['a' => true, 'b' => true, 'c' => true, 'd' => true, 'z' => true];
        $tmpCatBuild = '';

        /* And just try to extract all given subcategories */
        for ($i = 0; $i < $strCatListLen; $i++) {
            /*
             * If the current character is not a number, we found the next
             * subcategory. Add the current one to the list, and start
             * parsing the new one.
             *
             * NOTE(review): empty() also treats a buffer of "0" as empty, which
             * only matters for malformed lists starting with a digit.
             */
            if ((!is_numeric($strCatList[$i])) && (!empty($tmpCatBuild))) {
                if (isset($validSubcats[$tmpCatBuild[0]])) {
                    $spot['subcat'.$tmpCatBuild[0]] .= $tmpCatBuild[0].(int) substr($tmpCatBuild, 1).'|';
                } // if

                $tmpCatBuild = '';
            } // if

            $tmpCatBuild .= $strCatList[$i];
        } // for

        /*
         * subcatz is a subcategory introduced in later Spotnet formats, we prefer to
         * always have this subcategory so we just fake it if it's not listed.
         */
        if (empty($spot['subcatz'])) {
            $spot['subcatz'] = SpotCategories::createSubcatz($spot['category'], $spot['subcata'].$spot['subcatb'].$spot['subcatd']);
        } // if

        // map deprecated genre categories to their new genre category
        $spot['subcatd'] = SpotCategories::mapDeprecatedGenreSubCategories($spot['category'], $spot['subcatd'], $spot['subcatz']);
        $spot['subcatc'] = SpotCategories::mapLanguageSubCategories($spot['category'], $spot['subcatc'], $spot['subcatz']);

        if ((strpos($subj, '=?') !== false) && (strpos($subj, '?=') !== false)) {
            // This is an old format to parse, instantiate the legacy parsing
            $legacyParser = new Services_Format_ParsingLegacy();

            // Make sure its as simple as possible
            $subj = str_replace('?= =?', '?==?', $subj);
            $subj = trim($legacyParser->oldEncodingParse($subj));

            // strip real CR / LF characters, hence the double quotes
            $subj = str_replace(["\r", "\n"], '', $subj);
        } // if

        if ($isRecentKey) {
            // recent spots: "title|tag", anything after the second '|' is ignored
            $tmp = explode('|', $subj);

            $spot['title'] = trim($tmp[0]);
            if (count($tmp) > 1) {
                $spot['tag'] = trim($tmp[1]);
            } else {
                $spot['tag'] = '';
            } // else
        } else {
            $tmp = explode('|', $subj);
            if (count($tmp) <= 1) {
                $tmp = [$subj];
            } // if

            // old spots: the last part is the tag
            $spot['tag'] = trim($tmp[count($tmp) - 1]);

            // remove the tags from the array
            array_pop($tmp);
            array_pop($tmp);

            $spot['title'] = trim(implode('|', $tmp));

            if ((strpos($spot['title'], chr(0xC2)) !== false) || (strpos($spot['title'], chr(0xC3)) !== false)) {
                // This is an old format to parse, instantiate the legacy parsing
                $legacyParser = new Services_Format_ParsingLegacy();
                $spot['title'] = trim($legacyParser->oldEncodingParse($spot['title']));
            } // if
        } // if recentKey

        // Title and poster fields are mandatory, we require it to validate the signature
        if ((strlen($spot['title']) == 0) || (strlen($spot['poster']) == 0)) {
            return $spot;
        } // if

        /*
         * For any recentkey ( >1) or spots created after year-2010, we require the spot
         * to be signed
         */
        $mustbeSigned = $isRecentKey || ($spot['stamp'] > 1293870080);
        if ($mustbeSigned) {
            // the signature is the last field of the header
            $spot['headersign'] = $fields[count($fields) - 1];
            $spot['wassigned'] = (strlen($spot['headersign']) != 0);
        } // if must be signed
        else {
            $spot['verified'] = true;
            $spot['wassigned'] = false;
        } // if doesnt need to be signed, pretend that it is

        /*
         * Don't verify spots which are already verified
         */
        if ($spot['wassigned']) {
            /*
             * There are currently two known methods to which Spots are signed,
             * each having different characteristics, making it a bit difficult
             * to work with this.
             *
             * The oldest method uses a secret private key and a signing server, we
             * name this method SPOTSIGN_V1. The users' public key is only available
             * in the XML header, not in the From header. This is the preferred method.
             *
             * The second method uses a so-called "self signed" spot (the spotter signs
             * the spots, posts the public key in the header and a hashcash is used to
             * prevent spamming). This method is called SPOTSIGN_V2.
             */

            // the signature this header is signed with
            $signature = $this->_util->spotUnprepareBase64($spot['headersign']);

            if ($spot['keyid'] == 7) {
                /*
                 * SPOTSIGN_V2: KeyID 7 has a special meaning, it defines a
                 * self-signed spot and requires a hashcash
                 */
                $spot = $this->verifySelfSignedHeader($spot, $signature);
            } elseif (isset($rsaKeys[$spot['keyid']])) {
                /*
                 * SPOTSIGN_V1, only for keys which are actually known.
                 *
                 * The filesize must be *compared* here: an assignment would
                 * overwrite the filesize of every spot using key 2.
                 */
                if ($spot['keyid'] == 2 && $spot['filesize'] == 999 && strlen($spot['selfsignedpubkey']) > 50) {
                    /* Check personal dispose message */
                    $spot = $this->verifySelfSignedHeader($spot, $signature);
                } else {
                    $spot['verified'] = $this->_spotSigning->verifySpotHeader($spot, $signature, $rsaKeys);
                } // else
            } // elseif

            /*
             * Even more recent spots, contain the users' full publickey
             * in the header. This allows us to uniquely identify and verify
             * the poster of the spot.
             *
             * Try to extract this information.
             */
            if ($spot['verified'] && (!empty($spot['user-signature'])) && (!empty($spot['selfsignedpubkey']))) {
                /*
                 * Extract the public key
                 */
                $spot['spotterid'] = $this->_util->calculateSpotterId($spot['selfsignedpubkey']);
                $spot['user-key'] = ['modulo' => $spot['selfsignedpubkey'],
                    'exponent'                => 'AQAB', ];
                /*
                 * The spot contains the signature in the header of the spot
                 */
                $spot['verified'] = $this->_spotSigning->verifyFullSpot($spot);
            } // if
        } // if was signed

        /*
         * We convert the title and other fields to UTF8, we cannot
         * do this any earlier because it would break the RSA signature
         */
        if ($spot['verified']) {
            $spot['title'] = mb_convert_encoding($spot['title'], 'UTF-8', 'ISO-8859-1');
            $spot['poster'] = mb_convert_encoding($spot['poster'], 'UTF-8', 'ISO-8859-1');
            $spot['tag'] = mb_convert_encoding($spot['tag'], 'UTF-8', 'ISO-8859-1');

            // If a spot is in the future, fix it
            if (time() < $spot['stamp']) {
                $spot['stamp'] = time();
            } // if
        } // if

        return $spot;
    }

    // parseHeader
} // class Services_Format_Parsing