serp-spider/search-engine-google

View on GitHub
src/Parser/Evaluated/Rule/Natural/Classical/ClassicalResult.php

Summary

Maintainability
A
2 hrs
Test Coverage
<?php
/**
 * @license see LICENSE
 */

namespace Serps\SearchEngine\Google\Parser\Evaluated\Rule\Natural\Classical;

use Serps\Core\Dom\DomElement;
use Serps\Core\Media\MediaFactory;
use Serps\SearchEngine\Google\Exception\InvalidDOMException;
use Serps\SearchEngine\Google\Page\GoogleDom;
use Serps\Core\Serp\BaseResult;
use Serps\Core\Serp\IndexedResultSet;
use Serps\SearchEngine\Google\Parser\ParsingRuleInterface;
use Serps\SearchEngine\Google\NaturalResultType;

/**
 * Parsing rule for "classical" natural results: a title link, a destination
 * (cite) line and an optional description, possibly enriched with site links,
 * a thumbnail or a video duration marker.
 */
class ClassicalResult implements ParsingRuleInterface
{

    /**
     * Tells whether the given node should be handled by this rule.
     *
     * The node matches when its class attribute is exactly "g" and the
     * ".rc" css query run against it yields exactly one element.
     *
     * @param GoogleDom $dom
     * @param DomElement $node
     * @return int self::RULE_MATCH_MATCHED or self::RULE_MATCH_NOMATCH
     */
    public function match(GoogleDom $dom, DomElement $node)
    {
        if ($node->getAttribute('class') !== 'g') {
            return self::RULE_MATCH_NOMATCH;
        }

        if ($dom->cssQuery('.rc', $node)->length === 1) {
            return self::RULE_MATCH_MATCHED;
        }

        return self::RULE_MATCH_NOMATCH;
    }

    /**
     * Extracts the base data of a classical result.
     *
     * @param GoogleDom $dom
     * @param \DomElement $node
     * @return array with the keys 'title', 'url', 'destination', 'description'
     *               (null when no description element is found) and 'isAmp'
     *               (a closure evaluating to a boolean)
     * @throws InvalidDOMException when the title link, the h3 title or the
     *                             destination (cite) element cannot be found
     */
    protected function parseNode(GoogleDom $dom, \DomElement $node)
    {

        // find the title/url
        /* @var $aTag \DOMElement */
        $aTag = $dom
            ->xpathQuery("descendant::*[(self::div or self::h3) and @class='r'][1]/a", $node)
            ->item(0);
        if (!$aTag) {
            throw new InvalidDOMException('Cannot parse a classical result.');
        }

        /* @var $h3Tag \DOMElement */
        $h3Tag = $dom
            ->xpathQuery('descendant::h3', $node)
            ->item(0);
        if (!$h3Tag) {
            throw new InvalidDOMException('Cannot parse a classical result.');
        }

        $destinationTag = $dom
            ->cssQuery('div.f cite, div.TbwUpd cite', $node)
            ->getNodeAt(0);

        // The class name has to be fully qualified: a relative name would be resolved
        // against the current namespace and the check would never be true.
        if ($destinationTag instanceof \Serps\Core\Dom\NullDomNode) {
            throw new InvalidDOMException('Cannot parse a classical result.');
        }

        $descriptionTag = $dom
            ->xpathQuery("descendant::span[@class='st']", $node)
            ->item(0);

        return [
            'title'   => $h3Tag->nodeValue,
            'url'     => $dom->getUrl()->resolveAsString($aTag->getAttribute('href')),
            'destination' => $destinationTag->getNodeValue(),
            // trim needed for mobile results coming with an initial space
            'description' => $descriptionTag ? trim($descriptionTag->nodeValue) : null,
            // evaluated lazily: true when an element matching ".amp_r" is found for the node
            'isAmp' => function () use ($dom, $node) {
                return $dom
                        ->cssQuery('.amp_r', $node)
                        ->length > 0;
            },
        ];
    }

    /**
     * If isLarge() matched, this will parse the content of site links.
     *
     * The returned closure builds, when invoked, one CLASSICAL_SITELINK result per
     * ".mslg .sld" element; each result lazily exposes 'title', 'description' and 'url'.
     *
     * @param GoogleDom $dom
     * @param \DomElement $node
     * @return \Closure
     */
    protected function parseSiteLink(GoogleDom $dom, \DomElement $node)
    {
        return function () use ($dom, $node) {
            $items = $dom->cssQuery('.mslg .sld', $node);
            $siteLinksData = [];
            foreach ($items as $item) {
                $siteLinksData[] = new BaseResult(NaturalResultType::CLASSICAL_SITELINK, [
                    'title' => function () use ($dom, $item) {
                        return $dom->cssQuery('h3.r a', $item)
                            ->getNodeAt(0)
                            ->getNodeValue();
                    },
                    'description' => function () use ($dom, $item) {
                        return $dom->cssQuery('.st', $item)
                            ->getNodeAt(0)
                            ->getNodeValue();
                    },
                    'url' => function () use ($dom, $item) {
                        return $dom->cssQuery('h3.r a', $item)
                            ->getNodeAt(0)
                            ->getAttribute('href');
                    },
                ]);
            }
            return $siteLinksData;
        };
    }

    /**
     * Check if has site links. Might be overridden by subparser like ClassicalCard
     * @param GoogleDom $dom
     * @param \DomElement $node
     * @return bool
     */
    protected function isLarge(GoogleDom $dom, \DomElement $node)
    {
        return $dom->cssQuery('.nrgt', $node)->length === 1;
    }

    /**
     * Parses the node and appends the resulting item to the result set.
     *
     * The item always has the CLASSICAL type; CLASSICAL_LARGE, CLASSICAL_ILLUSTRATED
     * and CLASSICAL_VIDEO are added when site links, a thumbnail or a video duration
     * marker are detected.
     *
     * @param GoogleDom $dom
     * @param \DomElement $node
     * @param IndexedResultSet $resultSet
     * @throws InvalidDOMException when the base data of the result cannot be parsed
     */
    public function parse(GoogleDom $dom, \DomElement $node, IndexedResultSet $resultSet)
    {
        $data = $this->parseNode($dom, $node);

        $resultTypes = [NaturalResultType::CLASSICAL];


        // CLASSICAL RESULT MIGHT BE ENLARGED WITH SITELINKS
        if ($this->isLarge($dom, $node)) {
            $data['sitelinks'] = $this->parseSiteLink($dom, $node);
            $resultTypes[] = NaturalResultType::CLASSICAL_LARGE;
        }


        // classical result can have a video thumbnail
        $thumb = $dom->getXpath()
            ->query("descendant::g-img[@class='_ygd']/img", $node)
            ->item(0);

        if ($thumb) {
            $resultTypes[] = NaturalResultType::CLASSICAL_ILLUSTRATED;

            // $thumb is known to be set here, the media is only built on demand
            $data['thumb'] = function () use ($thumb) {
                return MediaFactory::createMediaFromSrc($thumb->getAttribute('src'));
            };
        }

        $videoDuration = $dom->cssQuery('.vdur', $node);
        if ($videoDuration->length === 1) {
            $resultTypes[] = NaturalResultType::CLASSICAL_VIDEO;
            $data['videoLarge'] = false;
        }


        $item = new BaseResult($resultTypes, $data);
        $resultSet->addItem($item);
    }
}