bkdotcom/CssXpath

View on GitHub
src/CssSelect.php

Summary

Maintainability
A
25 mins
Test Coverage
A
100%
<?php

/**
 * This file is part of CssXpath
 *
 * @package   CssXPath
 * @author    Brad Kent <bkfake-github@yahoo.com>
 * @license   http://opensource.org/licenses/MIT MIT
 * @copyright 2018-2023 Brad Kent
 * @version   1.0
 *
 * @link http://www.github.com/bkdotcom/CssXpath
 */

namespace bdk\CssXpath;

use bdk\CssXpath\CssXpath;
use DOMDocument;
use DOMElement;
use DOMNodeList;
use DOMXpath;

/**
 * CSS selector class
 *
 * Originally based on https://github.com/tj/php-selector
 *
 * Use as an instance
 *    $cssSelect = new CssSelect($html);
 *    $nodes = $cssSelect->select('a.nifty');
 *
 * Use statically
 *    $nodes = CssSelect::select($html, 'a.nifty');
 *
 * @method select(string $selector, bool $asDomList)
 * @method static select($html, string $selector, bool $asDomList)
 */
class CssSelect
{
    /**
     * XPath evaluator bound to the document set via the constructor / setHtml()
     *
     * @var DOMXpath
     */
    protected $domXpath;

    /**
     * Constructor
     *
     * @param string|DOMDocument $html HTML string or DOMDocument object
     */
    public function __construct($html = '')
    {
        $this->setHtml($html);
    }

    /**
     * Magic method
     *
     * Used to access select() non-statically
     *
     * @param string $name      method name
     * @param array  $arguments method arguments
     *
     * @return mixed
     *
     * @throws \BadMethodCallException when $name is anything other than "select"
     */
    public function __call($name, $arguments)
    {
        if ($name === 'select') {
            return \call_user_func_array(array($this, 'selectNonStatic'), $arguments);
        }
        // Fail loudly: silently returning null would hide typos in method names
        throw new \BadMethodCallException(
            \sprintf('Call to undefined method %s::%s()', __CLASS__, $name)
        );
    }

    /**
     * Magic method
     *
     * Used to access select() statically
     *
     * @param string $name      method name
     * @param array  $arguments method arguments
     *
     * @return mixed
     *
     * @throws \BadMethodCallException when $name is anything other than "select"
     */
    public static function __callStatic($name, $arguments)
    {
        if ($name === 'select') {
            return \call_user_func_array(array(__CLASS__, 'selectStatic'), $arguments);
        }
        // Fail loudly: silently returning null would hide typos in method names
        throw new \BadMethodCallException(
            \sprintf('Call to undefined static method %s::%s()', __CLASS__, $name)
        );
    }

    /**
     * Set the document that subsequent non-static select() calls will query
     *
     * @param string|DOMDocument $html HTML string or DOMDocument object
     *
     * @return void
     */
    public function setHtml($html = '')
    {
        $this->domXpath = self::getDomXpath($html);
    }

    /**
     * Select elements from $html using css $selector.
     *
     * @param string|DOMDocument $html      HTML string or DOMDocument
     * @param string             $selector  CSS selector
     * @param bool               $asDomList (false) return the raw XPath result rather than arrays
     *
     * @return mixed
     */
    protected static function selectStatic($html, $selector, $asDomList = false)
    {
        return self::selectWithXpath(self::getDomXpath($html), $selector, $asDomList);
    }

    /**
     * Select elements using css $selector.
     *
     * When $asDomList is false (default):
     * matching elements will be returned as a list of associative arrays containing
     *      name : element name
     *      attributes : attributes array
     *      innerHTML : inner HTML
     *
     * Otherwise the XPath result (a DOMNodeList of DOMElements) is returned as is.
     *
     * @param string $selector  css selector
     * @param bool   $asDomList (false)
     *
     * @return mixed
     */
    protected function selectNonStatic($selector, $asDomList = false)
    {
        return self::selectWithXpath($this->domXpath, $selector, $asDomList);
    }

    /**
     * Shared implementation behind the static and non-static select()
     *
     * @param DOMXpath $domXpath  XPath evaluator for the document being queried
     * @param string   $selector  CSS selector
     * @param bool     $asDomList whether to return the XPath result untouched
     *
     * @return mixed empty array when the XPath evaluation yields a falsy result
     */
    private static function selectWithXpath(DOMXpath $domXpath, $selector, $asDomList)
    {
        $xpath = CssXpath::cssToXpath($selector);
        $elements = $domXpath->evaluate($xpath);
        if (!$elements) {
            // evaluate() gives false for an expression it cannot process
            return array();
        }
        return $asDomList
            ? $elements
            : self::elementsToArray($elements);
    }

    /**
     * Convert DOMNodeList to an array.
     *
     * Only element nodes are converted; any other node type in the list is skipped.
     *
     * @param DOMNodeList $elements elements
     *
     * @return array
     */
    protected static function elementsToArray(DOMNodeList $elements)
    {
        $array = array();
        foreach ($elements as $node) {
            if ($node->nodeType === XML_ELEMENT_NODE) {
                $array[] = self::elementToArray($node);
            }
        }
        return $array;
    }

    /**
     * Convert DOMElement to an array.
     *
     * @param DOMElement $element element
     *
     * @return array with keys "attributes" (name => value), "innerHTML" and "name"
     */
    protected static function elementToArray(DOMElement $element)
    {
        $array = array(
            'attributes' => array(),
            'innerHTML' => self::domInnerHtml($element),
            'name' => $element->nodeName,
        );
        foreach ($element->attributes as $key => $attr) {
            $array['attributes'][$key] = $attr->value;
        }
        return $array;
    }

    /**
     * Build inner html for given DOMElement
     *
     * @param DOMElement $element dom element
     *
     * @return string html (trimmed)
     */
    protected static function domInnerHtml(DOMElement $element)
    {
        $innerHTML = '';
        foreach ($element->childNodes as $child) {
            $innerHTML .= $element->ownerDocument->saveHTML($child);
        }
        /*
            saveHTML doesn't close "void" tags  :(
            rewrite <tag ...> as <tag ... /> for each of them
        */
        $voidTags = array('area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr');
        $regEx = '#<(' . \implode('|', $voidTags) . ')(\b[^>]*)>#';
        $innerHTML = \preg_replace($regEx, '<\\1\\2 />', $innerHTML);
        return \trim($innerHTML);
    }

    /**
     * Return DOMXpath object
     *
     * A DOMDocument is wrapped as is.  Anything else is parsed as UTF-8 HTML.
     * libxml's "use internal errors" setting is switched on only for the
     * duration of the parse and then put back to what the caller had.
     *
     * @param string|DOMDocument $html HTML string or DOMDocument object
     *
     * @return DOMXpath
     */
    protected static function getDomXpath($html)
    {
        if ($html instanceof DOMDocument) {
            return new DOMXpath($html);
        }
        // empty() also treats the string "0" as empty, but "0" is real content
        if (empty($html) && $html !== '0') {
            $html = '<!-- empty document -->';
        }
        $dom = new DOMDocument();
        // Buffer parse errors instead of emitting warnings, without leaking
        // the setting into the rest of the process
        $useErrorsPrev = \libxml_use_internal_errors(true);
        // The xml processing instruction makes libxml read the markup as UTF-8
        $dom->loadHTML('<?xml encoding="UTF-8">' . $html);
        // Restoring to "disabled" also discards the errors buffered by this parse
        \libxml_use_internal_errors($useErrorsPrev);
        foreach ($dom->childNodes as $node) {
            if ($node->nodeType === XML_PI_NODE) {
                $dom->removeChild($node); // remove xml encoding tag
                break;
            }
        }
        $dom->encoding = 'UTF-8';
        return new DOMXpath($dom);
    }
}