bkdotcom/CssXpath

View on GitHub
src/CssXpath.php

Summary

Maintainability
A
3 hrs
Test Coverage
A
100%
<?php

/**
 * This file is part of CssXpath
 *
 * @package   CssXPath
 * @author    Brad Kent <bkfake-github@yahoo.com>
 * @license   http://opensource.org/licenses/MIT MIT
 * @copyright 2018-2023 Brad Kent
 * @version   1.0
 *
 * @link http://www.github.com/bkdotcom/CssXpath
 */

namespace bdk\CssXpath;

/**
 * Convert CSS selector to xpath selector
 *
 * @see https://developer.mozilla.org/en-US/docs/Web/CSS/Reference#Selectors
 * @see https://github.com/rdsubhas/css-xpath-converter/blob/gh-pages/app.js
 * @see https://msdn.microsoft.com/en-us/library/ms256086(v=vs.110).aspx
 * @see https://en.wikibooks.org/wiki/XPath/CSS_Equivalents
 * @see http://ricostacruz.com/cheatsheets/xpath.html
 */
class CssXpath
{
    /** @var array<string,string> memoized top-level conversions (css selector => xpath) */
    private static $cache = array();

    /**
     * @var string[] fragments (attribute predicates, :contains(), :not(), ...) that were swapped out
     *               for "[{index}]" placeholders so later regexes cannot mangle their content
     */
    private static $strings = array();

    /**
     * css -> xpath
     *
     * Results are cached per selector string.
     *
     * @param string $selector CSS selector
     *
     * @return string
     */
    public static function cssToXpath($selector)
    {
        if (isset(self::$cache[$selector])) {
            return self::$cache[$selector];
        }

        // placeholders are only meaningful within a single top-level conversion
        self::$strings = array();

        $xpath = self::convert($selector);
        self::$cache[$selector] = $xpath;
        return $xpath;
    }

    /**
     * Handle attributes regex
     *
     * Builds the xpath predicate for a `[...]` attribute selector and swaps it for a placeholder.
     *
     * @param array $matches preg_match matches: [1] = optional leading whitespace, [2] = text between the brackets
     *
     * @return string placeholder (prefixed with a space when the selector had leading whitespace)
     */
    protected static function callbackAttribs($matches)
    {
        // default: attribute presence   [name]
        $predicate = '[@' . $matches[2] . ']';
        $regex = '/^(?<name>.*?)(?<comparison>=|~=|\|=|\^=|\$=|\*=|!=)[\'"]?(?<value>.*?)[\'"]?$/';
        $matchesInner = array();
        if (\preg_match($regex, $matches[2], $matchesInner)) {
            // NOTE(review): ends-with() is not part of XPath 1.0 - confirm the target processor supports it
            $map = array(
                '!=' => '[@%s!="%s"]',
                '$=' => '[ends-with(@%s, "%s")]',
                '*=' => '[contains(@%s, "%s")]',
                '=' => '[@%s="%s"]',
                '^=' => '[starts-with(@%s, "%s")]',
                '|=' => '[starts-with(concat(@%s, "-"), "%s-")]',
                '~=' => '[contains(concat(" ", @%s, " "), " %s ")]',
            );
            $predicate = \sprintf($map[$matchesInner['comparison']], $matchesInner['name'], $matchesInner['value']);
        }
        return self::stash($predicate, $matches[1]);
    }

    /**
     * Convert a selector without touching the cache or resetting the placeholder store
     *
     * Used for the top-level selector as well as for the arguments of :not() / :has().
     * Nested selectors may contain placeholders created by the enclosing conversion, so their
     * result depends on the current content of self::$strings and must not be cached.
     *
     * @param string $selector CSS selector (may contain "[{n}]" placeholders)
     *
     * @return string
     */
    private static function convert($selector)
    {
        $xpath = self::processRegexs($selector);
        if (\preg_match('/^\//', $xpath) !== 1) {
            $xpath = '//' . $xpath;
        }
        // ":scope" is replaced with "//"; a run of four slashes is dropped again here
        return \preg_replace('#/{4}#', '', $xpath);
    }

    /**
     * Iterate over regular expressions transforming css selector
     *
     * Each entry is array(regex, callback[, limit]).  When a limit is given the replacement is
     * repeated until nothing matches anymore.
     *
     * @param string $cssSelector CSS selector
     *
     * @return string
     */
    private static function processRegexs($cssSelector)
    {
        // leading space lets the "remaining spaces" rule emit the initial "//"
        $xpath = ' ' . $cssSelector;
        foreach (self::regexs() as $regCallback) {
            $limit = isset($regCallback[2])
                ? $regCallback[2]
                : -1;
            if ($limit < 0) {
                $xpath = \preg_replace_callback($regCallback[0], $regCallback[1], $xpath);
                continue;
            }
            $count = 0;
            do {
                $xpath = \preg_replace_callback($regCallback[0], $regCallback[1], $xpath, $limit, $count);
            } while ($count > 0);
        }
        return $xpath;
    }

    /**
     * Return regular expressions to process css selector
     *
     * @return array list of array(regex, callable[, limit])
     */
    private static function regexs()
    {
        /*
            The order in which items are replaced is IMPORTANT!
        */
        return array(
            /*
                First handle attributes and :contains()
                these may contain "," " ", " > ", and other "special" strings

                The negative lookahead skips "[{n}]" placeholders: the argument of :not() / :has()
                is converted after the enclosing selector's attributes were already swapped out
            */
            array('/([\s]?)\[(?!\{\d+\}\])(.*?)\]/', array(\get_called_class(), 'callbackAttribs')),
            // :contains(foo)  // a jquery thing
            array('/:contains\((.*?)\)/', static function ($matches) {
                return self::stash('[contains(text(), "' . $matches[1] . '")]');
            }),
            array('/([\s]?):not\((.*?)\)/', static function ($matches) {
                // works for a simple selector that converts to a single predicate
                //   :not(.classname), :not([attr]), :not(:checked)
                // unsure of other selectors
                $xpathNot = self::convert($matches[2]);
                // unwrap "//*[predicate]" (or "//[predicate]") to the bare predicate
                $xpathNot = \preg_replace('#^//\*?\[(.+)\]#', '$1', $xpathNot);
                return self::stash('[not(' . $xpathNot . ')]', $matches[1]);
            }),
            array('/([\s]?):has\((.*?)\)/', static function ($matches) {
                $xpathHas = self::convert($matches[2]);
                // a leading "//" inside a predicate selects from the document root
                // prefix each union branch with "." to search below the current node instead
                $xpathHas = '.' . \str_replace('|//', '|.//', $xpathHas);
                return self::stash('[count(' . $xpathHas . ') > 0]', $matches[1]);
            }),
            // All blocks of 2 or more spaces
            array('/\s{2,}/', static function () {
                return ' ';
            }),
            // additional selectors (comma separated)
            array('/\s*,\s*/', static function () {
                return '|//';
            }),
            // input pseudo selectors
            // "datetime-local" must come before "datetime" (first matching alternative wins)
            array(
                '/:(text|password|checkbox|radio|reset|file|hidden|image|datetime-local|datetime|date|month|time|week|number|range|email|url|search|tel|color)/',
                static function ($matches) {
                    return '[@type="' . $matches[1] . '"]';
                },
            ),
            array('/([\s]?):button/', static function ($matches) {
                // button or input[@type="button"]
                return self::stash('[self::button or @type="button"]', $matches[1]);
            }),
            array('/([\s]?):input/', static function ($matches) {
                return self::stash('[self::input or self::select or self::textarea or self::button]', $matches[1]);
            }),
            array('/([\s]?):submit/', static function ($matches) {
                // input[type="submit"]   button[@type="submit"]  button[not(@type)]
                return self::stash('[@type="submit" or (self::button and not(@type))]', $matches[1]);
            }),
            array('/:header/', static function () {
                return self::stash('*[self::h1 or self::h2 or self::h3 or self::h4 or self::h5 or self::h6]');
            }),
            array('/:(autofocus|checked|disabled|required|selected)/', static function ($matches) {
                return '[@' . $matches[1] . ']';
            }),
            array('/:autocomplete/', static function () {
                return '[@autocomplete="on"]';
            }),
            // :nth-child(n)
            array('/(\S*):nth-child\((\d+)\)/', static function ($matches) {
                return ($matches[1] !== '' ? $matches[1] : '*')
                    . '[' . $matches[2] . ']';
            }),
            // :nth-last-child(n)
            array('/(\S*):nth-last-child\((\d+)\)/', static function ($matches) {
                return ($matches[1] !== '' ? $matches[1] : '*')
                    . '[position()=(last()-(' . $matches[2] . '-1))]';
            }),
            // :last-child
            array('/(\S*):last-child/', static function ($matches) {
                return ($matches[1] !== '' ? $matches[1] : '*')
                    . '[last()]';
            }),
            // :first-child
            array('/(\S*):first-child/', static function ($matches) {
                return ($matches[1] !== '' ? $matches[1] : '*')
                    . '[1]';
            }),
            // Adjacent "sibling" selectors
            array('/\s*\+\s*([^\s]+)/', static function ($matches) {
                return '/following-sibling::' . $matches[1] . '[1]';
            }),
            // General "sibling" selectors
            array('/\s*~\s*([^\s]+)/', static function ($matches) {
                return '/following-sibling::' . $matches[1];
            }),
            // "child" selectors
            array('/\s*>\s*/', static function () {
                return '/';
            }),
            // Remaining Spaces
            array('/\s/', static function () {
                return '//';
            }),
            // #id  (single character ids such as "#a" are allowed)
            array('/([a-z0-9\]]?)#([a-z][-a-z0-9_]*)/i', static function ($matches) {
                return $matches[1]
                    . ($matches[1] === '' ? '*' : '')
                    . '[@id="' . $matches[2] . '"]';
            }),
            // .className
            // tricky.  without limiting the replacement, the first group will be empty for the 2nd class
            // test case:
            //    foo.classa.classb
            array('/([a-z0-9\]]?)\.(-?[_a-z]+[_a-z0-9-]*)/i', static function ($matches) {
                return $matches[1]
                    . ($matches[1] === '' ? '*' : '')
                    . '[contains(concat(" ", normalize-space(@class), " "), " ' . $matches[2] . ' ")]';
            }, 1),
            array('/:scope/', static function () {
                return '//';
            }),
            // Restore strings
            array('/\[\{(\d+)\}\]/', static function ($matches) {
                return self::$strings[$matches[1]];
            }),
        );
    }

    /**
     * Store an xpath fragment and return the placeholder that stands in for it
     *
     * When the css token was preceded by whitespace (descendant combinator) the fragment applies
     * to "any element": it is stored with a "*" prefix and the whitespace is put back in front of
     * the placeholder so that the "remaining spaces" rule still emits "//" for it.
     *
     * @param string $fragment     xpath fragment to protect
     * @param string $leadingSpace whitespace captured in front of the css token ('' if none)
     *
     * @return string
     */
    private static function stash($fragment, $leadingSpace = '')
    {
        $standsAlone = $leadingSpace !== '';
        self::$strings[] = ($standsAlone ? '*' : '') . $fragment;
        return ($standsAlone ? ' ' : '')
            . '[{' . (\count(self::$strings) - 1) . '}]';
    }
}