sparklemotion/nokogiri

View on GitHub
lib/nokogiri/css/xpath_visitor.rb

Summary

Maintainability
D
1 day
Test Coverage
A
93%
# coding: utf-8
# frozen_string_literal: true

module Nokogiri
  module CSS
    # When translating CSS selectors to XPath queries with Nokogiri::CSS.xpath_for, the XPathVisitor
    # class allows for changing some of the behaviors related to builtin xpath functions and quirks
    # of HTML5.
    class XPathVisitor
      # True when Nokogiri.libxml2_patches lists the "allow wildcard namespaces" patch. When set,
      # visit_element_name may emit the "*:name" form for HTML5 element names.
      WILDCARD_NAMESPACES = Nokogiri.libxml2_patches.include?("0009-allow-wildcard-namespaces.patch") # :nodoc:

      # Enum to direct XPathVisitor when to use Nokogiri builtin XPath functions.
      module BuiltinsConfig
        # Never use Nokogiri builtin functions, always generate vanilla XPath 1.0 queries. This is
        # the default when calling Nokogiri::CSS.xpath_for directly.
        NEVER = :never

        # Always use Nokogiri builtin functions whenever possible. This is probably only useful for testing.
        ALWAYS = :always

        # Only use Nokogiri builtin functions when they will be faster than vanilla XPath. This is
        # the behavior chosen when searching for CSS selectors on a Nokogiri document, fragment, or
        # node.
        OPTIMAL = :optimal

        # :nodoc: array of values for validation; frozen so the accepted set cannot be altered at
        # runtime by code that gets hold of the constant.
        VALUES = [NEVER, ALWAYS, OPTIMAL].freeze
      end

      # Enum to direct XPathVisitor when to tweak the XPath query to suit the nature of the document
      # being searched. Note that searches for CSS selectors from a Nokogiri document, fragment, or
      # node will choose the correct option automatically.
      module DoctypeConfig
        # The document being searched is an XML document. This is the default.
        XML = :xml

        # The document being searched is an HTML4 document.
        HTML4 = :html4

        # The document being searched is an HTML5 document.
        HTML5 = :html5

        # :nodoc: array of values for validation; frozen so the accepted set cannot be altered at
        # runtime by code that gets hold of the constant.
        VALUES = [XML, HTML4, HTML5].freeze
      end

      # The visitor configuration set via the +builtins:+ keyword argument to XPathVisitor.new.
      # Always one of BuiltinsConfig::VALUES, because the constructor rejects anything else.
      attr_reader :builtins

      # The visitor configuration set via the +doctype:+ keyword argument to XPathVisitor.new.
      # Always one of DoctypeConfig::VALUES, because the constructor rejects anything else.
      attr_reader :doctype

      # The visitor configuration set via the +prefix:+ keyword argument to XPathVisitor.new.
      attr_reader :prefix

      # The visitor configuration set via the +namespaces:+ keyword argument to XPathVisitor.new.
      # May be +nil+, which is the constructor's default.
      attr_reader :namespaces

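      # :call-seq:
      #   new() → XPathVisitor
      #   new(builtins:, doctype:) → XPathVisitor
      #   new(builtins:, doctype:, prefix:, namespaces:) → XPathVisitor
      #
      # [Parameters]
      # - +builtins:+ (BuiltinsConfig) Determine when to use Nokogiri's built-in xpath functions for
      #   performance improvements. Defaults to BuiltinsConfig::NEVER.
      # - +doctype:+ (DoctypeConfig) Make document-type-specific accommodations for CSS queries.
      #   Defaults to DoctypeConfig::XML.
      # - +prefix:+ The XPath search prefix recorded in the visitor's configuration. Defaults to
      #   Nokogiri::XML::XPath::GLOBAL_SEARCH_PREFIX.
      # - +namespaces:+ Namespace declarations for the query, or +nil+ (the default). When it has
      #   an "xmlns" key, unprefixed element names are emitted with the "xmlns:" prefix.
      #
      # [Raises] ArgumentError if +builtins:+ or +doctype:+ is not one of the supported values.
      #
      # [Returns] XPathVisitor
      #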
      def initialize(
        builtins: BuiltinsConfig::NEVER,
        doctype: DoctypeConfig::XML,
        prefix: Nokogiri::XML::XPath::GLOBAL_SEARCH_PREFIX,
        namespaces: nil
      )
        # Both enum-style options are checked before any state is stored, builtins first.
        valid_builtins = BuiltinsConfig::VALUES.include?(builtins)
        raise(ArgumentError, "Invalid values #{builtins.inspect} for builtins: keyword parameter") unless valid_builtins

        valid_doctype = DoctypeConfig::VALUES.include?(doctype)
        raise(ArgumentError, "Invalid values #{doctype.inspect} for doctype: keyword parameter") unless valid_doctype

        # prefix and namespaces are stored as given, without validation.
        @builtins, @doctype = builtins, doctype
        @prefix, @namespaces = prefix, namespaces
      end

      # :call-seq: config() → Hash
      #
      # [Returns]
      #   a Hash representing the configuration of the XPathVisitor, suitable for use as
      #   part of the CSS cache key.
      def config
        # Built from the instance variables directly, one entry per constructor keyword.
        {
          builtins: @builtins,
          doctype: @doctype,
          prefix: @prefix,
          namespaces: @namespaces,
        }
      end

      # :stopdoc:
      # Translate a function-style CSS pseudo-class (for example "nth-child(2n+1)") to XPath.
      #
      # node.value.first is the function name including its opening parenthesis (e.g. "eq(");
      # the entries after it are the arguments. If this visitor responds to
      # visit_function_<name>, that method is given the node instead. Names not listed below are
      # emitted as calls in the "nokogiri:" XPath function namespace, with "." as first argument.
      def visit_function(node)
        function = node.value.first
        handler = :"visit_function_#{function.gsub(/[(]/, "")}"
        return send(handler, node) if respond_to?(handler)

        argument = node.value[1]

        case function
        when /^text\(/ then "child::text()"
        when /^self\(/ then "self::#{argument}"
        when /^eq\(/ then "position()=#{argument}"
        when /^(nth|nth-of-type)\(/
          # an NTH node carries an "an+b" expression; anything else is used as a plain index
          if argument.is_a?(Nokogiri::CSS::Node) && (argument.type == :NTH)
            nth(argument)
          else
            "position()=#{argument}"
          end
        when /^nth-child\(/
          if argument.is_a?(Nokogiri::CSS::Node) && (argument.type == :NTH)
            nth(argument, child: true)
          else
            "count(preceding-sibling::*)=#{argument.to_i - 1}"
          end
        when /^nth-last-of-type\(/
          if argument.is_a?(Nokogiri::CSS::Node) && (argument.type == :NTH)
            nth(argument, last: true)
          else
            offset = argument.to_i - 1
            offset.zero? ? "position()=last()" : "position()=last()-#{offset}"
          end
        when /^nth-last-child\(/
          if argument.is_a?(Nokogiri::CSS::Node) && (argument.type == :NTH)
            nth(argument, last: true, child: true)
          else
            "count(following-sibling::*)=#{argument.to_i - 1}"
          end
        when /^(first|first-of-type)\(/ then "position()=1"
        when /^(last|last-of-type)\(/ then "position()=last()"
        when /^contains\(/ then "contains(.,#{argument})"
        when /^gt\(/ then "position()>#{argument}"
        when /^only-child\(/ then "last()=1"
        when /^comment\(/ then "comment()"
        when /^has\(/
          # a nil first value marks a leading combinator, e.g. "has(> a)", "has(~ a)", "has(+ a)"
          descendant_axis = argument.value[0].nil? ? "" : "//"
          ".#{descendant_axis}#{argument.accept(self)}"
        else
          validate_xpath_function_name(function)

          # xpath function call: node arguments are translated, everything else is passed verbatim
          translated = node.value.drop(1).map do |arg|
            arg.is_a?(Nokogiri::CSS::Node) ? arg.accept(self) : arg
          end
          "nokogiri:#{function}#{[".", *translated].join(",")})"
        end
      end

      # Translate a negation node: wraps the translated child in not(...). A bare element name is
      # first anchored with "self::" so it tests the context node.
      def visit_not(node)
        negated = node.value.first
        axis = negated.type == :ELEMENT_NAME ? "self::" : ""
        "not(#{axis}#{negated.accept(self)})"
      end

      # Translate an ID selector token such as "#main" into an @id comparison. The text after
      # the leading "#" is used verbatim; a token that does not match yields an empty id.
      def visit_id(node)
        matched = /^#(.*)$/.match(node.value.first)
        "@id='#{matched && matched[1]}'"
      end

      # Translate an attribute selector ("[href]", "[href^='x']", ...) into an XPath predicate.
      #
      # node.value is either [attribute] or [attribute, operator, value]. The attribute is
      # translated by visiting it; the value is quoted if it does not already start with a quote.
      def visit_attribute_condition(node)
        attribute = node.value.first.accept(self)
        return attribute if node.value.length == 1

        operator = node.value[1]
        value = node.value.last
        value = "'#{value}'" unless /^['"]/.match?(value)

        # quoted values - see test_attribute_value_with_quotes in test/css/test_parser.rb
        # When the quoted text itself contains its own delimiter, rebuild it as a concat() of
        # double-quoted pieces with each literal double quote passed as '"'.
        quote = value[0]
        if (quote == value[-1]) && %q{"'}.include?(quote) && value[1..-2].include?(quote)
          pieces = value[1..-2].split('"', -1)
          value = 'concat("' + pieces.join(%q{",'"',"}) + '","")'
        end

        case operator
        when :equal then attribute + "=#{value}"
        when :not_equal then attribute + "!=#{value}"
        when :substring_match then "contains(#{attribute},#{value})"
        when :prefix_match then "starts-with(#{attribute},#{value})"
        when :dash_match
          "#{attribute}=#{value} or starts-with(#{attribute},concat(#{value},'-'))"
        when :includes
          css_class(attribute, value[1..-2]) # strip quotes
        when :suffix_match
          value_length = "string-length(#{value})"
          "substring(#{attribute},string-length(#{attribute})-#{value_length}+1,#{value_length})=#{value}"
        else
          # unknown operators are emitted between the attribute and the value as-is
          attribute + " #{operator} #{value}"
        end
      end

      # Translate a pseudo-class (":first-child", ":empty", ...) into an XPath predicate.
      #
      # A nested FUNCTION node is translated by visiting it. Otherwise, if this visitor responds
      # to visit_pseudo_class_<name>, that method is given the node. Names not listed below are
      # emitted as "nokogiri:<name>(.)".
      def visit_pseudo_class(node)
        pseudo = node.value.first
        return pseudo.accept(self) if pseudo.is_a?(Nokogiri::CSS::Node) && (pseudo.type == :FUNCTION)

        handler = :"visit_pseudo_class_#{pseudo.gsub(/[(]/, "")}"
        return send(handler, node) if respond_to?(handler)

        case pseudo
        when "first", "first-of-type" then "position()=1"
        when "last", "last-of-type" then "position()=last()"
        when "first-child" then "count(preceding-sibling::*)=0"
        when "last-child" then "count(following-sibling::*)=0"
        when "only-child" then "count(preceding-sibling::*)=0 and count(following-sibling::*)=0"
        when "only-of-type" then "last()=1"
        when "empty" then "not(node())"
        when "parent" then "node()"
        when "root" then "not(parent::*)"
        else
          validate_xpath_function_name(pseudo)
          "nokogiri:#{pseudo}(.)"
        end
      end

      # Translate a class selector (".foo") by matching the class name against the @class attribute.
      def visit_class_condition(node)
        class_name = node.value.first
        css_class("@class", class_name)
      end

      # Join two translated conditions. An "*-of-type" pseudo-class on the right closes the
      # current predicate and opens a new one ("]["); any other pair is joined with " and ".
      # A nil left-hand side contributes an empty string.
      def visit_combinator(node)
        left = node.value.first
        right = node.value.last
        joiner = is_of_type_pseudo_class?(right) ? "][" : " and "
        "#{left&.accept(self)}#{joiner}#{right.accept(self)}"
      end

      # Define one visit_<selector> method per CSS combinator. Each generated method returns the
      # translated left-hand side (omitted when it is nil), the XPath separator for that
      # combinator, and the translated right-hand side.
      {
        "direct_adjacent_selector" => "/following-sibling::*[1]/self::",
        "following_selector" => "/following-sibling::",
        "descendant_selector" => "//",
        "child_selector" => "/",
      }.each do |selector, xpath_separator|
        class_eval <<~RUBY, __FILE__, __LINE__ + 1
          def visit_#{selector}(node)
            left = node.value.first
            "\#{left.accept(self) if left}#{xpath_separator}\#{node.value.last.accept(self)}"
          end
        RUBY
      end

      # Translate "selector[condition]": the first child is the selector, the last child is the
      # condition placed inside an XPath predicate.
      def visit_conditional_selector(node)
        selector = node.value.first.accept(self)
        condition = node.value.last.accept(self)
        selector + "[" + condition + "]"
      end

      # Translate an element name, taking the doctype, builtins and namespaces configuration
      # into account. node.value is [name] or [prefix, name].
      def visit_element_name(node)
        name = node.value.first

        if @doctype == DoctypeConfig::HTML5 && html5_element_name_needs_namespace_handling(node)
          # HTML5 has namespaces that should be ignored in CSS queries
          # https://github.com/sparklemotion/nokogiri/issues/2376
          use_builtins = @builtins == BuiltinsConfig::ALWAYS ||
            (@builtins == BuiltinsConfig::OPTIMAL && Nokogiri.uses_libxml?)
          return "*[local-name()='#{name}']" unless use_builtins
          return "*:#{name}" if WILDCARD_NAMESPACES

          return "*[nokogiri-builtin:local-name-is('#{name}')]"
        end

        if node.value.length == 2 # has a namespace prefix
          # a nil prefix means the prefix is empty: emit only the local name
          name.nil? ? node.value.last : node.value.join(":")
        elsif @namespaces&.key?("xmlns") # apply the default namespace if it's declared
          "xmlns:#{name}"
        else
          name
        end
      end

      # Translate an attribute name into the XPath attribute step "@name".
      def visit_attrib_name(node)
        attribute_name = node.value.first
        "@#{attribute_name}"
      end

      # Hand this visitor to the node's own #accept method and return whatever that call returns.
      def accept(node)
        node.accept(self)
      end

      private

      # Reject names that start with "-" before they are emitted as "nokogiri:" function calls.
      #
      # Returns nil for an accepted name.
      # Raises Nokogiri::CSS::SyntaxError when the name starts with "-".
      def validate_xpath_function_name(name)
        return unless name.start_with?("-")

        raise Nokogiri::CSS::SyntaxError, "Invalid XPath function name '#{name}'"
      end

      # Decide whether an element name needs the HTML5 namespace workaround.
      #
      # Returns true only for an unprefixed name other than the wildcard "*".
      def html5_element_name_needs_namespace_handling(node)
        name_parts = node.value
        # if there is already a namespace (i.e., it is a prefixed QName), use it as normal
        return false unless name_parts.length == 1

        # if this is the wildcard selector "*", use it as normal
        name_parts.first != "*"
      end

      # Build the XPath predicate for a CSS "an+b" expression.
      #
      # node    - a node whose value holds exactly four tokens; the first is the coefficient a,
      #           the third is the operator ("+" or "-") and the fourth is the offset b.
      # options - Hash of flags:
      #           :child - count sibling elements instead of using position()
      #           :last  - count from the end instead of from the start
      #
      # Returns the predicate as a String.
      # Raises ArgumentError if the node does not hold four tokens or the operator is invalid.
      def nth(node, options = {})
        unless node.value.size == 4
          raise(ArgumentError, "expected an+b node to contain 4 tokens, but is #{node.value.inspect}")
        end

        a, b = read_a_and_positive_b(node.value)
        position = if options[:child]
          options[:last] ? "(count(following-sibling::*)+1)" : "(count(preceding-sibling::*)+1)"
        else
          options[:last] ? "(last()-position()+1)" : "position()"
        end

        # With a zero coefficient, "an+b" collapses to the single index b. The general forms below
        # would emit "mod 0", which evaluates to NaN in XPath 1.0 and therefore never matches.
        return "#{position}=#{b}" if a.zero?

        if b.zero?
          "(#{position} mod #{a})=0"
        else
          # a negative coefficient counts downwards from b, a positive one upwards
          compare = a < 0 ? "<=" : ">="
          if a.abs == 1
            "#{position}#{compare}#{b}"
          else
            "(#{position}#{compare}#{b}) and (((#{position}-#{b}) mod #{a.abs})=0)"
          end
        end
      end

      # Read the coefficient and offset from the four tokens of an "an+b" expression.
      #
      # For "an-b" the offset is rewritten as the equivalent value to add, a - (b mod a). When a
      # is zero that rewrite would divide by zero, so the offset is simply negated.
      #
      # Returns [a, b] as Integers.
      # Raises ArgumentError if the operator token is neither "+" nor "-".
      def read_a_and_positive_b(values)
        op = values[2].strip
        unless ["+", "-"].include?(op)
          raise ArgumentError, "expected an+b node to have either + or - as the operator, but is #{op.inspect}"
        end

        a = values[0].to_i
        offset = values[3].to_i
        return [a, offset] if op == "+"

        b = a.zero? ? -offset : a - (offset % a)
        [a, b]
      end

      # Tell whether a node is one of the "*-of-type" pseudo-classes (nth-, first-, last-, only-),
      # given either by name or as a nested FUNCTION node.
      #
      # Returns the match index (truthy) when it is, nil otherwise.
      def is_of_type_pseudo_class?(node) # rubocop:disable Naming/PredicateName
        return unless node.type == :PSEUDO_CLASS

        name = node.value[0]
        # for a function-style pseudo-class the name is the first value of the nested node
        name = name.value[0] if name.is_a?(Nokogiri::CSS::Node) && (name.type == :FUNCTION)
        name =~ /(nth|first|last|only)-of-type(\()?/
      end

      # Build an XPath test that the space-separated list in +hay+ contains the token +needle+.
      #
      # The builtin function is used when builtins are configured as ALWAYS, or as OPTIMAL while
      # Nokogiri.uses_libxml? is true; otherwise only ordinary xpath functions are used.
      def css_class(hay, needle)
        use_builtin = @builtins == BuiltinsConfig::ALWAYS ||
          (@builtins == BuiltinsConfig::OPTIMAL && Nokogiri.uses_libxml?)
        return "nokogiri-builtin:css-class(#{hay},'#{needle}')" if use_builtin

        "contains(concat(' ',normalize-space(#{hay}),' '),' #{needle} ')"
      end
    end
  end
end