emory-libraries/eulxml

View on GitHub
eulxml/xpath/lexrules.py

Summary

Maintainability
A
0 mins
Test Coverage
# file eulxml/xpath/lexrules.py
#
#   Copyright 2010,2011 Emory University Libraries
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

"""XPath lexing rules.

To understand how this module works, it is valuable to have a strong
understanding of the `ply <http://www.dabeaz.com/ply/>` module.
"""

from __future__ import unicode_literals

operator_names = {
    'or': 'OR_OP',
    'and': 'AND_OP',
    'div': 'DIV_OP',
    'mod': 'MOD_OP',
}

tokens = [
        'PATH_SEP',
        'ABBREV_PATH_SEP',
        'ABBREV_STEP_SELF',
        'ABBREV_STEP_PARENT',
        'AXIS_SEP',
        'ABBREV_AXIS_AT',
        'OPEN_PAREN',
        'CLOSE_PAREN',
        'OPEN_BRACKET',
        'CLOSE_BRACKET',
        'UNION_OP',
        'EQUAL_OP',
        'REL_OP',
        'PLUS_OP',
        'MINUS_OP',
        'MULT_OP',
        'STAR_OP',
        'COMMA',
        'LITERAL',
        'FLOAT',
        'INTEGER',
        'NCNAME',
        'NODETYPE',
        'FUNCNAME',
        'AXISNAME',
        'COLON',
        'DOLLAR',
    ] + list(operator_names.values())

t_PATH_SEP = r'/'
t_ABBREV_PATH_SEP = r'//'
t_ABBREV_STEP_SELF = r'\.'
t_ABBREV_STEP_PARENT = r'\.\.'
t_AXIS_SEP = r'::'
t_ABBREV_AXIS_AT = r'@'
t_OPEN_PAREN = r'\('
t_CLOSE_PAREN = r'\)'
t_OPEN_BRACKET = r'\['
t_CLOSE_BRACKET = r'\]'
t_UNION_OP = r'\|'
t_EQUAL_OP = r'!?='
t_REL_OP = r'[<>]=?'
t_PLUS_OP = r'\+'
t_MINUS_OP = r'-'
t_COMMA = r','
t_COLON = r':'
t_DOLLAR = r'\$'
t_STAR_OP = r'\*'

t_ignore = ' \t\r\n'

# NOTE: some versions of python cannot compile regular expressions that
# contain unicode characters above U+FFFF, which are allowable in NCNames.
# These characters can be used in Python 2.6.4, but can NOT be used in 2.6.2
# (status in 2.6.3 is unknown).  The code below accounts for that and excludes
# the higher character range if Python can't handle it.

# Monster regex derived from:
#  http://www.w3.org/TR/REC-xml/#NT-NameStartChar
#  http://www.w3.org/TR/REC-xml/#NT-NameChar
# EXCEPT:
# Technically those productions allow ':'. NCName, on the other hand:
#  http://www.w3.org/TR/REC-xml-names/#NT-NCName
# explicitly excludes those names that have ':'. We implement this by
# simply removing ':' from our regexes.

# NameStartChar regex without characters about U+FFFF
NameStartChar = r'[A-Z]|_|[a-z]|\xc0-\xd6]|[\xd8-\xf6]|[\xf8-\u02ff]|' + \
    r'[\u0370-\u037d]|[\u037f-\u1fff]|[\u200c-\u200d]|[\u2070-\u218f]|' + \
    r'[\u2c00-\u2fef]|[\u3001-\uD7FF]|[\uF900-\uFDCF]|[\uFDF0-\uFFFD]'
# complete NameStartChar regex
Full_NameStartChar = r'(' + NameStartChar + r'|[\U00010000-\U000EFFFF]' + r')'
# additional characters allowed in NCNames after the first character
NameChar_extras = r'[-.0-9\xb7\u0300-\u036f\u203f-\u2040]'

try:
    import re
    # test whether or not re can compile unicode characters above U+FFFF
    re.compile(r'[\U00010000-\U00010001]')
    # if that worked, then use the full ncname regex
    NameStartChar = Full_NameStartChar
except:
    # if compilation failed, leave NameStartChar regex as is, which does not
    # include the unicode character ranges above U+FFFF
    pass

NCNAME_REGEX = r'(' + NameStartChar + r')(' + \
                      NameStartChar + r'|' + NameChar_extras + r')*'

NODE_TYPES = set(['comment', 'text', 'processing-instruction', 'node'])

t_NCNAME = NCNAME_REGEX

def t_LITERAL(t):
    r""""[^"]*"|'[^']*'"""
    t.value = t.value[1:-1]
    return t

def t_FLOAT(t):
    r'\d+\.\d*|\.\d+'
    t.value = float(t.value)
    return t

def t_INTEGER(t):
    r'\d+'
    t.value = int(t.value)
    return t

def t_error(t):
    raise TypeError("Unknown text '%s'" % (t.value,))