# authlib/common/urls.py
"""
authlib.common.urls
~~~~~~~~~~~~~~~~~~~
Wrapper functions for URL encoding and decoding.
"""
import re
from urllib.parse import quote as _quote
from urllib.parse import unquote as _unquote
from urllib.parse import urlencode as _urlencode
import urllib.parse as urlparse
from .encoding import to_unicode, to_bytes
# Characters that never need percent-encoding.
always_safe = (
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    'abcdefghijklmnopqrstuvwxyz'
    '0123456789_.-'
)
# Every character allowed to appear in an x-www-form-urlencoded string:
# the always-safe characters plus delimiters and the '%' escape marker.
urlencoded = set(always_safe) | set('=&;:%+~,*@!()/?')
# Matches any '%' that is not followed by two hex digits. The negative
# lookahead also catches an escape truncated by the end of the string
# (e.g. 'a=%' or 'a=%4'), which a pattern that requires a trailing
# non-hex character would let through.
INVALID_HEX_PATTERN = re.compile(r'%(?![0-9A-Fa-f]{2})')
def url_encode(params):
    """Serialize key/value pairs into a urlencoded query string.

    Every key and value is passed through ``to_bytes`` before the pairs are
    handed to ``urllib.parse.urlencode``; the encoded result is then passed
    through ``to_unicode``.

    :param params: iterable of two-element ``(key, value)`` pairs.
    :return: the urlencoded string, as returned by ``to_unicode``.
    """
    byte_pairs = [(to_bytes(key), to_bytes(value)) for key, value in params]
    return to_unicode(_urlencode(byte_pairs))
def url_decode(query):
    """Decode an x-www-form-urlencoded query string into a list of 2-tuples.

    Unlike ``urllib.parse.parse_qsl(..., strict_parsing=True)``, this
    function validates the query string itself before parsing, while still
    accepting value-less fields such as ``"c2"`` that strict parsing
    rejects.

    :param query: the query string to decode. ``None`` or an empty value
        yields an empty list.
    :return: list of ``(name, value)`` tuples, each element passed through
        ``to_unicode``. Blank values are kept.
    :raises ValueError: if ``query`` contains a character outside the
        ``urlencoded`` set, or a percent-escape matched by
        ``INVALID_HEX_PATTERN``.
    """
    # Nothing to validate or parse. Returning early also keeps None away
    # from the regex search below, which would raise TypeError.
    if not query:
        return []

    # Check if query contains invalid characters
    invalid_chars = set(query) - urlencoded
    if invalid_chars:
        error = ("Error trying to decode a non urlencoded string. "
                 "Found invalid characters: %s "
                 "in the string: '%s'. "
                 "Please ensure the request/response body is "
                 "x-www-form-urlencoded.")
        raise ValueError(error % (invalid_chars, query))

    # Check for correctly hex encoded values using a regular expression
    # All encoded values begin with % followed by two hex characters
    # correct = %00, %A0, %0A, %FF
    # invalid = %G0, %5H, %PO
    if INVALID_HEX_PATTERN.search(query):
        raise ValueError('Invalid hex encoding in query string.')

    # strict_parsing is deliberately left off: we want to allow queries such
    # as "c2", which parse_qsl would reject in strict mode.
    params = urlparse.parse_qsl(query, keep_blank_values=True)

    # unicode all the things
    return [(to_unicode(k), to_unicode(v)) for k, v in params]
def add_params_to_qs(query, params):
    """Append extra parameters to an existing query string.

    :param query: query string to extend; it is parsed with blank values
        kept.
    :param params: a dict, or an iterable of ``(key, value)`` two-tuples.
    :return: the original and extra parameters, re-encoded by
        ``url_encode``.
    """
    extra = params.items() if isinstance(params, dict) else params
    pairs = urlparse.parse_qsl(query, keep_blank_values=True)
    pairs.extend(extra)
    return url_encode(pairs)
def add_params_to_uri(uri, params, fragment=False):
    """Add parameters to the query (or fragment) component of a URI.

    :param uri: the URI to extend.
    :param params: a dict, or an iterable of ``(key, value)`` two-tuples.
    :param fragment: when true, the parameters are merged into the fragment
        component instead of the query component.
    :return: the URI reassembled with the extended component.
    """
    scheme, netloc, path, path_params, query, frag = urlparse.urlparse(uri)
    if not fragment:
        query = add_params_to_qs(query, params)
    else:
        frag = add_params_to_qs(frag, params)
    components = (scheme, netloc, path, path_params, query, frag)
    return urlparse.urlunparse(components)
def quote(s, safe=b'/'):
    """Percent-encode ``s``.

    ``s`` is passed through ``to_bytes``, quoted with
    ``urllib.parse.quote``, and the result is passed through ``to_unicode``.

    :param s: the value to quote.
    :param safe: characters that must not be percent-encoded.
    :return: the quoted value, as returned by ``to_unicode``.
    """
    raw = to_bytes(s)
    escaped = _quote(raw, safe)
    return to_unicode(escaped)
def unquote(s):
    """Reverse percent-encoding in ``s``.

    :param s: the value to unquote with ``urllib.parse.unquote``.
    :return: the unquoted value, passed through ``to_unicode``.
    """
    decoded = _unquote(s)
    return to_unicode(decoded)
def quote_url(s):
    """Percent-encode ``s`` while leaving common URL punctuation untouched.

    Delegates to ``quote`` with a wider set of safe characters.

    :param s: the URL (or URL part) to quote.
    :return: the quoted value.
    """
    url_safe = b'~@#$&()*!+=:;,.?/\''
    return quote(s, url_safe)
def extract_params(raw):
    """Extract parameters and return them as a list of 2-tuples.

    Accepts a urlencoded query string, a dict, or a list/tuple of 2-tuples:

    * dict, or list/tuple of pairs: returns a list of ``(key, value)``
      tuples with each element passed through ``to_unicode``. An empty
      dict/list/tuple yields ``[]``. Sequences are converted with ``dict()``
      first, so a repeated key keeps only its last value.
    * non-empty string: returns the result of ``url_decode``, or ``None``
      when the string fails validation.
    * anything else (empty string, ``None``, numbers, bytes, a sequence
      that cannot be turned into a dict): returns ``None``.

    :param raw: the value to extract parameters from.
    :return: list of 2-tuples, or ``None`` if nothing could be extracted.
    """
    if isinstance(raw, (list, tuple)):
        try:
            raw = dict(raw)
        except (TypeError, ValueError):
            return None

    if isinstance(raw, dict):
        return [(to_unicode(k), to_unicode(v)) for k, v in raw.items()]

    # Only a non-empty text string can be a query string. Rejecting other
    # types here honours the "return None" contract instead of letting
    # url_decode fail with an uncaught TypeError on e.g. an int.
    if not raw or not isinstance(raw, str):
        return None

    try:
        return url_decode(raw)
    except ValueError:
        return None
def is_valid_url(url):
    """Return ``True`` if ``url`` parses with both a scheme and a hostname.

    :param url: the URL to check.
    :return: ``True`` when both a scheme and a hostname are present,
        ``False`` otherwise. The result is always a real ``bool``, never the
        hostname string or ``None``.
    """
    try:
        parsed = urlparse.urlparse(url)
        return bool(parsed.scheme and parsed.hostname)
    except ValueError:
        # urlparse rejects some malformed input outright (for example an
        # unbalanced '[' in the network location); such a URL is not valid.
        return False