diff --git a/qutebrowser/utils/rfc6266.py b/qutebrowser/utils/rfc6266.py
index c3199fe1b..fece9f8b6 100644
--- a/qutebrowser/utils/rfc6266.py
+++ b/qutebrowser/utils/rfc6266.py
@@ -7,19 +7,199 @@
 import string
 import re
+class UniqueNamespace(peg.Namespace):
+
+    """A pyPEG2 namespace which prevents setting a value twice."""
+
+    def __setitem__(self, key, value):
+        if key in self:
+            raise DuplicateParamError(key)
+        super().__setitem__(key, value)
+
+
+# RFC 2616
+separator_chars = "()<>@,;:\\\"/[]?={} \t"
+ctl_chars = ''.join(chr(i) for i in range(32)) + chr(127)
+nontoken_chars = separator_chars + ctl_chars
+
+
+# RFC 5987
+attr_chars_nonalnum = '!#$&+-.^_`|~'
+attr_chars = string.ascii_letters + string.digits + attr_chars_nonalnum
+
+
+# RFC 5987 gives this alternative construction of the token character class
+token_chars = attr_chars + "*'%"
+
+
+# Definitions from https://tools.ietf.org/html/rfc2616#section-2.2
+# token was redefined from attr_chars to avoid using AnyBut,
+# which might include non-ascii octets.
+token_re = '[{}]+'.format(re.escape(token_chars))
+
+
+class Token(str):
+
+    """A token (RFC 2616, Section 2.2)."""
+
+    grammar = re.compile(token_re)
+
+
+# RFC 2616 says some linear whitespace (LWS) is in fact allowed in text
+# and qdtext; however it also mentions folding that whitespace into
+# a single SP (which isn't in CTL) before interpretation.
+# Assume the caller already did that folding when parsing headers.
+
+# NOTE: qdtext also allows non-ascii, which we choose to parse
+# as ISO-8859-1; rejecting it entirely would also be permitted.
+# Some broken browsers attempt encoding-sniffing, which is broken
+# because the spec only allows iso, and because encoding-sniffing
+# can mangle valid values.
+# Everything else in this grammar (including RFC 5987 ext values)
+# is in an ascii-safe encoding.
+
+qdtext_re = r'[^"{}]'.format(re.escape(ctl_chars))
+quoted_pair_re = r'\\[{}]'.format(re.escape(
+    ''.join(chr(i) for i in range(128))))
+
+
+class QuotedString(str):
+
+    """A quoted string (RFC 2616, Section 2.2)."""
+
+    grammar = re.compile(r'"({}|{})+"'.format(quoted_pair_re, qdtext_re))
+
+    def __str__(self):
+        s = super().__str__()
+        s = s[1:-1]  # remove quotes
+        s = re.sub(r'\\(.)', r'\1', s)  # drop backslashes
+        return s
+
+
+class Value(str):
+
+    """A value (RFC 2616, Section 3.6)."""
+
+    grammar = [re.compile(token_re), QuotedString]
+
+
+class Charset(str):
+
+    """A charset (RFC5987, Section 3.2.1)."""
+
+    # Other charsets are forbidden, the spec reserves them
+    # for future evolutions.
+    grammar = re.compile('UTF-8|ISO-8859-1', re.I)
+
+
+class Language(str):
+
+    """A language-tag (RFC 5646, Section 2.1).
+
+    FIXME: This grammar is not 100% correct yet.
+    """
+
+    grammar = re.compile('[A-Za-z0-9-]+')
+
+
+attr_char_re = '[{}]'.format(re.escape(attr_chars))
+hex_digit_re = '%[' + string.hexdigits + ']{2}'
+
+
+class ValueChars(str):
+
+    """A value of an attribute.
+
+    FIXME: Can we merge this with Value?
+ """ + + grammar = re.compile('({}|{})*'.format(attr_char_re, hex_digit_re)) + + +class ExtValue(peg.List): + + """An ext-value of an attribute (RFC 5987, Section 3.2).""" + + grammar = peg.contiguous(Charset, "'", peg.optional(Language), "'", + ValueChars) + +class ExtToken(peg.Symbol): + + """A token introducing an extended value (RFC 6266, Section 4.1).""" + + regex = re.compile(token_re + r'\*') + + def __str__(self): + return super().__str__().lower() + + +class NoExtToken(peg.Symbol): + + """A token introducing a normal value (RFC 6266, Section 4.1).""" + + regex = re.compile(token_re + r'(?@,;:\\\"/[]?={} \t" -ctl_chars = ''.join(chr(i) for i in range(32)) + chr(127) -nontoken_chars = separator_chars + ctl_chars - -# RFC 5987 -attr_chars_nonalnum = '!#$&+-.^_`|~' -attr_chars = string.ascii_letters + string.digits + attr_chars_nonalnum - -# RFC 5987 gives this alternative construction of the token character class -token_chars = attr_chars + "*'%" - - -# Definitions from https://tools.ietf.org/html/rfc2616#section-2.2 -# token was redefined from attr_chars to avoid using AnyBut, -# which might include non-ascii octets. -token_re = '[{}]+'.format(re.escape(token_chars)) -class Token(str): - grammar = re.compile(token_re) - - -# RFC 2616 says some linear whitespace (LWS) is in fact allowed in text -# and qdtext; however it also mentions folding that whitespace into -# a single SP (which isn't in CTL) before interpretation. -# Assume the caller already that folding when parsing headers. - -# NOTE: qdtext also allows non-ascii, which we choose to parse -# as ISO-8859-1; rejecting it entirely would also be permitted. -# Some broken browsers attempt encoding-sniffing, which is broken -# because the spec only allows iso, and because encoding-sniffing -# can mangle valid values. -# Everything else in this grammar (including RFC 5987 ext values) -# is in an ascii-safe encoding. -# Because of this, this is the only character class to use AnyBut, -# and all the others are defined with Any. - -qdtext_re = r'[^"{}]'.format(re.escape(ctl_chars)) -quoted_pair_re = r'\\[{}]'.format(re.escape(''.join(chr(i) for i in range(128)))) - -class QuotedString(str): - grammar = re.compile(r'"({}|{})+"'.format(quoted_pair_re, qdtext_re)) - - def __str__(self): - s = super().__str__() - s = s[1:-1] - s = re.sub(r'\\(.)', r'\1', s) - return s - - -class Value(str): - grammar = [re.compile(token_re), QuotedString] - -# Other charsets are forbidden, the spec reserves them -# for future evolutions. -class Charset(str): - grammar = re.compile('UTF-8|ISO-8859-1', re.I) - -class Language(str): - # XXX See RFC 5646 for the correct definition - grammar = re.compile('[A-Za-z0-9-]+') - -attr_char_re = '[{}]'.format(re.escape(attr_chars)) -hex_digit_re = '%[' + string.hexdigits + ']{2}' - -class ValueChars(str): - grammar = re.compile('({}|{})*'.format(attr_char_re, hex_digit_re)) - -class ExtValue(peg.List): - grammar = peg.contiguous(Charset, "'", peg.optional(Language), "'", ValueChars) - -class ExtToken(peg.Symbol): - regex = re.compile(token_re + r'\*') - - def __str__(self): - return super().__str__().lower() - -class NoExtToken(peg.Symbol): - regex = re.compile(token_re + r'(?