# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et:

# Copyright 2014-2015 Florian Bruhin (The Compiler)
#
# This file is part of qutebrowser.
#
# qutebrowser is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# qutebrowser is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with qutebrowser.  If not, see <http://www.gnu.org/licenses/>.

"""pyPEG parsing for the RFC 6266 (Content-Disposition) header."""

import collections
import urllib.parse
import string
import re

import pypeg2 as peg

from qutebrowser.utils import utils


class UniqueNamespace(peg.Namespace):

    """A pyPEG2 namespace in which every key may be assigned only once."""

    def __setitem__(self, key, value):
        """Store value under key; raise DuplicateParamError if key exists."""
        if key not in self:
            super().__setitem__(key, value)
            return
        raise DuplicateParamError(key)


# RFC 2616: code points 0-31 plus 127.
ctl_chars = ''.join(map(chr, list(range(32)) + [127]))


# RFC 5987: ASCII letters, digits and the punctuation listed below.
attr_chars_nonalnum = '!#$&+-.^_`|~'
attr_chars = ''.join((string.ascii_letters, string.digits,
                      attr_chars_nonalnum))


# RFC 5987 gives this alternative construction of the token character class
token_chars = ''.join((attr_chars, "*'%"))


# Definitions from https://tools.ietf.org/html/rfc2616#section-2.2
# token was redefined from attr_chars to avoid using AnyBut,
# which might include non-ascii octets.
token_re = '[{}]+'.format(re.escape(token_chars))


class Token(str):

    """A token (RFC 2616, Section 2.2)."""

    # One or more characters taken from token_chars.
    grammar = re.compile(token_re)


# RFC 2616 says some linear whitespace (LWS) is in fact allowed in text
# and qdtext; however it also mentions folding that whitespace into
# a single SP (which isn't in CTL) before interpretation.
# Assume the caller already that folding when parsing headers. # NOTE: qdtext also allows non-ascii, which we choose to parse # as ISO-8859-1; rejecting it entirely would also be permitted. # Some broken browsers attempt encoding-sniffing, which is broken # because the spec only allows iso, and because encoding-sniffing # can mangle valid values. # Everything else in this grammar (including RFC 5987 ext values) # is in an ascii-safe encoding. qdtext_re = r'[^"{}]'.format(re.escape(ctl_chars)) quoted_pair_re = r'\\[{}]'.format(re.escape( ''.join(chr(i) for i in range(128)))) class QuotedString(str): """A quoted string (RFC 2616, Section 2.2).""" grammar = re.compile(r'"({}|{})+"'.format(quoted_pair_re, qdtext_re)) def __str__(self): s = super().__str__() s = s[1:-1] # remove quotes s = re.sub(r'\\(.)', r'\1', s) # drop backslashes return s class Value(str): """A value. (RFC 2616, Section 3.6).""" grammar = [re.compile(token_re), QuotedString] class Charset(str): """A charset (RFC5987, Section 3.2.1).""" # Other charsets are forbidden, the spec reserves them # for future evolutions. grammar = re.compile('UTF-8|ISO-8859-1', re.I) class Language(str): """A language-tag (RFC 5646, Section 2.1). FIXME: This grammar is not 100% correct yet. https://github.com/The-Compiler/qutebrowser/issues/105 """ grammar = re.compile('[A-Za-z0-9-]+') attr_char_re = '[{}]'.format(re.escape(attr_chars)) hex_digit_re = '%[' + string.hexdigits + ']{2}' class ValueChars(str): """A value of an attribute. FIXME: Can we merge this with Value? 
https://github.com/The-Compiler/qutebrowser/issues/105 """ grammar = re.compile('({}|{})*'.format(attr_char_re, hex_digit_re)) class ExtValue(peg.List): """An ext-value of an attribute (RFC 5987, Section 3.2).""" grammar = peg.contiguous(Charset, "'", peg.optional(Language), "'", ValueChars) class ExtToken(peg.Symbol): """A token introducing an extended value (RFC 6266, Section 4.1).""" regex = re.compile(token_re + r'\*') def __str__(self): return super().__str__().lower() class NoExtToken(peg.Symbol): """A token introducing a normal value (RFC 6266, Section 4.1).""" regex = re.compile(token_re + r'(?