From e15999036319ad1c610733d8e7eab7e44c024339 Mon Sep 17 00:00:00 2001 From: Florian Bruhin Date: Wed, 13 Aug 2014 20:47:54 +0200 Subject: [PATCH] utils.rfc6266: Initial implementation. --- .../utils/http/test_content_disposition.py | 2 +- qutebrowser/utils/http.py | 2 +- qutebrowser/utils/rfc6266.py | 285 ++++++++++++++++++ 3 files changed, 287 insertions(+), 2 deletions(-) create mode 100644 qutebrowser/utils/rfc6266.py diff --git a/qutebrowser/test/utils/http/test_content_disposition.py b/qutebrowser/test/utils/http/test_content_disposition.py index 0652360b2..48fd9cc65 100644 --- a/qutebrowser/test/utils/http/test_content_disposition.py +++ b/qutebrowser/test/utils/http/test_content_disposition.py @@ -152,7 +152,7 @@ class AttachmentTests(AttachmentTestCase): """'ATTACHMENT' only UA should offer to download the resource. - """ + """ self._check_unnamed('ATTACHMENT') def test_attwithasciifilename(self): diff --git a/qutebrowser/utils/http.py b/qutebrowser/utils/http.py index 91a6c9173..47ad87fd6 100644 --- a/qutebrowser/utils/http.py +++ b/qutebrowser/utils/http.py @@ -21,7 +21,7 @@ import os.path -import rfc6266 +import qutebrowser.utils.rfc6266 as rfc6266 from qutebrowser.utils.log import misc as logger from PyQt5.QtNetwork import QNetworkRequest diff --git a/qutebrowser/utils/rfc6266.py b/qutebrowser/utils/rfc6266.py new file mode 100644 index 000000000..75aeb5e32 --- /dev/null +++ b/qutebrowser/utils/rfc6266.py @@ -0,0 +1,285 @@ +import pypeg2 as peg + +import os.path +from collections import namedtuple +import urllib.parse +import string +import re + + +LangTagged = namedtuple('LangTagged', 'string langtag') + + +class ContentDisposition: + """ + Records various indications and hints about content disposition. + + These can be used to know if a file should be downloaded or + displayed directly, and to hint what filename it should have + in the download case. + """ + + def __init__(self, disposition='inline', assocs=None, location=None): + """This constructor is used internally after parsing the header. + + Instances should generally be created from a factory + function, such as parse_headers and its variants. + """ + if len(disposition) != 1: + self.disposition = 'inline' + else: + self.disposition = disposition[0] + self.location = location + if assocs is None: + self.assocs = {} + else: + # XXX Check that parameters aren't repeated + self.assocs = assocs + + @property + def filename_unsafe(self): + """The filename from the Content-Disposition header. + + If a location was passed at instanciation, the basename + from that may be used as a fallback. Otherwise, this may + be the None value. + + On safety: + This property records the intent of the sender. + + You shouldn't use this sender-controlled value as a filesystem + path, it can be insecure. Serving files with this filename can be + dangerous as well, due to a certain browser using the part after the + dot for mime-sniffing. + Saving it to a database is fine by itself though. + """ + + if 'filename*' in self.assocs: + val = self.assocs['filename*'] + assert isinstance(val, ExtDispositionParm) + return parse_ext_value(val[0]).string + elif 'filename' in self.assocs: + # XXX Reject non-ascii (parsed via qdtext) here? + return self.assocs['filename'] + elif self.location is not None: + return os.path.basename(self.location_path.rstrip('/')) + + @property + def is_inline(self): + """If this property is true, the file should be handled inline. + + Otherwise, and unless your application supports other dispositions + than the standard inline and attachment, it should be handled + as an attachment. + """ + + return self.disposition.lower() == 'inline' + + def __repr__(self): + return 'ContentDisposition(%r, %r, %r)' % ( + self.disposition, self.assocs, self.location) + + + + +def percent_decode(string, encoding): + # unquote doesn't default to strict, fix that + return urllib.parse.unquote(string, encoding, errors='strict') + + +def fits_inside_codec(text, codec): + try: + text.encode(codec) + except UnicodeEncodeError: + return False + else: + return True + + +def ensure_charset(text, encoding): + if isinstance(text, bytes): + return text.decode(encoding) + else: + assert fits_inside_codec(text, encoding) + return text + + +def parse_headers(content_disposition, location=None, relaxed=False): + """Build a ContentDisposition from header values. + """ + + # We allow non-ascii here (it will only be parsed inside of qdtext, and + # rejected by the grammar if it appears in other places), although parsing + # it can be ambiguous. Parsing it ensures that a non-ambiguous filename* + # value won't get dismissed because of an unrelated ambiguity in the + # filename parameter. But it does mean we occasionally give + # less-than-certain values for some legacy senders. + content_disposition = ensure_charset(content_disposition, 'iso-8859-1') + + # Check the caller already did LWS-folding (normally done + # when separating header names and values; RFC 2616 section 2.2 + # says it should be done before interpretation at any rate). + # Hopefully space still means what it should in iso-8859-1. + # This check is a bit stronger that LWS folding, it will + # remove CR and LF even if they aren't part of a CRLF. + # However http doesn't allow isolated CR and LF in headers outside + # of LWS. + + # Relaxed has two effects (so far): + # the grammar allows a final ';' in the header; + # we do LWS-folding, and possibly normalise other broken + # whitespace, instead of rejecting non-lws-safe text. + # XXX Would prefer to accept only the quoted whitespace + # case, rather than normalising everything. + content_disposition = normalize_ws(content_disposition) + + try: + parsed = peg.parse(content_disposition, ContentDispositionValue) + except SyntaxError: + return ContentDisposition(location=location) + return ContentDisposition( + disposition=parsed.dtype, assocs=parsed.params, location=location) + + +def parse_ext_value(val): + charset = val[0] + if len(val) == 3: + charset, langtag, coded = val + else: + charset, coded = val + langtag = None + decoded = percent_decode(coded, encoding=charset) + return LangTagged(decoded, langtag) + + +# Currently pyPEG2 doesn't handle case-insensivitity: +# https://bitbucket.org/fdik/pypeg/issue/21/case-insensitive-keywords +class IKeyword(peg.Keyword): + def parse(self, parser, text, pos): + m = self.regex.match(text) + if m: + if m.group(0).upper() == str(self).upper(): + return text[len(str(self)):], None + else: + return text, SyntaxError("expecting " + repr(self)) + else: + return text, SyntaxError("expecting " + repr(self)) + + +# RFC 2616 +separator_chars = "()<>@,;:\\\"/[]?={} \t" +ctl_chars = ''.join(chr(i) for i in range(32)) + chr(127) +nontoken_chars = separator_chars + ctl_chars + +# RFC 5987 +attr_chars_nonalnum = '!#$&+-.^_`|~' +attr_chars = string.ascii_letters + string.digits + attr_chars_nonalnum + +# RFC 5987 gives this alternative construction of the token character class +token_chars = attr_chars + "*'%" + + +# Definitions from https://tools.ietf.org/html/rfc2616#section-2.2 +# token was redefined from attr_chars to avoid using AnyBut, +# which might include non-ascii octets. +token_re = '[{}]+'.format(re.escape(token_chars)) +class Token(str): + grammar = re.compile(token_re) + + +# RFC 2616 says some linear whitespace (LWS) is in fact allowed in text +# and qdtext; however it also mentions folding that whitespace into +# a single SP (which isn't in CTL) before interpretation. +# Assume the caller already that folding when parsing headers. + +# NOTE: qdtext also allows non-ascii, which we choose to parse +# as ISO-8859-1; rejecting it entirely would also be permitted. +# Some broken browsers attempt encoding-sniffing, which is broken +# because the spec only allows iso, and because encoding-sniffing +# can mangle valid values. +# Everything else in this grammar (including RFC 5987 ext values) +# is in an ascii-safe encoding. +# Because of this, this is the only character class to use AnyBut, +# and all the others are defined with Any. + +qdtext_re = r'[^"{}]'.format(re.escape(ctl_chars)) +quoted_pair_re = r'\\[{}]'.format(re.escape(''.join(chr(i) for i in range(128)))) + +class QuotedString(str): + grammar = re.compile(r'"({}|{})+"'.format(quoted_pair_re, qdtext_re)) + + def __str__(self): + s = super().__str__() + s = s[1:-1] + s = re.sub(r'\\(.)', r'\1', s) + return s + + +class Value(str): + grammar = [re.compile(token_re), QuotedString] + +# Other charsets are forbidden, the spec reserves them +# for future evolutions. +class Charset(str): + grammar = re.compile('UTF-8|ISO-8859-1', re.I) + +class Language(str): + # XXX See RFC 5646 for the correct definition + grammar = re.compile('[A-Za-z0-9-]+') + +attr_char_re = '[{}]'.format(re.escape(attr_chars)) +hex_digit_re = '%[' + string.hexdigits + ']{2}' + +class ValueChars(str): + grammar = re.compile('({}|{})*'.format(attr_char_re, hex_digit_re)) + +class ExtValue(peg.List): + grammar = peg.contiguous(Charset, "'", peg.optional(Language), "'", ValueChars) + +class ExtToken(peg.Symbol): + regex = re.compile(token_re + r'\*') + + def __str__(self): + return super().__str__().lower() + +class NoExtToken(peg.Symbol): + regex = re.compile(token_re + r'(?