Handle non-ASCII in headers/url better

This commit is contained in:
Daniel 2015-10-19 14:05:59 +02:00
parent 8bb887ddab
commit ae8a9b8798
3 changed files with 31 additions and 5 deletions

View File

@ -33,7 +33,7 @@ import email.mime.multipart
from PyQt5.QtCore import QUrl from PyQt5.QtCore import QUrl
from qutebrowser.browser import webelem from qutebrowser.browser import webelem
from qutebrowser.utils import log, objreg, message, usertypes from qutebrowser.utils import log, objreg, message, usertypes, utils, urlutils
try: try:
import cssutils import cssutils
@ -237,7 +237,7 @@ class _Downloader():
self.writer = MHTMLWriter( self.writer = MHTMLWriter(
web_frame.toHtml().encode('utf-8'), web_frame.toHtml().encode('utf-8'),
content_location=web_url.toString(), content_location=urlutils.encoded_url(web_url),
# I've found no way of getting the content type of a QWebView, but # I've found no way of getting the content type of a QWebView, but
# since we're using .toHtml, it's probably safe to say that the # since we're using .toHtml, it's probably safe to say that the
# content-type is HTML # content-type is HTML
@ -347,8 +347,12 @@ class _Downloader():
self.fetch_url(absolute_url) self.fetch_url(absolute_url)
encode = E_QUOPRI if mime.startswith('text/') else E_BASE64 encode = E_QUOPRI if mime.startswith('text/') else E_BASE64
self.writer.add_file(url.toString(), item.fileobj.getvalue(), mime, # Our MHTML handler refuses non-ASCII headers. This will replace every
encode) # non-ASCII char with '?'. This is probably okay, as official Content-
# Type headers contain ASCII only anyway. Anything else is madness.
mime = utils.force_encoding(mime, 'ascii')
self.writer.add_file(urlutils.encoded_url(url),
item.fileobj.getvalue(), mime, encode)
item.fileobj.actual_close() item.fileobj.actual_close()
if self.pending_downloads: if self.pending_downloads:
return return
@ -369,7 +373,7 @@ class _Downloader():
log.downloads.debug("Oops! Download already gone: %s", item) log.downloads.debug("Oops! Download already gone: %s", item)
return return
item.fileobj.actual_close() item.fileobj.actual_close()
self.writer.add_file(url.toString(), b'') self.writer.add_file(urlutils.encoded_url(url), b'')
if self.pending_downloads: if self.pending_downloads:
return return
self.finish_file() self.finish_file()

View File

@ -438,6 +438,15 @@ def same_domain(url1, url2):
return domain1 == domain2 return domain1 == domain2
def encoded_url(url):
    """Get the fully encoded form of a URL as a plain string.

    Args:
        url: The QUrl to encode.

    Return:
        The result of url.toEncoded(), decoded as ASCII into a str.
    """
    raw = url.toEncoded()
    return bytes(raw).decode('ascii')
class IncDecError(Exception): class IncDecError(Exception):
"""Exception raised by incdec_number on problems. """Exception raised by incdec_number on problems.

View File

@ -527,6 +527,19 @@ def test_same_domain_invalid_url(url1, url2):
with pytest.raises(urlutils.InvalidUrlError): with pytest.raises(urlutils.InvalidUrlError):
urlutils.same_domain(QUrl(url1), QUrl(url2)) urlutils.same_domain(QUrl(url1), QUrl(url2))
@pytest.mark.parametrize('url, expected', [
    ('http://example.com', 'http://example.com'),
    ('http://ünicode.com', 'http://xn--nicode-2ya.com'),
    ('http://foo.bar/?header=text/pläin',
     'http://foo.bar/?header=text/pl%C3%A4in'),
])
def test_encoded_url(url, expected):
    """Check urlutils.encoded_url() against known encoded forms.

    The cases cover a pure-ASCII URL, a URL with a non-ASCII host and a
    URL with a non-ASCII character in its query string.
    """
    encoded = urlutils.encoded_url(QUrl(url))
    assert encoded == expected
class TestIncDecNumber: class TestIncDecNumber:
"""Tests for urlutils.incdec_number().""" """Tests for urlutils.incdec_number()."""