diff --git a/qutebrowser/browser/mhtml.py b/qutebrowser/browser/mhtml.py
index e7bdca516..82bb20ed0 100644
--- a/qutebrowser/browser/mhtml.py
+++ b/qutebrowser/browser/mhtml.py
@@ -33,7 +33,7 @@ import email.mime.multipart
 from PyQt5.QtCore import QUrl
 
 from qutebrowser.browser import webelem
-from qutebrowser.utils import log, objreg, message, usertypes
+from qutebrowser.utils import log, objreg, message, usertypes, utils, urlutils
 
 try:
     import cssutils
@@ -237,7 +237,7 @@ class _Downloader():
 
         self.writer = MHTMLWriter(
             web_frame.toHtml().encode('utf-8'),
-            content_location=web_url.toString(),
+            content_location=urlutils.encoded_url(web_url),
             # I've found no way of getting the content type of a QWebView, but
             # since we're using .toHtml, it's probably safe to say that the
             # content-type is HTML
@@ -347,8 +347,12 @@ class _Downloader():
                 self.fetch_url(absolute_url)
 
         encode = E_QUOPRI if mime.startswith('text/') else E_BASE64
-        self.writer.add_file(url.toString(), item.fileobj.getvalue(), mime,
-                             encode)
+        # Our MHTML handler refuses non-ASCII headers. This will replace every
+        # non-ASCII char with '?'. This is probably okay, as official Content-
+        # Type headers contain ASCII only anyway. Anything else is madness.
+        mime = utils.force_encoding(mime, 'ascii')
+        self.writer.add_file(urlutils.encoded_url(url),
+                             item.fileobj.getvalue(), mime, encode)
         item.fileobj.actual_close()
         if self.pending_downloads:
             return
@@ -369,7 +373,7 @@ class _Downloader():
             log.downloads.debug("Oops! Download already gone: %s", item)
             return
         item.fileobj.actual_close()
-        self.writer.add_file(url.toString(), b'')
+        self.writer.add_file(urlutils.encoded_url(url), b'')
         if self.pending_downloads:
             return
         self.finish_file()
diff --git a/qutebrowser/utils/urlutils.py b/qutebrowser/utils/urlutils.py
index e372dc65f..01c366c25 100644
--- a/qutebrowser/utils/urlutils.py
+++ b/qutebrowser/utils/urlutils.py
@@ -438,6 +438,15 @@ def same_domain(url1, url2):
     return domain1 == domain2
 
 
+def encoded_url(url):
+    """Return the fully encoded url as string.
+
+    Args:
+        url: The url to encode as QUrl.
+    """
+    return bytes(url.toEncoded()).decode('ascii')
+
+
 class IncDecError(Exception):
 
     """Exception raised by incdec_number on problems.
diff --git a/tests/unit/utils/test_urlutils.py b/tests/unit/utils/test_urlutils.py
index 4a19df689..5496a019b 100644
--- a/tests/unit/utils/test_urlutils.py
+++ b/tests/unit/utils/test_urlutils.py
@@ -527,6 +527,19 @@ def test_same_domain_invalid_url(url1, url2):
     with pytest.raises(urlutils.InvalidUrlError):
         urlutils.same_domain(QUrl(url1), QUrl(url2))
 
+
+@pytest.mark.parametrize('url, expected', [
+    ('http://example.com', 'http://example.com'),
+    ('http://ünicode.com', 'http://xn--nicode-2ya.com'),
+    ('http://foo.bar/?header=text/pläin',
+     'http://foo.bar/?header=text/pl%C3%A4in'),
+])
+def test_encoded_url(url, expected):
+    """Test encoded_url"""
+    url = QUrl(url)
+    assert urlutils.encoded_url(url) == expected
+
+
 class TestIncDecNumber:
 
     """Tests for urlutils.incdec_number()."""