_get_css_imports now works on strings only

This also means that it returns strings, so the callers' .decode("ascii") calls on the returned URLs are no longer needed.
2015-09-24 13:48:58 +02:00 · 2015-09-24 13:48:58 +02:00 · ba81332d45
commit ba81332d45
parent d3a21927f2
1 changed files with 20 additions and 14 deletions
--- a/qutebrowser/misc/mhtml.py
+++ b/qutebrowser/misc/mhtml.py
@ -42,11 +42,11 @@ _File = collections.namedtuple("_File",


 _CSS_URL_PATTERNS = [re.compile(x) for x in [
-    rb"@import '(?P<url>[^']+)'",
-    rb'@import "(?P<url>[^"]+)"',
-    rb'''url\((?P<url>[^'"][^)]*)\)''',
-    rb'url\("(?P<url>[^"]+)"\)',
-    rb"url\('(?P<url>[^']+)'\)",
+    r"@import '(?P<url>[^']+)'",
+    r'@import "(?P<url>[^"]+)"',
+    r'''url\((?P<url>[^'"][^)]*)\)''',
+    r'url\("(?P<url>[^"]+)"\)',
+    r"url\('(?P<url>[^']+)'\)",
 ]]


@ -56,10 +56,8 @@ def _get_css_imports(data):
    The returned URLs are relative to the stylesheet's URL.

    Args:
-        data: The content of the stylesheet to scan.
+        data: The content of the stylesheet to scan as string.
    """
-    if isinstance(data, str):
-        data = data.encode("utf-8")
    urls = []
    for pattern in _CSS_URL_PATTERNS:
        for match in pattern.finditer(data):
@ -246,11 +244,11 @@ class _Downloader():

        self.writer = MHTMLWriter(
            web_frame.toHtml().encode("utf-8"),
-            content_location = web_url.toString(),
+            content_location=web_url.toString(),
            # I've found no way of getting the content type of a QWebView, but
            # since we're using .toHtml, it's probably safe to say that the
            # content-type is HTML
-            content_type = 'text/html; charset="UTF-8"',
+            content_type='text/html; charset="UTF-8"',
        )
        # Currently only downloading <link> (stylesheets), <script>
        # (javascript) and <img> (image) elements.
@ -276,7 +274,6 @@ class _Downloader():
            if "type" in style and style["type"] != "text/css":
                continue
            for element_url in _get_css_imports(str(style)):
-                element_url = element_url.decode("ascii")
                self.fetch_url(web_url.resolved(QUrl(element_url)))

        # Search for references in inline styles
@ -286,7 +283,6 @@ class _Downloader():
                continue
            style = element["style"]
            for element_url in _get_css_imports(style):
-                element_url = element_url.decode("ascii")
                self.fetch_url(web_url.resolved(QUrl(element_url)))

        # Shortcut if no assets need to be downloaded, otherwise the file would
@ -335,9 +331,19 @@ class _Downloader():
        mime = mime.decode("ascii", "ignore")

        if mime.lower() == "text/css":
-            import_urls = _get_css_imports(item.fileobj.getvalue())
+            # We can't always assume that CSS files are UTF-8, but CSS files
+            # shouldn't contain many non-ASCII characters anyway (in most
+            # cases). If strict decoding fails, we log a warning and retry with
+            # "ignore", so the file is decoded even if it's invalid UTF-8.
+            # The file written to the MHTML file won't be modified by this
+            # decoding, since there we're taking the original bytestream.
+            try:
+                css_string = item.fileobj.getvalue().decode("utf-8")
+            except UnicodeDecodeError:
+                log.misc.warning("Invalid UTF-8 data in %s", url)
+                css_string = item.fileobj.getvalue().decode("utf-8", "ignore")
+            import_urls = _get_css_imports(css_string)
            for import_url in import_urls:
-                import_url = import_url.decode("ascii")
                absolute_url = url.resolved(QUrl(import_url))
                self.fetch_url(absolute_url)