diff --git a/qutebrowser/misc/mhtml.py b/qutebrowser/misc/mhtml.py
index e2bb0a4a5..4eece95c4 100644
--- a/qutebrowser/misc/mhtml.py
+++ b/qutebrowser/misc/mhtml.py
@@ -52,8 +52,10 @@ def _get_css_imports(data):
The returned URLs are relative to the stylesheet's URL.
Args:
- data: The content of the stylesheet to scan as bytes.
+ data: The content of the stylesheet to scan, as bytes or str.
"""
+ if isinstance(data, str):
+ data = data.encode("utf-8")
urls = []
for pattern in _CSS_URL_PATTERNS:
for match in pattern.finditer(data):
@@ -225,7 +227,7 @@ class _Downloader(object):
dest: Destination filename.
writer: The MHTMLWriter object which is used to save the page.
loaded_urls: A set of QUrls of finished asset downloads.
- pending_downloads: A set of unfinished DownloadItems.
+ pending_downloads: A set of unfinished (url, DownloadItem) tuples.
_finished: A flag indicating if the file has already been written.
"""
@@ -268,9 +270,28 @@ class _Downloader(object):
absolute_url = web_url.resolved(QUrl(element_url))
self.fetch_url(absolute_url)
+ styles = web_frame.findAllElements("style")
+ for style in styles:
+ if style.attribute("type", "text/css") != "text/css":
+ continue
+ for element_url in _get_css_imports(style.toPlainText()):
+ element_url = element_url.decode("ascii")
+ self.fetch_url(web_url.resolved(QUrl(element_url)))
+
+ # Search for references in inline styles
+ for element in web_frame.findAllElements("*"):
+ style = element.attribute("style")
+ if not style:
+ continue
+ for element_url in _get_css_imports(style):
+ element_url = element_url.decode("ascii")
+ self.fetch_url(web_url.resolved(QUrl(element_url)))
+
# Shortcut if no assets need to be downloaded, otherwise the file would
- # never be saved
- if not elements and not self.pending_downloads:
+ # never be saved. This can also happen if downloads complete before
+ # their finished signal has been connected.
+ self.collect_zombies()
+ if not self.pending_downloads and not self._finished:
self.finish_file()
def fetch_url(self, url):
@@ -279,6 +300,8 @@ class _Downloader(object):
Args:
url: The file to download as QUrl.
"""
+ if url.scheme() == "data":
+ return
# Prevent loading an asset twice
if url in self.loaded_urls:
return
@@ -288,9 +311,9 @@ class _Downloader(object):
download_manager = objreg.get("download-manager", scope="window",
window="current")
- item = download_manager.get(url, fileobj=io.BytesIO(),
+ item = download_manager.get(url, fileobj=_NoCloseBytesIO(),
auto_remove=True)
- self.pending_downloads.add(item)
+ self.pending_downloads.add((url, item))
item.finished.connect(
functools.partial(self.finished, url, item))
item.error.connect(
@@ -305,7 +328,7 @@ class _Downloader(object):
url: The original url of the asset as QUrl.
item: The DownloadItem given by the DownloadManager
"""
- self.pending_downloads.remove(item)
+ self.pending_downloads.remove((url, item))
mime = item.raw_headers.get(b"Content-Type", b"")
mime = mime.decode("ascii", "ignore")
@@ -319,6 +342,7 @@ class _Downloader(object):
encode = E_QUOPRI if mime.startswith("text/") else E_BASE64
self.writer.add_file(url.toString(), item.fileobj.getvalue(), mime,
encode)
+ item.fileobj.actual_close()
if self.pending_downloads:
return
self.finish_file()
@@ -330,7 +354,14 @@ class _Downloader(object):
url: The orignal url of the asset as QUrl.
item: The DownloadItem given by the DownloadManager.
"""
- self.pending_downloads.remove(item)
+ try:
+ self.pending_downloads.remove((url, item))
+ except KeyError:
+ # Happens when .collect_zombies() has already called .finished() and
+ # the error handler is only invoked afterwards.
+ log.misc.debug("Oops! Download already gone: %s", item)
+ return
+ item.fileobj.actual_close()
self.writer.add_file(url.toString(), b"")
if self.pending_downloads:
return
@@ -339,6 +370,7 @@ class _Downloader(object):
def finish_file(self):
"""Save the file to the filename given in __init__."""
if self._finished:
+ log.misc.debug("finish_file called twice, ignored!")
return
self._finished = True
log.misc.debug("All assets downloaded, ready to finish off!")
@@ -346,6 +378,35 @@ class _Downloader(object):
self.writer.write_to(file_output)
message.info("current", "Page saved as {}".format(self.dest), True)
+ def collect_zombies(self):
+ """Collect done downloads and add their data to the MHTML file.
+
+ Needed if a download finishes before its finished signal is connected.
+ A snapshot is taken since finished() mutates pending_downloads.
+ """
+ items = set((url, item) for url, item in self.pending_downloads
+ if item.done)
+ log.misc.debug("Zombie downloads: %s", items)
+ for url, item in items:
+ self.finished(url, item)
+
+
+class _NoCloseBytesIO(io.BytesIO):
+
+ """BytesIO whose close() is a no-op.
+
+ This is needed to prevent the download manager from closing the stream
+ and thereby discarding the data; use actual_close() to really close it.
+ """
+
+ def close(self):
+ """Do nothing, so the buffered data is not discarded."""
+ pass
+
+ def actual_close(self):
+ """Close the stream for real via io.BytesIO.close()."""
+ super().close()
+
def start_download(dest):
"""Start downloading the current page and all assets to a MHTML file.