Also scan CSS in <style> tags and inline CSS

As both may contain external links too (@import, url(...))
This commit is contained in:
Daniel 2015-09-23 16:18:41 +02:00
parent 02c1fa1232
commit 8eafa1a105

View File

@ -52,8 +52,10 @@ def _get_css_imports(data):
The returned URLs are relative to the stylesheet's URL.
Args:
data: The content of the stylesheet to scan as bytes.
data: The content of the stylesheet to scan.
"""
if isinstance(data, str):
data = data.encode("utf-8")
urls = []
for pattern in _CSS_URL_PATTERNS:
for match in pattern.finditer(data):
@ -225,7 +227,7 @@ class _Downloader(object):
dest: Destination filename.
writer: The MHTMLWriter object which is used to save the page.
loaded_urls: A set of QUrls of finished asset downloads.
pending_downloads: A set of unfinished DownloadItems.
pending_downloads: A set of unfinished (url, DownloadItem) tuples.
_finished: A flag indicating if the file has already been written.
"""
@ -268,9 +270,28 @@ class _Downloader(object):
absolute_url = web_url.resolved(QUrl(element_url))
self.fetch_url(absolute_url)
styles = web_frame.findAllElements("style")
for style in styles:
if style.attribute("type", "text/css") != "text/css":
continue
for element_url in _get_css_imports(style.toPlainText()):
element_url = element_url.decode("ascii")
self.fetch_url(web_url.resolved(QUrl(element_url)))
# Search for references in inline styles
for element in web_frame.findAllElements("*"):
style = element.attribute("style")
if not style:
continue
for element_url in _get_css_imports(style):
element_url = element_url.decode("ascii")
self.fetch_url(web_url.resolved(QUrl(element_url)))
# Shortcut if no assets need to be downloaded, otherwise the file would
# never be saved
if not elements and not self.pending_downloads:
# never be saved. Also might happen if the downloads are fast enough to
# complete before connecting their finished signal.
self.collect_zombies()
if not self.pending_downloads and not self._finished:
self.finish_file()
def fetch_url(self, url):
@ -279,6 +300,8 @@ class _Downloader(object):
Args:
url: The file to download as QUrl.
"""
if url.scheme() == "data":
return
# Prevent loading an asset twice
if url in self.loaded_urls:
return
@ -288,9 +311,9 @@ class _Downloader(object):
download_manager = objreg.get("download-manager", scope="window",
window="current")
item = download_manager.get(url, fileobj=io.BytesIO(),
item = download_manager.get(url, fileobj=_NoCloseBytesIO(),
auto_remove=True)
self.pending_downloads.add(item)
self.pending_downloads.add((url, item))
item.finished.connect(
functools.partial(self.finished, url, item))
item.error.connect(
@ -305,7 +328,7 @@ class _Downloader(object):
url: The original url of the asset as QUrl.
item: The DownloadItem given by the DownloadManager
"""
self.pending_downloads.remove(item)
self.pending_downloads.remove((url, item))
mime = item.raw_headers.get(b"Content-Type", b"")
mime = mime.decode("ascii", "ignore")
@ -319,6 +342,7 @@ class _Downloader(object):
encode = E_QUOPRI if mime.startswith("text/") else E_BASE64
self.writer.add_file(url.toString(), item.fileobj.getvalue(), mime,
encode)
item.fileobj.actual_close()
if self.pending_downloads:
return
self.finish_file()
@ -330,7 +354,14 @@ class _Downloader(object):
url: The original url of the asset as QUrl.
item: The DownloadItem given by the DownloadManager.
"""
self.pending_downloads.remove(item)
try:
self.pending_downloads.remove((url, item))
except KeyError:
# This might happen if .collect_zombies() calls .finished() and the
# error handler will be called after .collect_zombies
log.misc.debug("Oops! Download already gone: %s", item)
return
item.fileobj.actual_close()
self.writer.add_file(url.toString(), b"")
if self.pending_downloads:
return
@ -339,6 +370,7 @@ class _Downloader(object):
def finish_file(self):
"""Save the file to the filename given in __init__."""
if self._finished:
log.misc.debug("finish_file called twice, ignored!")
return
self._finished = True
log.misc.debug("All assets downloaded, ready to finish off!")
@ -346,6 +378,35 @@ class _Downloader(object):
self.writer.write_to(file_output)
message.info("current", "Page saved as {}".format(self.dest), True)
def collect_zombies(self):
    """Hand already-completed pending downloads over to .finished().

    A download can be done before its finished signal has been attached.
    Every pending (url, item) pair whose item reports done is therefore
    passed to .finished() manually, which adds its data to the MHTML file.
    """
    # Take a snapshot of the done entries first: .finished() removes
    # entries from self.pending_downloads, so that set must not be
    # iterated directly while it is being modified.
    zombies = {
        (asset_url, download)
        for asset_url, download in self.pending_downloads
        if download.done
    }
    log.misc.debug("Zombie downloads: %s", zombies)
    for asset_url, download in zombies:
        self.finished(asset_url, download)
class _NoCloseBytesIO(io.BytesIO):
"""BytesIO that can't be .closed()
This is needed to prevent the downloadmanager from closing the stream, thus
discarding the data.
"""
def close(self):
"""Do nothing."""
pass
def actual_close(self):
"""Close the stream."""
super().close()
def start_download(dest):
"""Start downloading the current page and all assets to a MHTML file.