Also scan CSS in <style> tags and inline CSS
As both may contain external links too (@import, url(...))
This commit is contained in:
parent
02c1fa1232
commit
8eafa1a105
@ -52,8 +52,10 @@ def _get_css_imports(data):
|
|||||||
The returned URLs are relative to the stylesheet's URL.
|
The returned URLs are relative to the stylesheet's URL.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
data: The content of the stylesheet to scan as bytes.
|
data: The content of the stylesheet to scan.
|
||||||
"""
|
"""
|
||||||
|
if isinstance(data, str):
|
||||||
|
data = data.encode("utf-8")
|
||||||
urls = []
|
urls = []
|
||||||
for pattern in _CSS_URL_PATTERNS:
|
for pattern in _CSS_URL_PATTERNS:
|
||||||
for match in pattern.finditer(data):
|
for match in pattern.finditer(data):
|
||||||
@ -225,7 +227,7 @@ class _Downloader(object):
|
|||||||
dest: Destination filename.
|
dest: Destination filename.
|
||||||
writer: The MHTMLWriter object which is used to save the page.
|
writer: The MHTMLWriter object which is used to save the page.
|
||||||
loaded_urls: A set of QUrls of finished asset downloads.
|
loaded_urls: A set of QUrls of finished asset downloads.
|
||||||
pending_downloads: A set of unfinished DownloadItems.
|
pending_downloads: A set of unfinished (url, DownloadItem) tuples.
|
||||||
_finished: A flag indicating if the file has already been written.
|
_finished: A flag indicating if the file has already been written.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@ -268,9 +270,28 @@ class _Downloader(object):
|
|||||||
absolute_url = web_url.resolved(QUrl(element_url))
|
absolute_url = web_url.resolved(QUrl(element_url))
|
||||||
self.fetch_url(absolute_url)
|
self.fetch_url(absolute_url)
|
||||||
|
|
||||||
|
styles = web_frame.findAllElements("style")
|
||||||
|
for style in styles:
|
||||||
|
if style.attribute("type", "text/css") != "text/css":
|
||||||
|
continue
|
||||||
|
for element_url in _get_css_imports(style.toPlainText()):
|
||||||
|
element_url = element_url.decode("ascii")
|
||||||
|
self.fetch_url(web_url.resolved(QUrl(element_url)))
|
||||||
|
|
||||||
|
# Search for references in inline styles
|
||||||
|
for element in web_frame.findAllElements("*"):
|
||||||
|
style = element.attribute("style")
|
||||||
|
if not style:
|
||||||
|
continue
|
||||||
|
for element_url in _get_css_imports(style):
|
||||||
|
element_url = element_url.decode("ascii")
|
||||||
|
self.fetch_url(web_url.resolved(QUrl(element_url)))
|
||||||
|
|
||||||
# Shortcut if no assets need to be downloaded, otherwise the file would
|
# Shortcut if no assets need to be downloaded, otherwise the file would
|
||||||
# never be saved
|
# never be saved. Also might happen if the downloads are fast enough to
|
||||||
if not elements and not self.pending_downloads:
|
# complete before connecting their finished signal.
|
||||||
|
self.collect_zombies()
|
||||||
|
if not self.pending_downloads and not self._finished:
|
||||||
self.finish_file()
|
self.finish_file()
|
||||||
|
|
||||||
def fetch_url(self, url):
|
def fetch_url(self, url):
|
||||||
@ -279,6 +300,8 @@ class _Downloader(object):
|
|||||||
Args:
|
Args:
|
||||||
url: The file to download as QUrl.
|
url: The file to download as QUrl.
|
||||||
"""
|
"""
|
||||||
|
if url.scheme() == "data":
|
||||||
|
return
|
||||||
# Prevent loading an asset twice
|
# Prevent loading an asset twice
|
||||||
if url in self.loaded_urls:
|
if url in self.loaded_urls:
|
||||||
return
|
return
|
||||||
@ -288,9 +311,9 @@ class _Downloader(object):
|
|||||||
|
|
||||||
download_manager = objreg.get("download-manager", scope="window",
|
download_manager = objreg.get("download-manager", scope="window",
|
||||||
window="current")
|
window="current")
|
||||||
item = download_manager.get(url, fileobj=io.BytesIO(),
|
item = download_manager.get(url, fileobj=_NoCloseBytesIO(),
|
||||||
auto_remove=True)
|
auto_remove=True)
|
||||||
self.pending_downloads.add(item)
|
self.pending_downloads.add((url, item))
|
||||||
item.finished.connect(
|
item.finished.connect(
|
||||||
functools.partial(self.finished, url, item))
|
functools.partial(self.finished, url, item))
|
||||||
item.error.connect(
|
item.error.connect(
|
||||||
@ -305,7 +328,7 @@ class _Downloader(object):
|
|||||||
url: The original url of the asset as QUrl.
|
url: The original url of the asset as QUrl.
|
||||||
item: The DownloadItem given by the DownloadManager
|
item: The DownloadItem given by the DownloadManager
|
||||||
"""
|
"""
|
||||||
self.pending_downloads.remove(item)
|
self.pending_downloads.remove((url, item))
|
||||||
mime = item.raw_headers.get(b"Content-Type", b"")
|
mime = item.raw_headers.get(b"Content-Type", b"")
|
||||||
mime = mime.decode("ascii", "ignore")
|
mime = mime.decode("ascii", "ignore")
|
||||||
|
|
||||||
@ -319,6 +342,7 @@ class _Downloader(object):
|
|||||||
encode = E_QUOPRI if mime.startswith("text/") else E_BASE64
|
encode = E_QUOPRI if mime.startswith("text/") else E_BASE64
|
||||||
self.writer.add_file(url.toString(), item.fileobj.getvalue(), mime,
|
self.writer.add_file(url.toString(), item.fileobj.getvalue(), mime,
|
||||||
encode)
|
encode)
|
||||||
|
item.fileobj.actual_close()
|
||||||
if self.pending_downloads:
|
if self.pending_downloads:
|
||||||
return
|
return
|
||||||
self.finish_file()
|
self.finish_file()
|
||||||
@ -330,7 +354,14 @@ class _Downloader(object):
|
|||||||
url: The original url of the asset as QUrl.
|
url: The original url of the asset as QUrl.
|
||||||
item: The DownloadItem given by the DownloadManager.
|
item: The DownloadItem given by the DownloadManager.
|
||||||
"""
|
"""
|
||||||
self.pending_downloads.remove(item)
|
try:
|
||||||
|
self.pending_downloads.remove((url, item))
|
||||||
|
except KeyError:
|
||||||
|
# This might happen if .collect_zombies() calls .finished() and the
|
||||||
|
# error handler will be called after .collect_zombies
|
||||||
|
log.misc.debug("Oops! Download already gone: %s", item)
|
||||||
|
return
|
||||||
|
item.fileobj.actual_close()
|
||||||
self.writer.add_file(url.toString(), b"")
|
self.writer.add_file(url.toString(), b"")
|
||||||
if self.pending_downloads:
|
if self.pending_downloads:
|
||||||
return
|
return
|
||||||
@ -339,6 +370,7 @@ class _Downloader(object):
|
|||||||
def finish_file(self):
|
def finish_file(self):
|
||||||
"""Save the file to the filename given in __init__."""
|
"""Save the file to the filename given in __init__."""
|
||||||
if self._finished:
|
if self._finished:
|
||||||
|
log.misc.debug("finish_file called twice, ignored!")
|
||||||
return
|
return
|
||||||
self._finished = True
|
self._finished = True
|
||||||
log.misc.debug("All assets downloaded, ready to finish off!")
|
log.misc.debug("All assets downloaded, ready to finish off!")
|
||||||
@ -346,6 +378,35 @@ class _Downloader(object):
|
|||||||
self.writer.write_to(file_output)
|
self.writer.write_to(file_output)
|
||||||
message.info("current", "Page saved as {}".format(self.dest), True)
|
message.info("current", "Page saved as {}".format(self.dest), True)
|
||||||
|
|
||||||
|
def collect_zombies(self):
|
||||||
|
"""Collect done downloads and add their data to the MHTML file.
|
||||||
|
|
||||||
|
This is needed if a download finishes before attaching its
|
||||||
|
finished signal.
|
||||||
|
"""
|
||||||
|
items = set((url, item) for url, item in self.pending_downloads
|
||||||
|
if item.done)
|
||||||
|
log.misc.debug("Zombie downloads: %s", items)
|
||||||
|
for url, item in items:
|
||||||
|
self.finished(url, item)
|
||||||
|
|
||||||
|
|
||||||
|
class _NoCloseBytesIO(io.BytesIO):
|
||||||
|
|
||||||
|
"""BytesIO that can't be closed via .close().
|
||||||
|
|
||||||
|
This is needed to prevent the downloadmanager from closing the stream, thus
|
||||||
|
discarding the data.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
"""Do nothing."""
|
||||||
|
pass
|
||||||
|
|
||||||
|
def actual_close(self):
|
||||||
|
"""Close the stream."""
|
||||||
|
super().close()
|
||||||
|
|
||||||
|
|
||||||
def start_download(dest):
|
def start_download(dest):
|
||||||
"""Start downloading the current page and all assets to a MHTML file.
|
"""Start downloading the current page and all assets to a MHTML file.
|
||||||
|
Loading…
Reference in New Issue
Block a user