Also load assets referenced in css files

Things like "@import stylesheet.css" and "url(...)".
This commit is contained in:
Daniel 2015-09-22 18:48:52 +02:00
parent 6b086d159d
commit 11ed60620a

View File

@ -22,6 +22,7 @@
import functools import functools
import io import io
import os import os
import re
from collections import namedtuple from collections import namedtuple
from base64 import b64encode from base64 import b64encode
@ -37,6 +38,32 @@ _File = namedtuple("_File",
"content content_type content_location transfer_encoding") "content content_type content_location transfer_encoding")
_CSS_URL_PATTERNS = [re.compile(x) for x in [
rb"@import '(?P<url>[^']+)'",
rb'@import "(?P<url>[^"]+)"',
rb'''url\((?P<url>[^'"][^)]*)\)''',
rb'url\("(?P<url>[^"]+)"\)',
rb"url\('(?P<url>[^']+)'\)",
]]
def _get_css_imports(data):
"""Return all assets that are referenced in the given CSS document.
The returned URLs are relative to the stylesheet's URL.
Args:
data: The content of the stylesheet to scan as bytes.
"""
urls = []
for pattern in _CSS_URL_PATTERNS:
for match in pattern.finditer(data):
url = match.group("url")
if url:
urls.append(url)
return urls
def _chunked_base64(data, maxlen=76, linesep=b"\r\n"): def _chunked_base64(data, maxlen=76, linesep=b"\r\n"):
"""Just like b64encode, except that it breaks long lines. """Just like b64encode, except that it breaks long lines.
@ -191,7 +218,7 @@ class _Downloader(object):
self.web_view = web_view self.web_view = web_view
self.dest = dest self.dest = dest
self.writer = MHTMLWriter() self.writer = MHTMLWriter()
self.loaded_urls = set() self.loaded_urls = {web_view.url()}
self.pending_downloads = set() self.pending_downloads = set()
def run(self): def run(self):
@ -200,8 +227,6 @@ class _Downloader(object):
The object must not be reused, you should create a new one if The object must not be reused, you should create a new one if
you want to download another page. you want to download another page.
""" """
download_manager = objreg.get("download-manager", scope="window",
window="current")
web_url_str = self.web_view.url().toString() web_url_str = self.web_view.url().toString()
web_frame = self.web_view.page().mainFrame() web_frame = self.web_view.page().mainFrame()
@ -225,22 +250,32 @@ class _Downloader(object):
# Might be a local <script> tag or something else # Might be a local <script> tag or something else
continue continue
absolute_url = QUrl(urljoin(web_url_str, element_url)) absolute_url = QUrl(urljoin(web_url_str, element_url))
# Prevent loading an asset twice self.fetch_url(absolute_url)
if absolute_url in self.loaded_urls:
continue
self.loaded_urls.add(absolute_url)
log.misc.debug("asset at %s", absolute_url) def fetch_url(self, url):
"""Download the given url and add the file to the collection.
item = download_manager.get(absolute_url, fileobj=io.BytesIO(), Args:
auto_remove=True) url: The file to download as QUrl.
self.pending_downloads.add(item) """
item.finished.connect( # Prevent loading an asset twice
functools.partial(self.finished, absolute_url, item)) if url in self.loaded_urls:
item.error.connect( return
functools.partial(self.error, absolute_url, item)) self.loaded_urls.add(url)
item.cancelled.connect(
functools.partial(self.error, absolute_url, item)) log.misc.debug("loading asset at %s", url)
download_manager = objreg.get("download-manager", scope="window",
window="current")
item = download_manager.get(url, fileobj=io.BytesIO(),
auto_remove=True)
self.pending_downloads.add(item)
item.finished.connect(
functools.partial(self.finished, url, item))
item.error.connect(
functools.partial(self.error, url, item))
item.cancelled.connect(
functools.partial(self.error, url, item))
def finished(self, url, item): def finished(self, url, item):
"""Callback when a single asset is downloaded. """Callback when a single asset is downloaded.
@ -252,6 +287,14 @@ class _Downloader(object):
self.pending_downloads.remove(item) self.pending_downloads.remove(item)
mime = item.raw_headers.get(b"Content-Type", b"") mime = item.raw_headers.get(b"Content-Type", b"")
mime = mime.decode("ascii", "ignore") mime = mime.decode("ascii", "ignore")
if mime.lower() == "text/css":
import_urls = _get_css_imports(item.fileobj.getvalue())
for import_url in import_urls:
import_url = import_url.decode("ascii")
absolute_url = QUrl(urljoin(url.toString(), import_url))
self.fetch_url(absolute_url)
encode = E_QUOPRI if mime.startswith("text/") else E_BASE64 encode = E_QUOPRI if mime.startswith("text/") else E_BASE64
self.writer.add_file(url.toString(), item.fileobj.getvalue(), mime, self.writer.add_file(url.toString(), item.fileobj.getvalue(), mime,
encode) encode)