Also load assets referenced in css files
Things like "@import stylesheet.css" and "url(...)".
This commit is contained in:
parent
6b086d159d
commit
11ed60620a
@ -22,6 +22,7 @@
|
|||||||
import functools
|
import functools
|
||||||
import io
|
import io
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
from base64 import b64encode
|
from base64 import b64encode
|
||||||
@ -37,6 +38,32 @@ _File = namedtuple("_File",
|
|||||||
"content content_type content_location transfer_encoding")
|
"content content_type content_location transfer_encoding")
|
||||||
|
|
||||||
|
|
||||||
|
_CSS_URL_PATTERNS = [re.compile(x) for x in [
|
||||||
|
rb"@import '(?P<url>[^']+)'",
|
||||||
|
rb'@import "(?P<url>[^"]+)"',
|
||||||
|
rb'''url\((?P<url>[^'"][^)]*)\)''',
|
||||||
|
rb'url\("(?P<url>[^"]+)"\)',
|
||||||
|
rb"url\('(?P<url>[^']+)'\)",
|
||||||
|
]]
|
||||||
|
|
||||||
|
|
||||||
|
def _get_css_imports(data):
|
||||||
|
"""Return all assets that are referenced in the given CSS document.
|
||||||
|
|
||||||
|
The returned URLs are relative to the stylesheet's URL.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
data: The content of the stylesheet to scan as bytes.
|
||||||
|
"""
|
||||||
|
urls = []
|
||||||
|
for pattern in _CSS_URL_PATTERNS:
|
||||||
|
for match in pattern.finditer(data):
|
||||||
|
url = match.group("url")
|
||||||
|
if url:
|
||||||
|
urls.append(url)
|
||||||
|
return urls
|
||||||
|
|
||||||
|
|
||||||
def _chunked_base64(data, maxlen=76, linesep=b"\r\n"):
|
def _chunked_base64(data, maxlen=76, linesep=b"\r\n"):
|
||||||
"""Just like b64encode, except that it breaks long lines.
|
"""Just like b64encode, except that it breaks long lines.
|
||||||
|
|
||||||
@ -191,7 +218,7 @@ class _Downloader(object):
|
|||||||
self.web_view = web_view
|
self.web_view = web_view
|
||||||
self.dest = dest
|
self.dest = dest
|
||||||
self.writer = MHTMLWriter()
|
self.writer = MHTMLWriter()
|
||||||
self.loaded_urls = set()
|
self.loaded_urls = {web_view.url()}
|
||||||
self.pending_downloads = set()
|
self.pending_downloads = set()
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
@ -200,8 +227,6 @@ class _Downloader(object):
|
|||||||
The object must not be reused, you should create a new one if
|
The object must not be reused, you should create a new one if
|
||||||
you want to download another page.
|
you want to download another page.
|
||||||
"""
|
"""
|
||||||
download_manager = objreg.get("download-manager", scope="window",
|
|
||||||
window="current")
|
|
||||||
web_url_str = self.web_view.url().toString()
|
web_url_str = self.web_view.url().toString()
|
||||||
web_frame = self.web_view.page().mainFrame()
|
web_frame = self.web_view.page().mainFrame()
|
||||||
|
|
||||||
@ -225,22 +250,32 @@ class _Downloader(object):
|
|||||||
# Might be a local <script> tag or something else
|
# Might be a local <script> tag or something else
|
||||||
continue
|
continue
|
||||||
absolute_url = QUrl(urljoin(web_url_str, element_url))
|
absolute_url = QUrl(urljoin(web_url_str, element_url))
|
||||||
# Prevent loading an asset twice
|
self.fetch_url(absolute_url)
|
||||||
if absolute_url in self.loaded_urls:
|
|
||||||
continue
|
|
||||||
self.loaded_urls.add(absolute_url)
|
|
||||||
|
|
||||||
log.misc.debug("asset at %s", absolute_url)
|
def fetch_url(self, url):
|
||||||
|
"""Download the given url and add the file to the collection.
|
||||||
|
|
||||||
item = download_manager.get(absolute_url, fileobj=io.BytesIO(),
|
Args:
|
||||||
auto_remove=True)
|
url: The file to download as QUrl.
|
||||||
self.pending_downloads.add(item)
|
"""
|
||||||
item.finished.connect(
|
# Prevent loading an asset twice
|
||||||
functools.partial(self.finished, absolute_url, item))
|
if url in self.loaded_urls:
|
||||||
item.error.connect(
|
return
|
||||||
functools.partial(self.error, absolute_url, item))
|
self.loaded_urls.add(url)
|
||||||
item.cancelled.connect(
|
|
||||||
functools.partial(self.error, absolute_url, item))
|
log.misc.debug("loading asset at %s", url)
|
||||||
|
|
||||||
|
download_manager = objreg.get("download-manager", scope="window",
|
||||||
|
window="current")
|
||||||
|
item = download_manager.get(url, fileobj=io.BytesIO(),
|
||||||
|
auto_remove=True)
|
||||||
|
self.pending_downloads.add(item)
|
||||||
|
item.finished.connect(
|
||||||
|
functools.partial(self.finished, url, item))
|
||||||
|
item.error.connect(
|
||||||
|
functools.partial(self.error, url, item))
|
||||||
|
item.cancelled.connect(
|
||||||
|
functools.partial(self.error, url, item))
|
||||||
|
|
||||||
def finished(self, url, item):
|
def finished(self, url, item):
|
||||||
"""Callback when a single asset is downloaded.
|
"""Callback when a single asset is downloaded.
|
||||||
@ -252,6 +287,14 @@ class _Downloader(object):
|
|||||||
self.pending_downloads.remove(item)
|
self.pending_downloads.remove(item)
|
||||||
mime = item.raw_headers.get(b"Content-Type", b"")
|
mime = item.raw_headers.get(b"Content-Type", b"")
|
||||||
mime = mime.decode("ascii", "ignore")
|
mime = mime.decode("ascii", "ignore")
|
||||||
|
|
||||||
|
if mime.lower() == "text/css":
|
||||||
|
import_urls = _get_css_imports(item.fileobj.getvalue())
|
||||||
|
for import_url in import_urls:
|
||||||
|
import_url = import_url.decode("ascii")
|
||||||
|
absolute_url = QUrl(urljoin(url.toString(), import_url))
|
||||||
|
self.fetch_url(absolute_url)
|
||||||
|
|
||||||
encode = E_QUOPRI if mime.startswith("text/") else E_BASE64
|
encode = E_QUOPRI if mime.startswith("text/") else E_BASE64
|
||||||
self.writer.add_file(url.toString(), item.fileobj.getvalue(), mime,
|
self.writer.add_file(url.toString(), item.fileobj.getvalue(), mime,
|
||||||
encode)
|
encode)
|
||||||
|
Loading…
Reference in New Issue
Block a user