From 11ed60620ae2630a4654e47e718f622797e8e175 Mon Sep 17 00:00:00 2001 From: Daniel Date: Tue, 22 Sep 2015 18:48:52 +0200 Subject: [PATCH] Also load assets referenced in css files Things like "@import stylesheet.css" and "url(...)". --- qutebrowser/misc/mhtml.py | 77 ++++++++++++++++++++++++++++++--------- 1 file changed, 60 insertions(+), 17 deletions(-) diff --git a/qutebrowser/misc/mhtml.py b/qutebrowser/misc/mhtml.py index 2353602fc..b45f0f6a9 100644 --- a/qutebrowser/misc/mhtml.py +++ b/qutebrowser/misc/mhtml.py @@ -22,6 +22,7 @@ import functools import io import os +import re from collections import namedtuple from base64 import b64encode @@ -37,6 +38,32 @@ _File = namedtuple("_File", "content content_type content_location transfer_encoding") +_CSS_URL_PATTERNS = [re.compile(x) for x in [ + rb"@import '(?P<url>[^']+)'", + rb'@import "(?P<url>[^"]+)"', + rb'''url\((?P<url>[^'"][^)]*)\)''', + rb'url\("(?P<url>[^"]+)"\)', + rb"url\('(?P<url>[^']+)'\)", +]] + + +def _get_css_imports(data): + """Return all assets that are referenced in the given CSS document. + + The returned URLs are relative to the stylesheet's URL. + + Args: + data: The content of the stylesheet to scan as bytes. + """ + urls = [] + for pattern in _CSS_URL_PATTERNS: + for match in pattern.finditer(data): + url = match.group("url") + if url: + urls.append(url) + return urls + + def _chunked_base64(data, maxlen=76, linesep=b"\r\n"): """Just like b64encode, except that it breaks long lines. @@ -191,7 +218,7 @@ class _Downloader(object): self.web_view = web_view self.dest = dest self.writer = MHTMLWriter() - self.loaded_urls = set() + self.loaded_urls = {web_view.url()} self.pending_downloads = set() def run(self): @@ -200,8 +227,6 @@ class _Downloader(object): The object must not be reused, you should create a new one if you want to download another page.
""" - download_manager = objreg.get("download-manager", scope="window", - window="current") web_url_str = self.web_view.url().toString() web_frame = self.web_view.page().mainFrame() @@ -225,22 +250,32 @@ class _Downloader(object): # Might be a local