use cssutils

This commit is contained in:
Daniel 2015-09-25 13:09:39 +02:00
parent 749b1c02cc
commit 420c087373
3 changed files with 64 additions and 12 deletions

View File

@ -35,6 +35,10 @@ from PyQt5.QtCore import QUrl
from qutebrowser.browser import webelem from qutebrowser.browser import webelem
from qutebrowser.utils import log, objreg, message from qutebrowser.utils import log, objreg, message
try:
import cssutils
except ImportError:
cssutils = None
_File = collections.namedtuple('_File', _File = collections.namedtuple('_File',
['content', 'content_type', 'content_location', ['content', 'content_type', 'content_location',
@ -50,7 +54,7 @@ _CSS_URL_PATTERNS = [re.compile(x) for x in [
]] ]]
def _get_css_imports(data): def _get_css_imports_regex(data):
"""Return all assets that are referenced in the given CSS document. """Return all assets that are referenced in the given CSS document.
The returned URLs are relative to the stylesheet's URL. The returned URLs are relative to the stylesheet's URL.
@ -67,6 +71,47 @@ def _get_css_imports(data):
return urls return urls
def _get_css_imports_cssutils(data, inline=False):
    """Collect the asset URLs referenced by a piece of CSS, using cssutils.

    The returned URLs are exactly as written in the CSS, i.e. they are still
    relative to the stylesheet's URL.

    Args:
        data: The CSS source to scan, as a string.
        inline: True if data is the content of an inline HTML style
                attribute (a bare declaration block) instead of a complete
                stylesheet.

    Return:
        A list of URL strings.
    """
    # Stub fetcher: hands back empty content for any URL instead of
    # retrieving it (presumably so @import targets are not downloaded while
    # parsing -- confirm against the cssutils documentation).
    css_parser = cssutils.CSSParser(fetcher=lambda url: (None, ""),
                                    validate=False)

    if not inline:
        # Complete stylesheet: cssutils can enumerate the URLs itself.
        stylesheet = css_parser.parseString(data)
        return list(cssutils.getUrls(stylesheet))

    # Inline style attribute: walk every property (background, color, ...)
    # and every component of its value (red, 10px, url(foobar), ...), keeping
    # only the non-empty url(...) components.
    style_declaration = css_parser.parseStyle(data)
    return [
        component.uri
        for css_property in style_declaration
        for component in css_property.propertyValue
        if isinstance(component, cssutils.css.URIValue) and component.uri
    ]
def _get_css_imports(data, inline=False):
    """Collect the asset URLs referenced by a piece of CSS.

    Dispatches to the cssutils-based scanner when the cssutils module is
    available, and to the regex-based scanner otherwise. The returned URLs
    are relative to the stylesheet's URL.

    Args:
        data: The CSS source to scan, as a string.
        inline: True if data is the content of an inline HTML style
                attribute. Only forwarded to the cssutils-based scanner; the
                regex-based scanner is called without it.
    """
    if cssutils is not None:
        return _get_css_imports_cssutils(data, inline)
    # cssutils could not be imported, fall back to the regex scanner.
    return _get_css_imports_regex(data)
MHTMLPolicy = email.policy.default.clone(linesep='\r\n', max_line_length=0) MHTMLPolicy = email.policy.default.clone(linesep='\r\n', max_line_length=0)
@ -228,7 +273,7 @@ class _Downloader():
for element in web_frame.findAllElements('[style]'): for element in web_frame.findAllElements('[style]'):
element = webelem.WebElementWrapper(element) element = webelem.WebElementWrapper(element)
style = element['style'] style = element['style']
for element_url in _get_css_imports(style): for element_url in _get_css_imports(style, inline=True):
self.fetch_url(web_url.resolved(QUrl(element_url))) self.fetch_url(web_url.resolved(QUrl(element_url)))
# Shortcut if no assets need to be downloaded, otherwise the file would # Shortcut if no assets need to be downloaded, otherwise the file would

View File

@ -5,3 +5,4 @@ pyPEG2==2.15.2
PyYAML==3.11 PyYAML==3.11
colorama==0.3.3 colorama==0.3.3
colorlog==2.6.0 colorlog==2.6.0
cssutils==1.0

View File

@ -250,20 +250,26 @@ def test_removing_file_from_mhtml(checker):
""") """)
@pytest.mark.parametrize('style, expected_urls', [ @pytest.mark.parametrize('has_cssutils', [True, False])
("@import 'default.css'", ['default.css']), @pytest.mark.parametrize('inline, style, expected_urls', [
('@import "default.css"', ['default.css']), (False, "@import 'default.css'", ['default.css']),
("@import \t 'tabbed.css'", ['tabbed.css']), (False, '@import "default.css"', ['default.css']),
("@import url('default.css')", ['default.css']), (False, "@import \t 'tabbed.css'", ['tabbed.css']),
("""body { (False, "@import url('default.css')", ['default.css']),
(False, """body {
background: url("/bg-img.png") background: url("/bg-img.png")
}""", ['/bg-img.png']), }""", ['/bg-img.png']),
('background: url(folder/file.png)', ['folder/file.png']), (True, 'background: url(folder/file.png) no-repeat', ['folder/file.png']),
('content: url()', []), (True, 'content: url()', []),
]) ])
def test_css_url_scanner(style, expected_urls): def test_css_url_scanner(monkeypatch, has_cssutils, inline, style,
expected_urls):
if has_cssutils:
assert mhtml.cssutils is not None
else:
monkeypatch.setattr('qutebrowser.browser.mhtml.cssutils', None)
expected_urls.sort() expected_urls.sort()
urls = mhtml._get_css_imports(style) urls = mhtml._get_css_imports(style, inline=inline)
urls.sort() urls.sort()
assert urls == expected_urls assert urls == expected_urls