From c0535727ef4dda65e837b45ac1382b7c64ab5564 Mon Sep 17 00:00:00 2001 From: Daniel Date: Mon, 9 Nov 2015 22:28:24 +0100 Subject: [PATCH] Only download elements with rel={stylesheet,icon} Websites may set the rel attribute to whatever they want, so we just care about stylesheets and icons and not the other stuff like wss links (looking at you, GitHub), RSS-feeds, ... --- qutebrowser/browser/mhtml.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/qutebrowser/browser/mhtml.py b/qutebrowser/browser/mhtml.py index e7272cd78..bfdd1ae8f 100644 --- a/qutebrowser/browser/mhtml.py +++ b/qutebrowser/browser/mhtml.py @@ -119,6 +119,23 @@ def _get_css_imports(data, inline=False): return _get_css_imports_cssutils(data, inline) +def _check_rel(element): + """Return true if the element's rel attribute fits our criteria. + + rel has to contain 'stylesheet' or 'icon'. Also returns True if the rel + attribute is unset. + + Args: + element: The WebElementWrapper which should be checked. + """ + if 'rel' not in element: + return True + must_have = {'stylesheet', 'icon'} + rels = [rel.lower() for rel in element['rel'].split(' ')] + return any(rel in rels for rel in must_have) + + + MHTMLPolicy = email.policy.default.clone(linesep='\r\n', max_line_length=0) @@ -250,6 +267,10 @@ class _Downloader(): for element in elements: element = webelem.WebElementWrapper(element) + # Websites are free to set whatever rel=... attribute they want. + # We just care about stylesheets and icons. + if not _check_rel(element): + continue if 'src' in element: element_url = element['src'] elif 'href' in element: