_get_css_imports now works on strings only

This also means that it returns strings, making the calls to .decode
unneeded.
This commit is contained in:
Daniel 2015-09-24 13:48:58 +02:00
parent d3a21927f2
commit ba81332d45

View File

@ -42,11 +42,11 @@ _File = collections.namedtuple("_File",
_CSS_URL_PATTERNS = [re.compile(x) for x in [ _CSS_URL_PATTERNS = [re.compile(x) for x in [
rb"@import '(?P<url>[^']+)'", r"@import '(?P<url>[^']+)'",
rb'@import "(?P<url>[^"]+)"', r'@import "(?P<url>[^"]+)"',
rb'''url\((?P<url>[^'"][^)]*)\)''', r'''url\((?P<url>[^'"][^)]*)\)''',
rb'url\("(?P<url>[^"]+)"\)', r'url\("(?P<url>[^"]+)"\)',
rb"url\('(?P<url>[^']+)'\)", r"url\('(?P<url>[^']+)'\)",
]] ]]
@ -56,10 +56,8 @@ def _get_css_imports(data):
The returned URLs are relative to the stylesheet's URL. The returned URLs are relative to the stylesheet's URL.
Args: Args:
data: The content of the stylesheet to scan. data: The content of the stylesheet to scan as string.
""" """
if isinstance(data, str):
data = data.encode("utf-8")
urls = [] urls = []
for pattern in _CSS_URL_PATTERNS: for pattern in _CSS_URL_PATTERNS:
for match in pattern.finditer(data): for match in pattern.finditer(data):
@ -246,11 +244,11 @@ class _Downloader():
self.writer = MHTMLWriter( self.writer = MHTMLWriter(
web_frame.toHtml().encode("utf-8"), web_frame.toHtml().encode("utf-8"),
content_location = web_url.toString(), content_location=web_url.toString(),
# I've found no way of getting the content type of a QWebView, but # I've found no way of getting the content type of a QWebView, but
# since we're using .toHtml, it's probably safe to say that the # since we're using .toHtml, it's probably safe to say that the
# content-type is HTML # content-type is HTML
content_type = 'text/html; charset="UTF-8"', content_type='text/html; charset="UTF-8"',
) )
# Currently only downloading <link> (stylesheets), <script> # Currently only downloading <link> (stylesheets), <script>
# (javascript) and <img> (image) elements. # (javascript) and <img> (image) elements.
@ -276,7 +274,6 @@ class _Downloader():
if "type" in style and style["type"] != "text/css": if "type" in style and style["type"] != "text/css":
continue continue
for element_url in _get_css_imports(str(style)): for element_url in _get_css_imports(str(style)):
element_url = element_url.decode("ascii")
self.fetch_url(web_url.resolved(QUrl(element_url))) self.fetch_url(web_url.resolved(QUrl(element_url)))
# Search for references in inline styles # Search for references in inline styles
@ -286,7 +283,6 @@ class _Downloader():
continue continue
style = element["style"] style = element["style"]
for element_url in _get_css_imports(style): for element_url in _get_css_imports(style):
element_url = element_url.decode("ascii")
self.fetch_url(web_url.resolved(QUrl(element_url))) self.fetch_url(web_url.resolved(QUrl(element_url)))
# Shortcut if no assets need to be downloaded, otherwise the file would # Shortcut if no assets need to be downloaded, otherwise the file would
@ -335,9 +331,19 @@ class _Downloader():
mime = mime.decode("ascii", "ignore") mime = mime.decode("ascii", "ignore")
if mime.lower() == "text/css": if mime.lower() == "text/css":
import_urls = _get_css_imports(item.fileobj.getvalue()) # We can't always assume that CSS files are UTF-8, but CSS files
# shouldn't contain many non-ASCII characters anyway (in most
# cases). Using "ignore" lets us decode the file even if it's
# invalid UTF-8 data.
# The file written to the MHTML file won't be modified by this
# decoding, since there we're taking the original bytestream.
try:
css_string = item.fileobj.getvalue().decode("utf-8")
except UnicodeDecodeError:
log.misc.warning("Invalid UTF-8 data in %s", url)
css_string = item.fileobj.getvalue().decode("utf-8", "ignore")
import_urls = _get_css_imports(css_string)
for import_url in import_urls: for import_url in import_urls:
import_url = import_url.decode("ascii")
absolute_url = url.resolved(QUrl(import_url)) absolute_url = url.resolved(QUrl(import_url))
self.fetch_url(absolute_url) self.fetch_url(absolute_url)