# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et:

# Copyright 2015 Daniel Schadt
#
# This file is part of qutebrowser.
#
# qutebrowser is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# qutebrowser is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with qutebrowser. If not, see <http://www.gnu.org/licenses/>.

"""Utils for writing a MHTML file."""

import functools
import io
import os
import re
import collections
import uuid

from email import policy, generator, encoders
from email.mime import multipart

from PyQt5.QtCore import QUrl

from qutebrowser.browser import webelem
from qutebrowser.utils import log, objreg, message


_File = collections.namedtuple("_File",
                               ["content", "content_type", "content_location",
                                "transfer_encoding"])


_CSS_URL_PATTERNS = [re.compile(x) for x in [
    r"@import '(?P<url>[^']+)'",
    r'@import "(?P<url>[^"]+)"',
    r'''url\((?P<url>[^'"][^)]*)\)''',
    r'url\("(?P<url>[^"]+)"\)',
    r"url\('(?P<url>[^']+)'\)",
]]


def _get_css_imports(data):
    """Return all assets that are referenced in the given CSS document.

    The returned URLs are relative to the stylesheet's URL.

    Args:
        data: The content of the stylesheet to scan as string.
    """
    urls = []
    for pattern in _CSS_URL_PATTERNS:
        for match in pattern.finditer(data):
            url = match.group("url")
            if url:
                urls.append(url)
    return urls
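
# A quick sanity check for the patterns above (illustrative values only, not
# part of any real test suite):
#
#     >>> _get_css_imports("@import 'fonts.css'; h1 { background: url(bg.png) }")
#     ['fonts.css', 'bg.png']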


# The MHTML container is MIME-based (RFC 2557), so lines must end with CRLF;
# max_line_length=0 disables line wrapping, which would otherwise fold long
# Content-Location headers.
MHTMLPolicy = policy.default.clone(linesep="\r\n", max_line_length=0)


E_BASE64 = encoders.encode_base64
"""Encode the file using base64 encoding."""

E_QUOPRI = encoders.encode_quopri
"""Encode the file using MIME quoted-printable encoding."""


class MHTMLWriter():

    """A class for outputting multiple files to a MHTML document.

    Attributes:
        root_content: The root content as bytes.
        content_location: The url of the page as str.
        content_type: The MIME-type of the root content as str.
        _files: Mapping of location->_File struct.
    """

    BOUNDARY = "---=_qute-" + str(uuid.uuid4())

    def __init__(self, root_content, content_location, content_type):
        self.root_content = root_content
        self.content_location = content_location
        self.content_type = content_type
        self._files = {}

    def add_file(self, location, content, content_type=None,
                 transfer_encoding=E_QUOPRI):
        """Add a file to the given MHTML collection.

        Args:
            location: The original location (URL) of the file.
            content: The binary content of the file.
            content_type: The MIME-type of the content (if available).
            transfer_encoding: The transfer encoding to use for this file.
        """
        self._files[location] = _File(
            content=content, content_type=content_type,
            content_location=location, transfer_encoding=transfer_encoding,
        )

    def remove_file(self, location):
        """Remove a file.

        Args:
            location: The URL that identifies the file.
        """
        del self._files[location]

    def write_to(self, fp):
        """Output the MHTML file to the given file-like object.

        Args:
            fp: The file object, opened in "wb" mode.
        """
        msg = multipart.MIMEMultipart("related", self.BOUNDARY)

        root = self._create_root_file()
        msg.attach(root)

        for file_data in self._files.values():
            msg.attach(self._create_file(file_data))

        gen = generator.BytesGenerator(fp, policy=MHTMLPolicy)
        gen.flatten(msg)

    def _create_root_file(self):
        """Return the root document as MIMEMultipart."""
        root_file = _File(
            content=self.root_content, content_type=self.content_type,
            content_location=self.content_location, transfer_encoding=E_QUOPRI,
        )
        return self._create_file(root_file)

    def _create_file(self, f):
        """Return the single given file as MIMEMultipart."""
        msg = multipart.MIMEMultipart()
        msg["Content-Location"] = f.content_location
        # Get rid of the default type multipart/mixed
        del msg["Content-Type"]
        if f.content_type:
            msg.set_type(f.content_type)
        msg.set_payload(f.content)
        f.transfer_encoding(msg)
        return msg
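

# A minimal usage sketch for MHTMLWriter (hypothetical values, for
# illustration only):
#
#     writer = MHTMLWriter(b"<html>...</html>",
#                          content_location="http://example.com/",
#                          content_type='text/html; charset="UTF-8"')
#     writer.add_file("http://example.com/logo.png", png_bytes,
#                     content_type="image/png", transfer_encoding=E_BASE64)
#     with open("page.mht", "wb") as fp:
#         writer.write_to(fp)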


class _Downloader():

    """A class to download whole websites.

    Attributes:
        web_view: The QWebView which contains the website that will be saved.
        dest: Destination filename.
        writer: The MHTMLWriter object which is used to save the page.
        loaded_urls: A set of QUrls of finished asset downloads.
        pending_downloads: A set of unfinished (url, DownloadItem) tuples.
        _finished: A flag indicating if the file has already been written.
        _used: A flag indicating if the downloader has already been used.
    """

    def __init__(self, web_view, dest):
        self.web_view = web_view
        self.dest = dest
        self.writer = None
        self.loaded_urls = {web_view.url()}
        self.pending_downloads = set()
        self._finished = False
        self._used = False

    def run(self):
        """Download and save the page.

        The object must not be reused; create a new one if you want to
        download another page.
        """
        if self._used:
            raise ValueError("Downloader already used")
        self._used = True
        web_url = self.web_view.url()
        web_frame = self.web_view.page().mainFrame()

        self.writer = MHTMLWriter(
            web_frame.toHtml().encode("utf-8"),
            content_location=web_url.toString(),
            # I've found no way of getting the content type of a QWebView, but
            # since we're using .toHtml, it's probably safe to say that the
            # content type is HTML.
            content_type='text/html; charset="UTF-8"',
        )
        # Currently we only download <link> (stylesheets), <script>
        # (JavaScript) and <img> (image) elements.
        elements = (web_frame.findAllElements("link") +
                    web_frame.findAllElements("script") +
                    web_frame.findAllElements("img"))

        for element in elements:
            element = webelem.WebElementWrapper(element)
            if "src" in element:
                element_url = element["src"]
            elif "href" in element:
                element_url = element["href"]
            else:
                # Might be an inline <script> tag or something else without a
                # URL to fetch.
                continue
            absolute_url = web_url.resolved(QUrl(element_url))
            self.fetch_url(absolute_url)

        styles = web_frame.findAllElements("style")
        for style in styles:
            style = webelem.WebElementWrapper(style)
            if "type" in style and style["type"] != "text/css":
                continue
            for element_url in _get_css_imports(str(style)):
                self.fetch_url(web_url.resolved(QUrl(element_url)))

        # Search for references in inline styles.
        for element in web_frame.findAllElements("*"):
            element = webelem.WebElementWrapper(element)
            if "style" not in element:
                continue
            style = element["style"]
            for element_url in _get_css_imports(style):
                self.fetch_url(web_url.resolved(QUrl(element_url)))

        # Shortcut if no assets need to be downloaded; otherwise the file
        # would never be saved. This also catches downloads that completed
        # before their finished signal could be connected.
        self.collect_zombies()
        if not self.pending_downloads and not self._finished:
            self.finish_file()

    def fetch_url(self, url):
        """Download the given url and add the file to the collection.

        Args:
            url: The file to download as QUrl.
        """
        if url.scheme() == "data":
            # data: URLs carry their content inline, so there is nothing to
            # download.
            return
        # Prevent loading an asset twice.
        if url in self.loaded_urls:
            return
        self.loaded_urls.add(url)

        log.misc.debug("loading asset at %s", url)

        download_manager = objreg.get("download-manager", scope="window",
                                      window="current")
        item = download_manager.get(url, fileobj=_NoCloseBytesIO(),
                                    auto_remove=True)
        self.pending_downloads.add((url, item))
        item.finished.connect(
            functools.partial(self.finished, url, item))
        item.error.connect(
            functools.partial(self.error, url, item))
        item.cancelled.connect(
            functools.partial(self.error, url, item))

    def finished(self, url, item):
        """Callback when a single asset is downloaded.

        Args:
            url: The original url of the asset as QUrl.
            item: The DownloadItem given by the DownloadManager.
        """
        self.pending_downloads.remove((url, item))
        mime = item.raw_headers.get(b"Content-Type", b"")
        mime = mime.decode("ascii", "ignore")

        if mime.lower() == "text/css":
            # We can't always assume that CSS files are UTF-8, but CSS files
            # shouldn't contain many non-ASCII characters anyway (in most
            # cases). Using "ignore" lets us decode the file even if it's
            # invalid UTF-8 data.
            # The file written to the MHTML file won't be modified by this
            # decoding, since we take the original bytestream there.
            try:
                css_string = item.fileobj.getvalue().decode("utf-8")
            except UnicodeDecodeError:
                log.misc.warning("Invalid UTF-8 data in %s", url)
                css_string = item.fileobj.getvalue().decode("utf-8", "ignore")
            import_urls = _get_css_imports(css_string)
            for import_url in import_urls:
                absolute_url = url.resolved(QUrl(import_url))
                self.fetch_url(absolute_url)

        encode = E_QUOPRI if mime.startswith("text/") else E_BASE64
        self.writer.add_file(url.toString(), item.fileobj.getvalue(), mime,
                             encode)
        item.fileobj.actual_close()
        if self.pending_downloads:
            return
        self.finish_file()

    def error(self, url, item, *_args):
        """Callback when a download error occurred.

        Args:
            url: The original url of the asset as QUrl.
            item: The DownloadItem given by the DownloadManager.
        """
        try:
            self.pending_downloads.remove((url, item))
        except KeyError:
            # This might happen if .collect_zombies() already called
            # .finished() and the error handler fires afterwards.
            log.misc.debug("Oops! Download already gone: %s", item)
            return
        item.fileobj.actual_close()
        # Add a placeholder entry with empty content for the failed asset.
        self.writer.add_file(url.toString(), b"")
        if self.pending_downloads:
            return
        self.finish_file()

    def finish_file(self):
        """Save the file to the filename given in __init__."""
        if self._finished:
            log.misc.debug("finish_file called twice, ignored!")
            return
        self._finished = True
        log.misc.debug("All assets downloaded, ready to finish off!")
        with open(self.dest, "wb") as file_output:
            self.writer.write_to(file_output)
        message.info("current", "Page saved as {}".format(self.dest))

    def collect_zombies(self):
        """Collect done downloads and add their data to the MHTML file.

        This is needed in case a download finishes before its finished signal
        is connected.
        """
        items = set((url, item) for url, item in self.pending_downloads
                    if item.done)
        log.misc.debug("Zombie downloads: %s", items)
        for url, item in items:
            self.finished(url, item)
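

# A sketch of the _Downloader flow (illustrative; assumes a "webview" and a
# "download-manager" are registered in objreg, as start_download() below
# expects):
#
#     downloader = _Downloader(web_view, "/tmp/page.mht")
#     downloader.run()    # queues asset downloads ...
#     # ... callbacks fire as downloads finish; the last one calls
#     # finish_file(), which writes the MHTML document to disk.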


class _NoCloseBytesIO(io.BytesIO):  # pylint: disable=no-init

    """BytesIO that can't be .close()d.

    This is needed to prevent the DownloadManager from closing the stream and
    thus discarding the data.
    """

    def close(self):
        """Do nothing."""
        pass

    def actual_close(self):
        """Close the stream."""
        super().close()


def start_download(dest):
    """Start downloading the current page and all assets to a MHTML file.

    Args:
        dest: The filename where the resulting file should be saved.
    """
    dest = os.path.expanduser(dest)
    web_view = objreg.get("webview", scope="tab", tab="current")
    loader = _Downloader(web_view, dest)
    loader.run()
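
# A hypothetical invocation, e.g. from a save-page command handler ("~" is
# expanded via os.path.expanduser above):
#
#     start_download("~/page.mht")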