diff --git a/README.asciidoc b/README.asciidoc
index 8204391ae..fb224ac04 100644
--- a/README.asciidoc
+++ b/README.asciidoc
@@ -100,6 +100,9 @@ The following software and libraries are required to run qutebrowser:
* http://pygments.org/[pygments]
* http://pyyaml.org/wiki/PyYAML[PyYAML]
+The following libraries are optional and provide a better user experience:
+* http://cthedot.de/cssutils/[cssutils]
+
To generate the documentation for the `:help` command, when using the git
repository (rather than a release), http://asciidoc.org/[asciidoc] is needed.
diff --git a/doc/help/commands.asciidoc b/doc/help/commands.asciidoc
index 98081479d..bdd881bb1 100644
--- a/doc/help/commands.asciidoc
+++ b/doc/help/commands.asciidoc
@@ -145,13 +145,19 @@ Close the current window.
[[download]]
=== download
-Syntax: +:download ['url'] ['dest']+
+Syntax: +:download [*--mhtml*] [*--dest* 'DEST'] ['url'] ['dest-old']+
Download a given URL, or current page if no URL given.
+The form `:download [url] [dest]` is deprecated, use `:download --dest [dest] [url]` instead.
+
==== positional arguments
* +'url'+: The URL to download. If not given, download the current page.
-* +'dest'+: The file path to write the download to, or not given to ask.
+* +'dest-old'+: (deprecated) Same as dest.
+
+==== optional arguments
+* +*-m*+, +*--mhtml*+: Download the current page and all assets as mhtml file.
+* +*-d*+, +*--dest*+: The file path to write the download to, or not given to ask.
[[download-cancel]]
=== download-cancel
diff --git a/qutebrowser/browser/commands.py b/qutebrowser/browser/commands.py
index ae4b1da39..4e4527cae 100644
--- a/qutebrowser/browser/commands.py
+++ b/qutebrowser/browser/commands.py
@@ -37,7 +37,7 @@ import pygments.formatters
from qutebrowser.commands import userscripts, cmdexc, cmdutils, runners
from qutebrowser.config import config, configexc
-from qutebrowser.browser import webelem, inspector, urlmarks
+from qutebrowser.browser import webelem, inspector, urlmarks, downloads, mhtml
from qutebrowser.keyinput import modeman
from qutebrowser.utils import (message, usertypes, log, qtutils, urlutils,
objreg, utils)
@@ -1140,22 +1140,68 @@ class CommandDispatcher:
cur.inspector.show()
@cmdutils.register(instance='command-dispatcher', scope='window')
- def download(self, url=None, dest=None):
+ def download(self, url=None, dest_old=None, *, mhtml_=False, dest=None):
"""Download a given URL, or current page if no URL given.
+ The form `:download [url] [dest]` is deprecated, use `:download --dest
+ [dest] [url]` instead.
+
Args:
url: The URL to download. If not given, download the current page.
+ dest_old: (deprecated) Same as dest.
dest: The file path to write the download to, or None to ask.
+ mhtml_: Download the current page and all assets as mhtml file.
"""
+ if dest_old is not None:
+ message.warning(
+ self._win_id, ":download [url] [dest] is deprecated - use"
+ " download --dest [dest] [url]")
+ if dest is not None:
+ raise cmdexc.CommandError("Can't give two destinations for the"
+ " download.")
+ dest = dest_old
+
download_manager = objreg.get('download-manager', scope='window',
window=self._win_id)
if url:
+ if mhtml_:
+ raise cmdexc.CommandError("Can only download the current page"
+ " as mhtml.")
url = urlutils.qurl_from_user_input(url)
urlutils.raise_cmdexc_if_invalid(url)
download_manager.get(url, filename=dest)
else:
- page = self._current_widget().page()
- download_manager.get(self._current_url(), page=page)
+ if mhtml_:
+ self._download_mhtml(dest)
+ else:
+ page = self._current_widget().page()
+ download_manager.get(self._current_url(), page=page,
+ filename=dest)
+
+    def _download_mhtml(self, dest=None):
+        """Save the current page and its assets as a single MHTML file.
+
+        Args:
+            dest: Target file path; if None, a question is asked for it.
+        """
+        start = functools.partial(mhtml.start_download_checked,
+                                  win_id=self._win_id,
+                                  tab_id=self._current_index())
+        if dest is not None:
+            start(dest)
+            return
+
+        # No destination given: ask for one, then start from the answer.
+        title_fn = utils.sanitize_filename(self._current_title() + ".mht")
+        question = usertypes.Question()
+        question.text = "Save page to: "
+        question.mode = usertypes.PromptMode.text
+        question.completed.connect(question.deleteLater)
+        question.default = downloads.path_suggestion(title_fn)
+        question.answered.connect(start)
+        bridge = objreg.get("message-bridge", scope="window",
+                            window=self._win_id)
+        bridge.ask(question, blocking=False)
@cmdutils.register(instance='command-dispatcher', scope='window',
deprecated="Use :download instead.")
diff --git a/qutebrowser/browser/downloads.py b/qutebrowser/browser/downloads.py
index e87b3e3a2..4bea68fa8 100644
--- a/qutebrowser/browser/downloads.py
+++ b/qutebrowser/browser/downloads.py
@@ -49,7 +49,7 @@ ModelRole = usertypes.enum('ModelRole', ['item'], start=Qt.UserRole,
RetryInfo = collections.namedtuple('RetryInfo', ['request', 'manager'])
# Remember the last used directory
-_last_used_directory = None
+last_used_directory = None
# All REFRESH_INTERVAL milliseconds, speeds will be recalculated and downloads
@@ -57,20 +57,20 @@ _last_used_directory = None
REFRESH_INTERVAL = 500
-def _download_dir():
+def download_dir():
"""Get the download directory to use."""
directory = config.get('storage', 'download-directory')
remember_dir = config.get('storage', 'remember-download-directory')
- if remember_dir and _last_used_directory is not None:
- return _last_used_directory
+ if remember_dir and last_used_directory is not None:
+ return last_used_directory
elif directory is None:
return standarddir.download()
else:
return directory
-def _path_suggestion(filename):
+def path_suggestion(filename):
"""Get the suggested file path.
Args:
@@ -79,15 +79,36 @@ def _path_suggestion(filename):
suggestion = config.get('completion', 'download-path-suggestion')
if suggestion == 'path':
# add trailing '/' if not present
- return os.path.join(_download_dir(), '')
+ return os.path.join(download_dir(), '')
elif suggestion == 'filename':
return filename
elif suggestion == 'both':
- return os.path.join(_download_dir(), filename)
+ return os.path.join(download_dir(), filename)
else:
raise ValueError("Invalid suggestion value {}!".format(suggestion))
+def create_full_filename(basename, filename):
+    """Resolve the path a download should be written to.
+
+    Args:
+        basename: Name to use when filename points at a directory.
+        filename: User-supplied path to a file or an existing directory.
+
+    Return:
+        The absolute target path, or None if filename is not absolute.
+    """
+    if not os.path.isabs(filename):
+        # A bare name or relative path cannot be resolved here; that is
+        # left to the caller.
+        return None
+    if os.path.isdir(filename):
+        # An existing directory: save into it under the default basename.
+        return os.path.join(filename, basename)
+    # Any other absolute path is taken as the target file name itself.
+    return filename
+
+
class DownloadItemStats(QObject):
"""Statistics (bytes done, total bytes, time, etc.) about a download.
@@ -201,6 +222,7 @@ class DownloadItem(QObject):
fileobj: The file object to download the file to.
reply: The QNetworkReply associated with this download.
retry_info: A RetryInfo instance.
+ raw_headers: The headers sent by the server.
_filename: The filename of the download.
_redirects: How many time we were redirected already.
_buffer: A BytesIO object to buffer incoming data until we know the
@@ -255,6 +277,7 @@ class DownloadItem(QObject):
self._filename = None
self.init_reply(reply)
self._win_id = win_id
+ self.raw_headers = {}
def __repr__(self):
return utils.get_repr(self, basename=self.basename)
@@ -354,6 +377,7 @@ class DownloadItem(QObject):
reply.finished.connect(self.on_reply_finished)
reply.error.connect(self.on_reply_error)
reply.readyRead.connect(self.on_ready_read)
+ reply.metaDataChanged.connect(self.on_meta_data_changed)
self.retry_info = RetryInfo(request=reply.request(),
manager=reply.manager())
if not self.fileobj:
@@ -444,7 +468,7 @@ class DownloadItem(QObject):
filename: The full filename to save the download to.
None: special value to stop the download.
"""
- global _last_used_directory
+ global last_used_directory
if self.fileobj is not None:
raise ValueError("fileobj was already set! filename: {}, "
"existing: {}, fileobj {}".format(
@@ -454,13 +478,16 @@ class DownloadItem(QObject):
# See https://github.com/The-Compiler/qutebrowser/issues/427
encoding = sys.getfilesystemencoding()
filename = utils.force_encoding(filename, encoding)
- if not self._create_full_filename(filename):
+ self._filename = create_full_filename(self.basename, filename)
+ if self._filename is None:
# We only got a filename (without directory) or a relative path
# from the user, so we append that to the default directory and
# try again.
- self._create_full_filename(os.path.join(_download_dir(), filename))
+ self._filename = create_full_filename(
+ self.basename, os.path.join(download_dir(), filename))
- _last_used_directory = os.path.dirname(self._filename)
+ self.basename = os.path.basename(self._filename)
+ last_used_directory = os.path.dirname(self._filename)
log.downloads.debug("Setting filename to {}".format(filename))
if os.path.isfile(self._filename):
@@ -477,25 +504,6 @@ class DownloadItem(QObject):
else:
self._create_fileobj()
- def _create_full_filename(self, filename):
- """Try to create the full filename.
-
- Return:
- True if the full filename was created, False otherwise.
- """
- if os.path.isabs(filename) and os.path.isdir(filename):
- # We got an absolute directory from the user, so we save it under
- # the default filename in that directory.
- self._filename = os.path.join(filename, self.basename)
- return True
- elif os.path.isabs(filename):
- # We got an absolute filename from the user, so we save it under
- # that filename.
- self._filename = filename
- self.basename = os.path.basename(self._filename)
- return True
- return False
-
def set_fileobj(self, fileobj):
""""Set the file object to write the download to.
@@ -593,6 +601,15 @@ class DownloadItem(QObject):
if data is not None:
self._buffer.write(data)
+    @pyqtSlot()
+    def on_meta_data_changed(self):
+        """Refresh raw_headers from the reply's current raw header pairs."""
+        reply = self.reply
+        if reply is not None:
+            # Replaces any previously stored headers.
+            self.raw_headers = {bytes(name): bytes(content)
+                                for name, content in reply.rawHeaderPairs()}
+
def _handle_redirect(self):
"""Handle a HTTP redirect.
@@ -720,7 +737,7 @@ class DownloadManager(QAbstractListModel):
prompt_download_directory = config.get(
'storage', 'prompt-download-directory')
if not prompt_download_directory and not fileobj:
- filename = _download_dir()
+ filename = download_dir()
if fileobj is not None or filename is not None:
return self.fetch_request(request,
@@ -735,7 +752,7 @@ class DownloadManager(QAbstractListModel):
suggested_fn = utils.force_encoding(suggested_fn, encoding)
q = self._prepare_question()
- q.default = _path_suggestion(suggested_fn)
+ q.default = path_suggestion(suggested_fn)
message_bridge = objreg.get('message-bridge', scope='window',
window=self._win_id)
q.answered.connect(
@@ -820,7 +837,7 @@ class DownloadManager(QAbstractListModel):
prompt_download_directory = config.get('storage',
'prompt-download-directory')
if not prompt_download_directory and not fileobj:
- filename = _download_dir()
+ filename = download_dir()
if filename is not None:
download.set_filename(filename)
@@ -829,7 +846,7 @@ class DownloadManager(QAbstractListModel):
download.autoclose = False
else:
q = self._prepare_question()
- q.default = _path_suggestion(suggested_filename)
+ q.default = path_suggestion(suggested_filename)
q.answered.connect(download.set_filename)
q.cancelled.connect(download.cancel)
download.cancelled.connect(q.abort)
diff --git a/qutebrowser/browser/mhtml.py b/qutebrowser/browser/mhtml.py
new file mode 100644
index 000000000..7381f2d06
--- /dev/null
+++ b/qutebrowser/browser/mhtml.py
@@ -0,0 +1,511 @@
+# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et:
+
+# Copyright 2015 Daniel Schadt
+#
+# This file is part of qutebrowser.
+#
+# qutebrowser is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# qutebrowser is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with qutebrowser.  If not, see <http://www.gnu.org/licenses/>.
+
+"""Utils for writing a MHTML file."""
+
+import functools
+import io
+import os
+import re
+import sys
+import collections
+import uuid
+import email.policy
+import email.generator
+import email.encoders
+import email.mime.multipart
+
+from PyQt5.QtCore import QUrl
+
+from qutebrowser.browser import webelem, downloads
+from qutebrowser.utils import log, objreg, message, usertypes, utils, urlutils
+
+try:
+ import cssutils
+except (ImportError, re.error):
+ # Catching re.error because cssutils in earlier releases (<= 1.0) is broken
+ # on Python 3.5
+ # See https://bitbucket.org/cthedot/cssutils/issues/52
+ cssutils = None
+
+_File = collections.namedtuple('_File',
+ ['content', 'content_type', 'content_location',
+ 'transfer_encoding'])
+
+
+_CSS_URL_PATTERNS = [re.compile(x) for x in [
+    r"@import\s+'(?P<url>[^']+)'",  # @import 'style.css'
+    r'@import\s+"(?P<url>[^"]+)"',  # @import "style.css"
+    r'''url\((?P<url>[^'"][^)]*)\)''',  # url(image.png)
+    r'url\("(?P<url>[^"]+)"\)',  # url("image.png")
+    r"url\('(?P<url>[^']+)'\)",  # url('image.png')
+]]
+
+
+def _get_css_imports_regex(data):
+    """Collect the asset URLs referenced in a stylesheet via regex matching.
+
+    The URLs are returned as written, i.e. relative to the stylesheet's URL.
+    They are grouped by pattern, in the order of _CSS_URL_PATTERNS.
+
+    Args:
+        data: The content of the stylesheet to scan as string.
+    """
+    captured = (
+        hit.group("url")
+        for regex in _CSS_URL_PATTERNS
+        for hit in regex.finditer(data)
+    )
+    return [url for url in captured if url]
+
+
+def _get_css_imports_cssutils(data, inline=False):
+    """Collect the asset URLs referenced in CSS data by parsing with cssutils.
+
+    The URLs are returned as written, i.e. relative to the stylesheet's URL.
+
+    Args:
+        data: The content of the stylesheet to scan as string.
+        inline: True if data is the value of an inline HTML style attribute.
+    """
+    # loglevel=100 keeps errors about invalid CSS out of the log, and the
+    # stub fetcher answers every URL with (None, "").
+    css_parser = cssutils.CSSParser(
+        loglevel=100, fetcher=lambda url: (None, ""), validate=False)
+
+    if inline:
+        found = []
+        # Each property (background, color, margin, ...) holds a list of
+        # values (red, 10px, url(foobar), ...); only URI values matter.
+        for css_property in css_parser.parseStyle(data):
+            for item in css_property.propertyValue:
+                if isinstance(item, cssutils.css.URIValue) and item.uri:
+                    found.append(item.uri)
+        return found
+
+    # A complete stylesheet: let cssutils list its URLs.
+    stylesheet = css_parser.parseString(data)
+    return list(cssutils.getUrls(stylesheet))
+
+
+def _get_css_imports(data, inline=False):
+    """Collect the asset URLs referenced in CSS data.
+
+    Uses cssutils if it could be imported and regex matching otherwise. The
+    URLs are returned relative to the stylesheet's URL.
+
+    Args:
+        data: The content of the stylesheet to scan as string.
+        inline: True if data is the value of an inline HTML style attribute.
+    """
+    if cssutils is not None:
+        return _get_css_imports_cssutils(data, inline)
+    return _get_css_imports_regex(data)
+
+
+def _check_rel(element):
+    """Return True if the element's rel attribute fits our criteria.
+
+    rel has to contain the 'stylesheet' or 'icon' token (compared
+    case-insensitively). Also returns True if the rel attribute is unset.
+
+    Args:
+        element: The WebElementWrapper which should be checked.
+    """
+    if 'rel' not in element:
+        return True
+    # Tokens may be separated by any whitespace, not only single spaces.
+    rels = {rel.lower() for rel in element['rel'].split()}
+    return not rels.isdisjoint({'stylesheet', 'icon'})
+
+
+MHTMLPolicy = email.policy.default.clone(linesep='\r\n', max_line_length=0)
+
+
+# Encode the file using base64 encoding.
+E_BASE64 = email.encoders.encode_base64
+
+
+# Encode the file using MIME quoted-printable encoding.
+E_QUOPRI = email.encoders.encode_quopri
+
+
+class MHTMLWriter():
+
+    """Collects a root document plus assets and writes them out as MHTML.
+
+    Attributes:
+        root_content: The root content as bytes.
+        content_location: The url of the page as str.
+        content_type: The MIME-type of the root content as str.
+        _files: Mapping of location->_File namedtuple.
+    """
+
+    def __init__(self, root_content, content_location, content_type):
+        self._files = {}
+        self.content_type = content_type
+        self.content_location = content_location
+        self.root_content = root_content
+
+    def add_file(self, location, content, content_type=None,
+                 transfer_encoding=E_QUOPRI):
+        """Register an asset; an earlier entry for the location is replaced.
+
+        Args:
+            location: The original location (URL) of the file.
+            content: The binary content of the file.
+            content_type: The MIME-type of the content (if available)
+            transfer_encoding: The transfer encoding to use for this file.
+        """
+        # Keyed by location, so every location is stored only once.
+        self._files[location] = _File(
+            content, content_type, content_location=location,
+            transfer_encoding=transfer_encoding)
+
+    def write_to(self, fp):
+        """Serialize the root document and all added files as MHTML into fp.
+
+        Args:
+            fp: The file-object, opened in "wb" mode.
+        """
+        boundary = '---=_qute-{}'.format(uuid.uuid4())
+        container = email.mime.multipart.MIMEMultipart('related', boundary)
+
+        # The root document goes first; the remaining files follow in
+        # sorted order of their location.
+        container.attach(self._create_root_file())
+        for location in sorted(self._files):
+            container.attach(self._create_file(self._files[location]))
+
+        writer = email.generator.BytesGenerator(fp, policy=MHTMLPolicy)
+        writer.flatten(container)
+
+    def _create_root_file(self):
+        """Return the root document as MIMEMultipart, encoded via E_QUOPRI."""
+        return self._create_file(_File(
+            content=self.root_content,
+            content_type=self.content_type,
+            content_location=self.content_location,
+            transfer_encoding=E_QUOPRI))
+
+    def _create_file(self, f):
+        """Build the MIMEMultipart object for a single _File entry."""
+        part = email.mime.multipart.MIMEMultipart()
+        part['Content-Location'] = f.content_location
+        # MIMEMultipart defaults to multipart/mixed; drop that header again.
+        del part['Content-Type']
+        if f.content_type:
+            part.set_type(f.content_type)
+        part.set_payload(f.content)
+        f.transfer_encoding(part)
+        return part
+
+
+class _Downloader():
+
+ """A class to download whole websites.
+
+ Attributes:
+ web_view: The QWebView which contains the website that will be saved.
+ dest: Destination filename.
+ writer: The MHTMLWriter object which is used to save the page.
+ loaded_urls: A set of QUrls of finished asset downloads.
+ pending_downloads: A set of unfinished (url, DownloadItem) tuples.
+ _finished: A flag indicating if the file has already been written.
+ _used: A flag indicating if the downloader has already been used.
+ """
+
+ def __init__(self, web_view, dest):
+ self.web_view = web_view
+ self.dest = dest
+ self.writer = None
+ self.loaded_urls = {web_view.url()}
+ self.pending_downloads = set()
+ self._finished = False
+ self._used = False
+
+ def run(self):
+ """Download and save the page.
+
+ The object must not be reused, you should create a new one if
+ you want to download another page.
+ """
+ if self._used:
+ raise ValueError("Downloader already used")
+ self._used = True
+ web_url = self.web_view.url()
+ web_frame = self.web_view.page().mainFrame()
+
+ self.writer = MHTMLWriter(
+ web_frame.toHtml().encode('utf-8'),
+ content_location=urlutils.encoded_url(web_url),
+ # I've found no way of getting the content type of a QWebView, but
+ # since we're using .toHtml, it's probably safe to say that the
+ # content-type is HTML
+ content_type='text/html; charset="UTF-8"',
+ )
+ # Currently only downloading (stylesheets),