diff --git a/README.asciidoc b/README.asciidoc index 8204391ae..fb224ac04 100644 --- a/README.asciidoc +++ b/README.asciidoc @@ -100,6 +100,9 @@ The following software and libraries are required to run qutebrowser: * http://pygments.org/[pygments] * http://pyyaml.org/wiki/PyYAML[PyYAML] +The following libraries are optional and provide a better user experience: +* http://cthedot.de/cssutils/[cssutils] + To generate the documentation for the `:help` command, when using the git repository (rather than a release), http://asciidoc.org/[asciidoc] is needed. diff --git a/doc/help/commands.asciidoc b/doc/help/commands.asciidoc index 98081479d..bdd881bb1 100644 --- a/doc/help/commands.asciidoc +++ b/doc/help/commands.asciidoc @@ -145,13 +145,19 @@ Close the current window. [[download]] === download -Syntax: +:download ['url'] ['dest']+ +Syntax: +:download [*--mhtml*] [*--dest* 'DEST'] ['url'] ['dest-old']+ Download a given URL, or current page if no URL given. +The form `:download [url] [dest]` is deprecated, use `:download --dest [dest] [url]` instead. + ==== positional arguments * +'url'+: The URL to download. If not given, download the current page. -* +'dest'+: The file path to write the download to, or not given to ask. +* +'dest-old'+: (deprecated) Same as dest. + +==== optional arguments +* +*-m*+, +*--mhtml*+: Download the current page and all assets as mhtml file. +* +*-d*+, +*--dest*+: The file path to write the download to, or not given to ask. 
[[download-cancel]] === download-cancel diff --git a/qutebrowser/browser/commands.py b/qutebrowser/browser/commands.py index ae4b1da39..4e4527cae 100644 --- a/qutebrowser/browser/commands.py +++ b/qutebrowser/browser/commands.py @@ -37,7 +37,7 @@ import pygments.formatters from qutebrowser.commands import userscripts, cmdexc, cmdutils, runners from qutebrowser.config import config, configexc -from qutebrowser.browser import webelem, inspector, urlmarks +from qutebrowser.browser import webelem, inspector, urlmarks, downloads, mhtml from qutebrowser.keyinput import modeman from qutebrowser.utils import (message, usertypes, log, qtutils, urlutils, objreg, utils) @@ -1140,22 +1140,68 @@ class CommandDispatcher: cur.inspector.show() @cmdutils.register(instance='command-dispatcher', scope='window') - def download(self, url=None, dest=None): + def download(self, url=None, dest_old=None, *, mhtml_=False, dest=None): """Download a given URL, or current page if no URL given. + The form `:download [url] [dest]` is deprecated, use `:download --dest + [dest] [url]` instead. + Args: url: The URL to download. If not given, download the current page. + dest_old: (deprecated) Same as dest. dest: The file path to write the download to, or None to ask. + mhtml_: Download the current page and all assets as mhtml file. 
""" + if dest_old is not None: + message.warning( + self._win_id, ":download [url] [dest] is deprecated - use" + " download --dest [dest] [url]") + if dest is not None: + raise cmdexc.CommandError("Can't give two destinations for the" + " download.") + dest = dest_old + download_manager = objreg.get('download-manager', scope='window', window=self._win_id) if url: + if mhtml_: + raise cmdexc.CommandError("Can only download the current page" + " as mhtml.") url = urlutils.qurl_from_user_input(url) urlutils.raise_cmdexc_if_invalid(url) download_manager.get(url, filename=dest) else: - page = self._current_widget().page() - download_manager.get(self._current_url(), page=page) + if mhtml_: + self._download_mhtml(dest) + else: + page = self._current_widget().page() + download_manager.get(self._current_url(), page=page, + filename=dest) + + def _download_mhtml(self, dest=None): + """Download the current page as a MHTML file, including all assets. + + Args: + dest: The file path to write the download to. 
+ """ + tab_id = self._current_index() + if dest is None: + suggested_fn = self._current_title() + ".mht" + suggested_fn = utils.sanitize_filename(suggested_fn) + q = usertypes.Question() + q.text = "Save page to: " + q.mode = usertypes.PromptMode.text + q.completed.connect(q.deleteLater) + q.default = downloads.path_suggestion(suggested_fn) + q.answered.connect(functools.partial( + mhtml.start_download_checked, win_id=self._win_id, + tab_id=tab_id)) + message_bridge = objreg.get("message-bridge", scope="window", + window=self._win_id) + message_bridge.ask(q, blocking=False) + else: + mhtml.start_download_checked(dest, win_id=self._win_id, + tab_id=tab_id) @cmdutils.register(instance='command-dispatcher', scope='window', deprecated="Use :download instead.") diff --git a/qutebrowser/browser/downloads.py b/qutebrowser/browser/downloads.py index e87b3e3a2..4bea68fa8 100644 --- a/qutebrowser/browser/downloads.py +++ b/qutebrowser/browser/downloads.py @@ -49,7 +49,7 @@ ModelRole = usertypes.enum('ModelRole', ['item'], start=Qt.UserRole, RetryInfo = collections.namedtuple('RetryInfo', ['request', 'manager']) # Remember the last used directory -_last_used_directory = None +last_used_directory = None # All REFRESH_INTERVAL milliseconds, speeds will be recalculated and downloads @@ -57,20 +57,20 @@ _last_used_directory = None REFRESH_INTERVAL = 500 -def _download_dir(): +def download_dir(): """Get the download directory to use.""" directory = config.get('storage', 'download-directory') remember_dir = config.get('storage', 'remember-download-directory') - if remember_dir and _last_used_directory is not None: - return _last_used_directory + if remember_dir and last_used_directory is not None: + return last_used_directory elif directory is None: return standarddir.download() else: return directory -def _path_suggestion(filename): +def path_suggestion(filename): """Get the suggested file path. 
Args: @@ -79,15 +79,36 @@ def _path_suggestion(filename): suggestion = config.get('completion', 'download-path-suggestion') if suggestion == 'path': # add trailing '/' if not present - return os.path.join(_download_dir(), '') + return os.path.join(download_dir(), '') elif suggestion == 'filename': return filename elif suggestion == 'both': - return os.path.join(_download_dir(), filename) + return os.path.join(download_dir(), filename) else: raise ValueError("Invalid suggestion value {}!".format(suggestion)) +def create_full_filename(basename, filename): + """Create a full filename based on the given basename and filename. + + Args: + basename: The basename to use if filename is a directory. + filename: The path to a folder or file where you want to save. + + Return: + The full absolute path, or None if filename creation was not possible. + """ + if os.path.isabs(filename) and os.path.isdir(filename): + # We got an absolute directory from the user, so we save it under + # the default filename in that directory. + return os.path.join(filename, basename) + elif os.path.isabs(filename): + # We got an absolute filename from the user, so we save it under + # that filename. + return filename + return None + + class DownloadItemStats(QObject): """Statistics (bytes done, total bytes, time, etc.) about a download. @@ -201,6 +222,7 @@ class DownloadItem(QObject): fileobj: The file object to download the file to. reply: The QNetworkReply associated with this download. retry_info: A RetryInfo instance. + raw_headers: The headers sent by the server. _filename: The filename of the download. _redirects: How many time we were redirected already. 
_buffer: A BytesIO object to buffer incoming data until we know the @@ -255,6 +277,7 @@ class DownloadItem(QObject): self._filename = None self.init_reply(reply) self._win_id = win_id + self.raw_headers = {} def __repr__(self): return utils.get_repr(self, basename=self.basename) @@ -354,6 +377,7 @@ class DownloadItem(QObject): reply.finished.connect(self.on_reply_finished) reply.error.connect(self.on_reply_error) reply.readyRead.connect(self.on_ready_read) + reply.metaDataChanged.connect(self.on_meta_data_changed) self.retry_info = RetryInfo(request=reply.request(), manager=reply.manager()) if not self.fileobj: @@ -444,7 +468,7 @@ class DownloadItem(QObject): filename: The full filename to save the download to. None: special value to stop the download. """ - global _last_used_directory + global last_used_directory if self.fileobj is not None: raise ValueError("fileobj was already set! filename: {}, " "existing: {}, fileobj {}".format( @@ -454,13 +478,16 @@ class DownloadItem(QObject): # See https://github.com/The-Compiler/qutebrowser/issues/427 encoding = sys.getfilesystemencoding() filename = utils.force_encoding(filename, encoding) - if not self._create_full_filename(filename): + self._filename = create_full_filename(self.basename, filename) + if self._filename is None: # We only got a filename (without directory) or a relative path # from the user, so we append that to the default directory and # try again. 
- self._create_full_filename(os.path.join(_download_dir(), filename)) + self._filename = create_full_filename( + self.basename, os.path.join(download_dir(), filename)) - _last_used_directory = os.path.dirname(self._filename) + self.basename = os.path.basename(self._filename) + last_used_directory = os.path.dirname(self._filename) log.downloads.debug("Setting filename to {}".format(filename)) if os.path.isfile(self._filename): @@ -477,25 +504,6 @@ class DownloadItem(QObject): else: self._create_fileobj() - def _create_full_filename(self, filename): - """Try to create the full filename. - - Return: - True if the full filename was created, False otherwise. - """ - if os.path.isabs(filename) and os.path.isdir(filename): - # We got an absolute directory from the user, so we save it under - # the default filename in that directory. - self._filename = os.path.join(filename, self.basename) - return True - elif os.path.isabs(filename): - # We got an absolute filename from the user, so we save it under - # that filename. - self._filename = filename - self.basename = os.path.basename(self._filename) - return True - return False - def set_fileobj(self, fileobj): """"Set the file object to write the download to. @@ -593,6 +601,15 @@ class DownloadItem(QObject): if data is not None: self._buffer.write(data) + @pyqtSlot() + def on_meta_data_changed(self): + """Update the download's metadata.""" + if self.reply is None: + return + self.raw_headers = {} + for key, value in self.reply.rawHeaderPairs(): + self.raw_headers[bytes(key)] = bytes(value) + def _handle_redirect(self): """Handle a HTTP redirect. 
@@ -720,7 +737,7 @@ class DownloadManager(QAbstractListModel): prompt_download_directory = config.get( 'storage', 'prompt-download-directory') if not prompt_download_directory and not fileobj: - filename = _download_dir() + filename = download_dir() if fileobj is not None or filename is not None: return self.fetch_request(request, @@ -735,7 +752,7 @@ class DownloadManager(QAbstractListModel): suggested_fn = utils.force_encoding(suggested_fn, encoding) q = self._prepare_question() - q.default = _path_suggestion(suggested_fn) + q.default = path_suggestion(suggested_fn) message_bridge = objreg.get('message-bridge', scope='window', window=self._win_id) q.answered.connect( @@ -820,7 +837,7 @@ class DownloadManager(QAbstractListModel): prompt_download_directory = config.get('storage', 'prompt-download-directory') if not prompt_download_directory and not fileobj: - filename = _download_dir() + filename = download_dir() if filename is not None: download.set_filename(filename) @@ -829,7 +846,7 @@ class DownloadManager(QAbstractListModel): download.autoclose = False else: q = self._prepare_question() - q.default = _path_suggestion(suggested_filename) + q.default = path_suggestion(suggested_filename) q.answered.connect(download.set_filename) q.cancelled.connect(download.cancel) download.cancelled.connect(q.abort) diff --git a/qutebrowser/browser/mhtml.py b/qutebrowser/browser/mhtml.py new file mode 100644 index 000000000..7381f2d06 --- /dev/null +++ b/qutebrowser/browser/mhtml.py @@ -0,0 +1,511 @@ +# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et: + +# Copyright 2015 Daniel Schadt +# +# This file is part of qutebrowser. +# +# qutebrowser is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# qutebrowser is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with qutebrowser. If not, see <http://www.gnu.org/licenses/>. + +"""Utils for writing a MHTML file.""" + +import functools +import io +import os +import re +import sys +import collections +import uuid +import email.policy +import email.generator +import email.encoders +import email.mime.multipart + +from PyQt5.QtCore import QUrl + +from qutebrowser.browser import webelem, downloads +from qutebrowser.utils import log, objreg, message, usertypes, utils, urlutils + +try: + import cssutils +except (ImportError, re.error): + # Catching re.error because cssutils in earlier releases (<= 1.0) is broken + # on Python 3.5 + # See https://bitbucket.org/cthedot/cssutils/issues/52 + cssutils = None + +_File = collections.namedtuple('_File', + ['content', 'content_type', 'content_location', + 'transfer_encoding']) + + +_CSS_URL_PATTERNS = [re.compile(x) for x in [ + r"@import\s+'(?P<url>[^']+)'", + r'@import\s+"(?P<url>[^"]+)"', + r'''url\((?P<url>[^'"][^)]*)\)''', + r'url\("(?P<url>[^"]+)"\)', + r"url\('(?P<url>[^']+)'\)", +]] + + +def _get_css_imports_regex(data): + """Return all assets that are referenced in the given CSS document. + + The returned URLs are relative to the stylesheet's URL. + + Args: + data: The content of the stylesheet to scan as string. + """ + urls = [] + for pattern in _CSS_URL_PATTERNS: + for match in pattern.finditer(data): + url = match.group("url") + if url: + urls.append(url) + return urls + + +def _get_css_imports_cssutils(data, inline=False): + """Return all assets that are referenced in the given CSS document. + + The returned URLs are relative to the stylesheet's URL. + + Args: + data: The content of the stylesheet to scan as string. 
+ inline: True if the argument is an inline HTML style attribute. + """ + # We don't care about invalid CSS data, this will only litter the log + # output with CSS errors + parser = cssutils.CSSParser(loglevel=100, + fetcher=lambda url: (None, ""), validate=False) + if not inline: + sheet = parser.parseString(data) + return list(cssutils.getUrls(sheet)) + else: + urls = [] + declaration = parser.parseStyle(data) + # prop = background, color, margin, ... + for prop in declaration: + # value = red, 10px, url(foobar), ... + for value in prop.propertyValue: + if isinstance(value, cssutils.css.URIValue): + if value.uri: + urls.append(value.uri) + return urls + + +def _get_css_imports(data, inline=False): + """Return all assets that are referenced in the given CSS document. + + The returned URLs are relative to the stylesheet's URL. + + Args: + data: The content of the stylesheet to scan as string. + inline: True if the argument is an inline HTML style attribute. + """ + if cssutils is None: + return _get_css_imports_regex(data) + else: + return _get_css_imports_cssutils(data, inline) + + +def _check_rel(element): + """Return True if the element's rel attribute fits our criteria. + + rel has to contain 'stylesheet' or 'icon'. Also returns True if the rel + attribute is unset. + + Args: + element: The WebElementWrapper which should be checked. + """ + if 'rel' not in element: + return True + must_have = {'stylesheet', 'icon'} + rels = [rel.lower() for rel in element['rel'].split(' ')] + return any(rel in rels for rel in must_have) + + +MHTMLPolicy = email.policy.default.clone(linesep='\r\n', max_line_length=0) + + +# Encode the file using base64 encoding. +E_BASE64 = email.encoders.encode_base64 + + +# Encode the file using MIME quoted-printable encoding. +E_QUOPRI = email.encoders.encode_quopri + + +class MHTMLWriter(): + + """A class for outputting multiple files to a MHTML document. + + Attributes: + root_content: The root content as bytes. 
+ content_location: The url of the page as str. + content_type: The MIME-type of the root content as str. + _files: Mapping of location->_File namedtuple. + """ + + def __init__(self, root_content, content_location, content_type): + self.root_content = root_content + self.content_location = content_location + self.content_type = content_type + self._files = {} + + def add_file(self, location, content, content_type=None, + transfer_encoding=E_QUOPRI): + """Add a file to the given MHTML collection. + + Args: + location: The original location (URL) of the file. + content: The binary content of the file. + content_type: The MIME-type of the content (if available) + transfer_encoding: The transfer encoding to use for this file. + """ + self._files[location] = _File( + content=content, content_type=content_type, + content_location=location, transfer_encoding=transfer_encoding, + ) + + def write_to(self, fp): + """Output the MHTML file to the given file-like object. + + Args: + fp: The file-object, opened in "wb" mode. 
+ """ + msg = email.mime.multipart.MIMEMultipart( + 'related', '---=_qute-{}'.format(uuid.uuid4())) + + root = self._create_root_file() + msg.attach(root) + + for _, file_data in sorted(self._files.items()): + msg.attach(self._create_file(file_data)) + + gen = email.generator.BytesGenerator(fp, policy=MHTMLPolicy) + gen.flatten(msg) + + def _create_root_file(self): + """Return the root document as MIMEMultipart.""" + root_file = _File( + content=self.root_content, content_type=self.content_type, + content_location=self.content_location, transfer_encoding=E_QUOPRI, + ) + return self._create_file(root_file) + + def _create_file(self, f): + """Return the single given file as MIMEMultipart.""" + msg = email.mime.multipart.MIMEMultipart() + msg['Content-Location'] = f.content_location + # Get rid of the default type multipart/mixed + del msg['Content-Type'] + if f.content_type: + msg.set_type(f.content_type) + msg.set_payload(f.content) + f.transfer_encoding(msg) + return msg + + +class _Downloader(): + + """A class to download whole websites. + + Attributes: + web_view: The QWebView which contains the website that will be saved. + dest: Destination filename. + writer: The MHTMLWriter object which is used to save the page. + loaded_urls: A set of QUrls of finished asset downloads. + pending_downloads: A set of unfinished (url, DownloadItem) tuples. + _finished: A flag indicating if the file has already been written. + _used: A flag indicating if the downloader has already been used. + """ + + def __init__(self, web_view, dest): + self.web_view = web_view + self.dest = dest + self.writer = None + self.loaded_urls = {web_view.url()} + self.pending_downloads = set() + self._finished = False + self._used = False + + def run(self): + """Download and save the page. + + The object must not be reused, you should create a new one if + you want to download another page. 
+ """ + if self._used: + raise ValueError("Downloader already used") + self._used = True + web_url = self.web_view.url() + web_frame = self.web_view.page().mainFrame() + + self.writer = MHTMLWriter( + web_frame.toHtml().encode('utf-8'), + content_location=urlutils.encoded_url(web_url), + # I've found no way of getting the content type of a QWebView, but + # since we're using .toHtml, it's probably safe to say that the + # content-type is HTML + content_type='text/html; charset="UTF-8"', + ) + # Currently only downloading (stylesheets),