Initial version of website downloader

Saving websites as MHTML via :download-whole. Still needs some cleanup and an "ask for save path" prompt.
2015-09-19 20:06:15 +02:00 · 2015-09-19 20:06:15 +02:00 · fbe5386e56
commit fbe5386e56
parent 99e090db78
3 changed files with 236 additions and 1 deletions
--- a/qutebrowser/browser/commands.py
+++ b/qutebrowser/browser/commands.py
@ -42,7 +42,7 @@ from qutebrowser.keyinput import modeman
 from qutebrowser.utils import (message, usertypes, log, qtutils, urlutils,
                               objreg, utils)
 from qutebrowser.utils.usertypes import KeyMode
-from qutebrowser.misc import editor, guiprocess
+from qutebrowser.misc import editor, guiprocess, mhtml


 class CommandDispatcher:
@ -1157,6 +1157,16 @@ class CommandDispatcher:
            page = self._current_widget().page()
            download_manager.get(self._current_url(), page=page)

+    @cmdutils.register(instance='command-dispatcher', scope='window')
+    def download_whole(self, dest):
+        """Download the current page as a MHTML file, including all assets.
+
+        Assets are the resources referenced by the page (e.g. images).
+
+        Args:
+            dest: The file path to write the download to.
+        """
+        mhtml.start_download(dest)
+
    @cmdutils.register(instance='command-dispatcher', scope='window',
                       deprecated="Use :download instead.")
    def download_page(self):
--- a/qutebrowser/browser/downloads.py
+++ b/qutebrowser/browser/downloads.py
@ -201,6 +201,7 @@ class DownloadItem(QObject):
        fileobj: The file object to download the file to.
        reply: The QNetworkReply associated with this download.
        retry_info: A RetryInfo instance.
+        raw_headers: The headers sent by the server.
        _filename: The filename of the download.
        _redirects: How many time we were redirected already.
        _buffer: A BytesIO object to buffer incoming data until we know the
@ -255,6 +256,7 @@ class DownloadItem(QObject):
        self._filename = None
        self.init_reply(reply)
        self._win_id = win_id
+        self.raw_headers = {}

    def __repr__(self):
        return utils.get_repr(self, basename=self.basename)
@ -354,6 +356,7 @@ class DownloadItem(QObject):
        reply.finished.connect(self.on_reply_finished)
        reply.error.connect(self.on_reply_error)
        reply.readyRead.connect(self.on_ready_read)
+        reply.metaDataChanged.connect(self.on_meta_data_change)
        self.retry_info = RetryInfo(request=reply.request(),
                                    manager=reply.manager())
        if not self.fileobj:
@ -582,6 +585,9 @@ class DownloadItem(QObject):
        if code == QNetworkReply.OperationCanceledError:
            return
        else:
+            if self.reply is None:
+                log.downloads.debug("QNetworkReply disappeared %s", self)
+                return
            self._die(self.reply.errorString())

    @pyqtSlot()
@ -593,6 +599,14 @@ class DownloadItem(QObject):
        if data is not None:
            self._buffer.write(data)

+    @pyqtSlot()
+    def on_meta_data_change(self):
+        """Rebuild raw_headers from the raw header pairs of the reply.
+
+        Does nothing when the reply is already gone.
+        """
+        reply = self.reply
+        if reply is None:
+            return
+        # Header names and values are converted to plain bytes objects.
+        self.raw_headers = {bytes(name): bytes(content)
+                            for name, content in reply.rawHeaderPairs()}
+
    def _handle_redirect(self):
        """Handle a HTTP redirect.

--- a/qutebrowser/misc/mhtml.py
+++ b/qutebrowser/misc/mhtml.py
@ -0,0 +1,211 @@
+# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et:
+
+# Copyright 2015 Daniel Schadt
+#
+# This file is part of qutebrowser.
+#
+# qutebrowser is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# qutebrowser is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with qutebrowser.  If not, see <http://www.gnu.org/licenses/>.
+
+"""Utils for writing a MHTML file."""
+
+import functools
+import io
+
+from collections import namedtuple
+from base64 import b64encode
+from urllib.parse import urljoin
+
+from PyQt5.QtCore import QUrl
+from PyQt5.QtNetwork import QNetworkRequest, QNetworkReply
+
+from qutebrowser.utils import log, objreg
+
+
+# One part of the MHTML file: the payload and the values for its headers.
+_File = namedtuple("_File", ["content", "content_type", "content_location",
+                             "transfer_encoding"])
+
+
+# A transfer encoding is a (name, encoder function) pair.
+
+# No transfer encoding, copy the bytes from input to output.
+E_NONE = (None, lambda data: data)
+
+# Encode the file using base64 encoding.
+E_BASE64 = ("BASE64", b64encode)
+
+
+class MHTMLWriter(object):
+    """Aggregate multiple files and output them as one MHTML file.
+
+    Attributes:
+        root_content: The binary content of the root document.
+        content_location: The location (URL) of the root document.
+        content_type: The MIME-type of the root document.
+        _files: Maps the location of each added file to its _File record.
+    """
+
+    BOUNDARY = b"qute-mhtml"
+
+    def __init__(self, root_content=None, content_location=None,
+                 content_type=None):
+        self.root_content = root_content
+        self.content_location = content_location
+        self.content_type = content_type
+
+        self._files = {}
+
+    def add_file(self, location, content, content_type=None,
+                 transfer_encoding=E_BASE64):
+        """Add a file to the given MHTML collection.
+
+        A file that was added before under the same location is replaced.
+
+        Args:
+            location: The original location (URL) of the file.
+            content: The binary content of the file.
+            content_type: The MIME-type of the content (if available)
+            transfer_encoding: The transfer encoding to use for this file.
+        """
+        self._files[location] = _File(
+            content=content, content_type=content_type,
+            content_location=location, transfer_encoding=transfer_encoding,
+        )
+
+    def remove_file(self, location):
+        """Remove a file.
+
+        Args:
+            location: The URL that identifies the file.
+
+        Raises:
+            KeyError: If no file was added under the given location.
+        """
+        del self._files[location]
+
+    def write_to(self, fp):
+        """Output the MHTML file to the given file-like object.
+
+        The header comes first, then the root document, then every added
+        file, and finally the closing multipart delimiter.
+
+        Args:
+            fp: The file-object, opened in "wb" mode.
+
+        Raises:
+            ValueError: If root_content, content_location or content_type
+                        is not set. Nothing has been written in that case.
+        """
+        self._output_header(fp)
+        self._output_root_file(fp)
+        for file_data in self._files.values():
+            self._output_file(fp, file_data)
+        # A multipart body has to be terminated by "--boundary--", otherwise
+        # readers can't tell a complete file from a truncated one.
+        fp.write(b"--")
+        fp.write(self.BOUNDARY)
+        fp.write(b"--\n")
+
+    def _output_header(self, fp):
+        """Validate the root document data and write the top-level header."""
+        if self.root_content is None:
+            raise ValueError("root_content must be set")
+        if self.content_location is None:
+            raise ValueError("content_location must be set")
+        if self.content_type is None:
+            raise ValueError("content_type must be set for the root document")
+
+        fp.write(b"Content-Location: ")
+        fp.write(self.content_location.encode("utf-8"))
+        fp.write(b'\nContent-Type: multipart/related;boundary="')
+        fp.write(self.BOUNDARY)
+        fp.write(b'";type="')
+        fp.write(self.content_type.encode("utf-8"))
+        fp.write(b'"\n\n')
+
+    def _output_root_file(self, fp):
+        """Write the root document as the first part, base64 encoded."""
+        root_file = _File(
+            content=self.root_content, content_type=self.content_type,
+            content_location=self.content_location, transfer_encoding=E_BASE64
+        )
+        self._output_file(fp, root_file)
+
+    def _output_file(self, fp, file_struct):
+        """Write one part: the boundary line, its headers and its content.
+
+        Args:
+            fp: The file-object to write to.
+            file_struct: The _File record describing the part.
+        """
+        fp.write(b"--")
+        fp.write(self.BOUNDARY)
+        fp.write(b"\nContent-Location: ")
+        fp.write(file_struct.content_location.encode("utf-8"))
+        if file_struct.content_type is not None:
+            fp.write(b"\nContent-Type: ")
+            fp.write(file_struct.content_type.encode("utf-8"))
+        encoding_name, encoding_func = file_struct.transfer_encoding
+        # E_NONE has no name, so no Content-Transfer-Encoding header is
+        # written for it.
+        if encoding_name:
+            fp.write(b"\nContent-Transfer-Encoding: ")
+            fp.write(encoding_name.encode("utf-8"))
+        fp.write(b"\n\n")
+        fp.write(encoding_func(file_struct.content))
+        fp.write(b"\n\n")
+
+
+def start_download(dest):
+    """Start downloading the current page and all assets to a MHTML file.
+
+    Every asset is requested through the download manager. The file is
+    written once each requested asset has finished, failed or been
+    cancelled, or right away if the page references no assets.
+
+    Args:
+        dest: The filename where the resulting file should be saved.
+    """
+    download_manager = objreg.get("download-manager", scope="window",
+                                  window="current")
+    web_view = objreg.get("webview", scope="tab", tab="current")
+    web_url_str = web_view.url().toString()
+    web_frame = web_view.page().mainFrame()
+
+    writer = MHTMLWriter()
+    writer.root_content = web_frame.toHtml().encode("utf-8")
+    writer.content_location = web_url_str
+    # I've found no way of getting the content type of a QWebView, but since
+    # we're using .toHtml, it's probably safe to say that the content-type is
+    # HTML
+    writer.content_type = "text/html"
+    # Currently only downloading <link> (stylesheets), <script> (javascript) and
+    # <img> (image) elements.
+    elements = (web_frame.findAllElements("link") +
+                web_frame.findAllElements("script") +
+                web_frame.findAllElements("img"))
+
+    loaded_urls = set()
+    pending_downloads = set()
+    # Only becomes True after the loop at the bottom has requested every
+    # asset, so a download completing in between can't trigger an early
+    # write of an incomplete file.
+    all_queued = False
+    file_written = False
+
+    def finish_file():
+        """Write the collected page and assets to dest, at most once."""
+        nonlocal file_written
+        if file_written:
+            return
+        file_written = True
+        # If we get here, all assets are downloaded and we're ready to finish
+        # the file
+        log.misc.debug("All assets downloaded, ready to finish off!")
+        with open(dest, "wb") as file_output:
+            writer.write_to(file_output)
+
+    def item_done(item):
+        """Stop waiting for item and finish the file if it was the last."""
+        # discard instead of remove: an item may be reported more than once,
+        # e.g. through both the error and the cancelled signal.
+        pending_downloads.discard(item)
+        if all_queued and not pending_downloads:
+            finish_file()
+
+    # Callback for when a single asset is downloaded
+    # closes over the local variables
+    def finished(name, item):
+        if item not in pending_downloads:
+            # Already handled as failed or cancelled.
+            return
+        mime = None
+        # Header names are case-insensitive, so don't rely on the exact
+        # spelling the server used.
+        for header, value in item.raw_headers.items():
+            if header.lower() == b"content-type":
+                # An empty value is treated like a missing header, so no
+                # empty Content-Type line ends up in the file.
+                mime = value.decode("ascii", "ignore") or None
+                break
+        writer.add_file(name, item.fileobj.getvalue(), mime)
+        item_done(item)
+
+    # Callback for failed and cancelled downloads; the asset is left out.
+    def error(item, *args):
+        item_done(item)
+
+    for element in elements:
+        element_url = element.attribute("src")
+        if not element_url:
+            element_url = element.attribute("href")
+        if not element_url:
+            # Might be a local <script> tag or something else
+            continue
+        absolute_url_str = urljoin(web_url_str, element_url)
+        name = absolute_url_str if element_url.startswith("//") else element_url
+        # Prevent loading an asset twice
+        if absolute_url_str in loaded_urls:
+            continue
+        loaded_urls.add(absolute_url_str)
+
+        log.misc.debug("asset at %s", absolute_url_str)
+        absolute_url = QUrl(absolute_url_str)
+
+        fileobj = io.BytesIO()
+        item = download_manager.get(absolute_url, fileobj=fileobj,
+                                    auto_remove=True)
+        pending_downloads.add(item)
+        item.finished.connect(
+            functools.partial(finished, name, item))
+        item.error.connect(
+            functools.partial(error, item))
+        item.cancelled.connect(
+            functools.partial(error, item))
+
+    all_queued = True
+    if not pending_downloads:
+        # Either the page references no assets at all or all of them are
+        # done already; no callback is left to write the file.
+        finish_file()