From fbe5386e5698805c6583724b39fe0b62036aaa00 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 19 Sep 2015 20:06:15 +0200 Subject: [PATCH 01/66] Initial version of website downloader Saving websites as MHTML via :download-whole Still needs some cleanup and an "ask for save path" prompt. --- qutebrowser/browser/commands.py | 12 +- qutebrowser/browser/downloads.py | 14 ++ qutebrowser/misc/mhtml.py | 211 +++++++++++++++++++++++++++++++ 3 files changed, 236 insertions(+), 1 deletion(-) create mode 100644 qutebrowser/misc/mhtml.py diff --git a/qutebrowser/browser/commands.py b/qutebrowser/browser/commands.py index ae4b1da39..10f1d0233 100644 --- a/qutebrowser/browser/commands.py +++ b/qutebrowser/browser/commands.py @@ -42,7 +42,7 @@ from qutebrowser.keyinput import modeman from qutebrowser.utils import (message, usertypes, log, qtutils, urlutils, objreg, utils) from qutebrowser.utils.usertypes import KeyMode -from qutebrowser.misc import editor, guiprocess +from qutebrowser.misc import editor, guiprocess, mhtml class CommandDispatcher: @@ -1157,6 +1157,16 @@ class CommandDispatcher: page = self._current_widget().page() download_manager.get(self._current_url(), page=page) + @cmdutils.register(instance='command-dispatcher', scope='window') + def download_whole(self, dest): + """Download the current page as a MHTML file, including all assets + (e.g. images) + + Args: + dest: The file path to write the download to. + """ + mhtml.start_download(dest) + @cmdutils.register(instance='command-dispatcher', scope='window', deprecated="Use :download instead.") def download_page(self): diff --git a/qutebrowser/browser/downloads.py b/qutebrowser/browser/downloads.py index 70760acb5..568e7bf67 100644 --- a/qutebrowser/browser/downloads.py +++ b/qutebrowser/browser/downloads.py @@ -201,6 +201,7 @@ class DownloadItem(QObject): fileobj: The file object to download the file to. reply: The QNetworkReply associated with this download. retry_info: A RetryInfo instance. 
+ raw_headers: The headers sent by the server. _filename: The filename of the download. _redirects: How many time we were redirected already. _buffer: A BytesIO object to buffer incoming data until we know the @@ -255,6 +256,7 @@ class DownloadItem(QObject): self._filename = None self.init_reply(reply) self._win_id = win_id + self.raw_headers = {} def __repr__(self): return utils.get_repr(self, basename=self.basename) @@ -354,6 +356,7 @@ class DownloadItem(QObject): reply.finished.connect(self.on_reply_finished) reply.error.connect(self.on_reply_error) reply.readyRead.connect(self.on_ready_read) + reply.metaDataChanged.connect(self.on_meta_data_change) self.retry_info = RetryInfo(request=reply.request(), manager=reply.manager()) if not self.fileobj: @@ -582,6 +585,9 @@ class DownloadItem(QObject): if code == QNetworkReply.OperationCanceledError: return else: + if self.reply is None: + log.downloads.debug("QNetworkReply disappeared %s", self) + return self._die(self.reply.errorString()) @pyqtSlot() @@ -593,6 +599,14 @@ class DownloadItem(QObject): if data is not None: self._buffer.write(data) + @pyqtSlot() + def on_meta_data_change(self): + if self.reply is None: + return + self.raw_headers = {} + for key, value in self.reply.rawHeaderPairs(): + self.raw_headers[bytes(key)] = bytes(value) + def _handle_redirect(self): """Handle a HTTP redirect. diff --git a/qutebrowser/misc/mhtml.py b/qutebrowser/misc/mhtml.py new file mode 100644 index 000000000..3afc32c02 --- /dev/null +++ b/qutebrowser/misc/mhtml.py @@ -0,0 +1,211 @@ +# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et: + +# Copyright 2015 Daniel Schadt +# +# This file is part of qutebrowser. +# +# qutebrowser is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# qutebrowser is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with qutebrowser. If not, see <http://www.gnu.org/licenses/>. + +"""Utils for writing a MHTML file.""" + +import functools +import io + +from collections import namedtuple +from base64 import b64encode +from urllib.parse import urljoin + +from PyQt5.QtCore import QUrl +from PyQt5.QtNetwork import QNetworkRequest, QNetworkReply + +from qutebrowser.utils import log, objreg + + +_File = namedtuple("_File", + "content content_type content_location transfer_encoding") + + +E_NONE = (None, lambda x: x) +"""No transfer encoding, copy the bytes from input to output""" + +E_BASE64 = ("BASE64", b64encode) +"""Encode the file using base64 encoding""" + + +class MHTMLWriter(object): + """A class for aggregating multiple files and outputting them to a MHTML + file.""" + + BOUNDARY = b"qute-mhtml" + + def __init__(self, root_content=None, content_location=None, + content_type=None): + self.root_content = root_content + self.content_location = content_location + self.content_type = None + + self._files = {} + + def add_file(self, location, content, content_type=None, + transfer_encoding=E_BASE64): + """Add a file to the given MHTML collection. + + Args: + location: The original location (URL) of the file. + content: The binary content of the file. + content_type: The MIME-type of the content (if available) + transfer_encoding: The transfer encoding to use for this file. + """ + self._files[location] = _File( + content=content, content_type=content_type, + content_location=location, transfer_encoding=transfer_encoding, + ) + + def remove_file(self, location): + """Remove a file. + + Args: + location: The URL that identifies the file. 
+ """ + del self._files[location] + + def write_to(self, fp): + """Output the MHTML file to the given file-like object + + Args: + fp: The file-object, opened in "wb" mode. + """ + self._output_header(fp) + self._output_root_file(fp) + for file_data in self._files.values(): + self._output_file(fp, file_data) + + def _output_header(self, fp): + if self.content_location is None: + raise ValueError("content_location must be set") + if self.content_type is None: + raise ValueError("content_type must be set for the root document") + + fp.write(b"Content-Location: ") + fp.write(self.content_location.encode("utf-8")) + fp.write(b'\nContent-Type: multipart/related;boundary="') + fp.write(self.BOUNDARY) + fp.write(b'";type="') + fp.write(self.content_type.encode("utf-8")) + fp.write(b'"\n\n') + + def _output_root_file(self, fp): + root_file = _File( + content=self.root_content, content_type=self.content_type, + content_location=self.content_location, transfer_encoding=E_BASE64 + ) + self._output_file(fp, root_file) + + def _output_file(self, fp, file_struct): + fp.write(b"--") + fp.write(self.BOUNDARY) + fp.write(b"\nContent-Location: ") + fp.write(file_struct.content_location.encode("utf-8")) + if file_struct.content_type is not None: + fp.write(b"\nContent-Type: ") + fp.write(file_struct.content_type.encode("utf-8")) + encoding_name, encoding_func = file_struct.transfer_encoding + if encoding_name: + fp.write(b"\nContent-Transfer-Encoding: ") + fp.write(encoding_name.encode("utf-8")) + fp.write(b"\n\n") + fp.write(encoding_func(file_struct.content)) + fp.write(b"\n\n") + + +def start_download(dest): + """Start downloading the current page and all assets to a MHTML file. + + Args: + dest: The filename where the resulting file should be saved. 
+ """ + download_manager = objreg.get("download-manager", scope="window", + window="current") + web_view = objreg.get("webview", scope="tab", tab="current") + web_url_str = web_view.url().toString() + web_frame = web_view.page().mainFrame() + + writer = MHTMLWriter() + writer.root_content = web_frame.toHtml().encode("utf-8") + writer.content_location = web_url_str + # I've found no way of getting the content type of a QWebView, but since + # we're using .toHtml, it's probably safe to say that the content-type is + # HTML + writer.content_type = "text/html" + # Currently only downloading (stylesheets),