Refactor start_download to a class

2015-09-20 17:57:32 +02:00 · 2015-09-20 17:57:32 +02:00 · 111feebf89
commit 111feebf89
parent 49a32f0041
1 changed files with 98 additions and 73 deletions
--- a/qutebrowser/misc/mhtml.py
+++ b/qutebrowser/misc/mhtml.py
@ -182,83 +182,108 @@ class MHTMLWriter(object):
        fp.write(b"\r\n\r\n")
 class _Downloader(object):
    """A class to download whole websites.

    Attributes:
        web_view: The web view whose current page gets saved.
        dest: The filename the resulting MHTML file is written to.
        writer: The MHTMLWriter collecting the root document and all assets.
        loaded_urls: The asset URLs (as QUrl) which were already requested.
        pending_downloads: The download items which did not finish yet.
    """

    def __init__(self, web_view, dest):
        self.web_view = web_view
        self.dest = dest
        self.writer = MHTMLWriter()
        self.loaded_urls = set()
        self.pending_downloads = set()

    def run(self):
        """Download and save the page.

        The object must not be reused, you should create a new one if
        you want to download another page.
        """
        download_manager = objreg.get("download-manager", scope="window",
                                      window="current")
        web_url_str = self.web_view.url().toString()
        web_frame = self.web_view.page().mainFrame()

        self.writer.root_content = web_frame.toHtml().encode("utf-8")
        self.writer.content_location = web_url_str
        # I've found no way of getting the content type of a QWebView, but
        # since we're using .toHtml, it's probably safe to say that the
        # content-type is HTML
        self.writer.content_type = 'text/html; charset="UTF-8"'

        # Currently only downloading <link> (stylesheets), <script>
        # (javascript) and <img> (image) elements.
        elements = (web_frame.findAllElements("link") +
                    web_frame.findAllElements("script") +
                    web_frame.findAllElements("img"))

        for element in elements:
            # <script>/<img> use "src", <link> uses "href"
            element_url = (element.attribute("src") or
                           element.attribute("href"))
            if not element_url:
                # Might be a local <script> tag or something else
                continue
            absolute_url = QUrl(urljoin(web_url_str, element_url))
            # Prevent loading an asset twice
            if absolute_url in self.loaded_urls:
                continue
            self.loaded_urls.add(absolute_url)

            log.misc.debug("asset at %s", absolute_url)

            item = download_manager.get(absolute_url, fileobj=io.BytesIO(),
                                        auto_remove=True)
            self.pending_downloads.add(item)
            item.finished.connect(
                functools.partial(self.finished, absolute_url, item))
            # Errors and cancellations are handled the same way: the asset
            # is stored with empty content.
            item.error.connect(
                functools.partial(self.error, absolute_url, item))
            item.cancelled.connect(
                functools.partial(self.error, absolute_url, item))

        if not self.loaded_urls:
            # No asset download was started, so none of the callbacks will
            # ever fire. Write the file (root document only) right away,
            # otherwise a page without assets would never be saved.
            self.finish_file()

    def finished(self, url, item):
        """Callback when a single asset is downloaded.

        Args:
            url: The original url of the asset as QUrl.
            item: The DownloadItem given by the DownloadManager
        """
        if not self._take_pending(item):
            return
        mime = item.raw_headers.get(b"Content-Type", b"")
        mime = mime.decode("ascii", "ignore")
        # Textual content stays readable as quoted-printable, everything
        # else is stored as base64.
        encode = E_QUOPRI if mime.startswith("text/") else E_BASE64
        self.writer.add_file(url.toString(), item.fileobj.getvalue(), mime,
                             encode)
        self._finish_if_done()

    def error(self, url, item, *_args):
        """Callback when a download error occurs.

        Also used when a download gets cancelled. The asset is added to the
        file with empty content.

        Args:
            url: The original url of the asset as QUrl.
            item: The DownloadItem given by the DownloadManager.
        """
        if not self._take_pending(item):
            return
        self.writer.add_file(url.toString(), b"")
        self._finish_if_done()

    def finish_file(self):
        """Save the file to the filename given in __init__."""
        log.misc.debug("All assets downloaded, ready to finish off!")
        with open(self.dest, "wb") as file_output:
            self.writer.write_to(file_output)

    def _take_pending(self, item):
        """Remove item from the pending downloads.

        Return:
            True if the item was pending, False if it was already handled
            (e.g. a second error/cancelled notification for the same item).
        """
        if item not in self.pending_downloads:
            return False
        self.pending_downloads.remove(item)
        return True

    def _finish_if_done(self):
        """Write the file once no download is pending anymore."""
        if not self.pending_downloads:
            self.finish_file()
 def start_download(dest):
    """Start downloading the current page and all assets to an MHTML file.

    The webview of the current tab is looked up via objreg.

    Args:
        dest: The filename where the resulting file should be saved.
    """
    # NOTE(review): this span is a diff hunk; lines prefixed with "-" are
    # removed and lines prefixed with "+" are added by the commit. The
    # unprefixed lines below the "+" lines appear to be the old inline
    # implementation which moved into _Downloader -- confirm against the
    # resulting file.
    download_manager = objreg.get("download-manager", scope="window",
                                  window="current")
    web_view = objreg.get("webview", scope="tab", tab="current")
-    web_url_str = web_view.url().toString()
+    loader = _Downloader(web_view, dest)
-    web_frame = web_view.page().mainFrame()
+    loader.run()
    writer = MHTMLWriter()
    writer.root_content = web_frame.toHtml().encode("utf-8")
    writer.content_location = web_url_str
    # I've found no way of getting the content type of a QWebView, but since
    # we're using .toHtml, it's probably safe to say that the content-type is
    # HTML
    writer.content_type = 'text/html; charset="UTF-8"'
    # Currently only downloading <link> (stylesheets), <script> (javascript)
    # and <img> (image) elements.
    elements = (web_frame.findAllElements("link") +
                web_frame.findAllElements("script") +
                web_frame.findAllElements("img"))
    loaded_urls = set()
    pending_downloads = set()
    # Callback for when a single asset is downloaded
    # closes over the local variables
    def finished(name, item):
        pending_downloads.remove(item)
        mime = item.raw_headers.get(b"Content-Type", b"")
        mime = mime.decode("ascii", "ignore")
        # text/* content is stored as quoted-printable, the rest as base64
        encode = E_QUOPRI if mime.startswith("text/") else E_BASE64
        writer.add_file(name, item.fileobj.getvalue(), mime, encode)
        if pending_downloads:
            return
        finish_file()
    # Callback for a failed or cancelled download: the asset is added with
    # empty content.
    def error(name, item, *_args):
        pending_downloads.remove(item)
        writer.add_file(name, b"")
        if pending_downloads:
            return
        finish_file()
    def finish_file():
        # If we get here, all assets are downloaded and we're ready to finish
        # the file
        log.misc.debug("All assets downloaded, ready to finish off!")
        with open(dest, "wb") as file_output:
            writer.write_to(file_output)
    for element in elements:
        element_url = element.attribute("src")
        if not element_url:
            element_url = element.attribute("href")
        if not element_url:
            # Might be a local <script> tag or something else
            continue
        absolute_url_str = urljoin(web_url_str, element_url)
        # Prevent loading an asset twice
        if absolute_url_str in loaded_urls:
            continue
        loaded_urls.add(absolute_url_str)
        log.misc.debug("asset at %s", absolute_url_str)
        absolute_url = QUrl(absolute_url_str)
        fileobj = io.BytesIO()
        item = download_manager.get(absolute_url, fileobj=fileobj,
                                    auto_remove=True)
        pending_downloads.add(item)
        item.finished.connect(
            functools.partial(finished, absolute_url_str, item))
        item.error.connect(
            functools.partial(error, absolute_url_str, item))
        item.cancelled.connect(
            functools.partial(error, absolute_url_str, item))