From c45019f0a0542ea684e5b57a3c8e34db551bc7bb Mon Sep 17 00:00:00 2001 From: Florian Bruhin Date: Tue, 7 Mar 2017 20:25:13 +0100 Subject: [PATCH] Handle invalid UTF8 data in hostblock lists Fixes #2301 --- CHANGELOG.asciidoc | 1 + qutebrowser/browser/adblock.py | 24 ++++++++++-- tests/unit/browser/test_adblock.py | 62 ++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.asciidoc b/CHANGELOG.asciidoc index d7f5b62bc..5edaa4f61 100644 --- a/CHANGELOG.asciidoc +++ b/CHANGELOG.asciidoc @@ -42,6 +42,7 @@ Fixed - Download suffixes like (1) are now correctly stripped with QtWebEngine - Crash when trying to print a tab which was closed in the meantime - Crash when trying to open a file twice on Windows +- Crash when updating adblock lists with invalid UTF8-chars in them v0.10.0 ------- diff --git a/qutebrowser/browser/adblock.py b/qutebrowser/browser/adblock.py index 27fb50603..7be0dfaf9 100644 --- a/qutebrowser/browser/adblock.py +++ b/qutebrowser/browser/adblock.py @@ -58,7 +58,7 @@ def get_fileobj(byte_io): byte_io = zf.open(filename, mode='r') else: byte_io.seek(0) # rewind what zipfile.is_zipfile did - return io.TextIOWrapper(byte_io, encoding='utf-8') + return byte_io def is_whitelisted_host(host): @@ -147,7 +147,7 @@ class HostBlocker: with open(filename, 'r', encoding='utf-8') as f: for line in f: target.add(line.strip()) - except OSError: + except (OSError, UnicodeDecodeError): log.misc.exception("Failed to read host blocklist!") return True @@ -219,13 +219,27 @@ class HostBlocker: line_count = 0 try: f = get_fileobj(byte_io) - except (OSError, UnicodeDecodeError, zipfile.BadZipFile, - zipfile.LargeZipFile, LookupError) as e: + except (OSError, zipfile.BadZipFile, zipfile.LargeZipFile, + LookupError) as e: message.error("adblock: Error while reading {}: {} - {}".format( byte_io.name, e.__class__.__name__, e)) return + for line in f: + if line.startswith(b'#'): + # Ignoring comments early so we don't have to care about + # encoding errors in them. + continue + line_count += 1 + + try: + line = line.decode('utf-8') + except UnicodeDecodeError: + log.misc.error("Failed to decode: {!r}".format(line)) + error_count += 1 + continue + # Remove comments try: hash_idx = line.index('#') @@ -245,9 +259,11 @@ class HostBlocker: host = parts[1] else: error_count += 1 + log.misc.error("Failed to parse: {!r}".format(line)) continue if host not in self.WHITELISTED: self._blocked_hosts.add(host) + log.misc.debug("{}: read {} lines".format(byte_io.name, line_count)) if error_count > 0: message.error("adblock: {} read errors for {}".format( diff --git a/tests/unit/browser/test_adblock.py b/tests/unit/browser/test_adblock.py index 3caf6e526..0c1d95bca 100644 --- a/tests/unit/browser/test_adblock.py +++ b/tests/unit/browser/test_adblock.py @@ -312,6 +312,68 @@ def test_failed_dl_update(config_stub, basedir, download_stub, assert_urls(host_blocker, whitelisted=[]) +@pytest.mark.parametrize('location', ['content', 'comment']) +def test_invalid_utf8(config_stub, download_stub, tmpdir, caplog, location): + """Make sure invalid UTF-8 is handled correctly. + + See https://github.com/qutebrowser/qutebrowser/issues/2301 + """ + blocklist = tmpdir / 'blocklist' + if location == 'comment': + blocklist.write_binary(b'# nbsp: \xa0\n') + else: + assert location == 'content' + blocklist.write_binary(b'https://www.example.org/\xa0') + for url in BLOCKLIST_HOSTS: + blocklist.write(url + '\n', mode='a') + + config_stub.data = { + 'content': { + 'host-block-lists': [QUrl(str(blocklist))], + 'host-blocking-enabled': True, + 'host-blocking-whitelist': None, + } + } + host_blocker = adblock.HostBlocker() + host_blocker.adblock_update() + finished_signal = host_blocker._in_progress[0].finished + + if location == 'content': + with caplog.at_level(logging.ERROR): + finished_signal.emit() + expected = (r"Failed to decode: " + r"b'https://www.example.org/\xa0localhost\n'") + assert caplog.records[-2].message == expected + else: + finished_signal.emit() + + host_blocker.read_hosts() + assert_urls(host_blocker, whitelisted=[]) + + +def test_invalid_utf8_compiled(config_stub, tmpdir, monkeypatch, caplog): + """Make sure invalid UTF-8 in the compiled file is handled.""" + data_dir = tmpdir / 'data' + config_dir = tmpdir / 'config' + monkeypatch.setattr(adblock.standarddir, 'data', lambda: data_dir) + monkeypatch.setattr(adblock.standarddir, 'config', lambda: config_dir) + + config_stub.data = { + 'content': { + 'host-block-lists': [], + } + } + + (config_dir / 'blocked-hosts').write_binary( + b'https://www.example.org/\xa0') + (data_dir / 'blocked-hosts').ensure() + + host_blocker = adblock.HostBlocker() + with caplog.at_level(logging.ERROR): + host_blocker.read_hosts() + assert caplog.records[-1].message == "Failed to read host blocklist!" + + def test_blocking_with_whitelist(config_stub, basedir, download_stub, data_tmpdir, tmpdir): """Ensure hosts in host-blocking-whitelist are never blocked."""