Handle invalid UTF8 data in hostblock lists

Fixes #2301
This commit is contained in:
Florian Bruhin 2017-03-07 20:25:13 +01:00
parent 3cc4f69125
commit c45019f0a0
3 changed files with 83 additions and 4 deletions

View File

@ -42,6 +42,7 @@ Fixed
- Download suffixes like (1) are now correctly stripped with QtWebEngine
- Crash when trying to print a tab which was closed in the meantime
- Crash when trying to open a file twice on Windows
- Crash when updating adblock lists with invalid UTF8-chars in them
v0.10.0
-------

View File

@ -58,7 +58,7 @@ def get_fileobj(byte_io):
byte_io = zf.open(filename, mode='r')
else:
byte_io.seek(0) # rewind what zipfile.is_zipfile did
return io.TextIOWrapper(byte_io, encoding='utf-8')
return byte_io
def is_whitelisted_host(host):
@ -147,7 +147,7 @@ class HostBlocker:
with open(filename, 'r', encoding='utf-8') as f:
for line in f:
target.add(line.strip())
except OSError:
except (OSError, UnicodeDecodeError):
log.misc.exception("Failed to read host blocklist!")
return True
@ -219,13 +219,27 @@ class HostBlocker:
line_count = 0
try:
f = get_fileobj(byte_io)
except (OSError, UnicodeDecodeError, zipfile.BadZipFile,
zipfile.LargeZipFile, LookupError) as e:
except (OSError, zipfile.BadZipFile, zipfile.LargeZipFile,
LookupError) as e:
message.error("adblock: Error while reading {}: {} - {}".format(
byte_io.name, e.__class__.__name__, e))
return
for line in f:
if line.startswith(b'#'):
# Ignoring comments early so we don't have to care about
# encoding errors in them.
continue
line_count += 1
try:
line = line.decode('utf-8')
except UnicodeDecodeError:
log.misc.error("Failed to decode: {!r}".format(line))
error_count += 1
continue
# Remove comments
try:
hash_idx = line.index('#')
@ -245,9 +259,11 @@ class HostBlocker:
host = parts[1]
else:
error_count += 1
log.misc.error("Failed to parse: {!r}".format(line))
continue
if host not in self.WHITELISTED:
self._blocked_hosts.add(host)
log.misc.debug("{}: read {} lines".format(byte_io.name, line_count))
if error_count > 0:
message.error("adblock: {} read errors for {}".format(

View File

@ -312,6 +312,68 @@ def test_failed_dl_update(config_stub, basedir, download_stub,
assert_urls(host_blocker, whitelisted=[])
@pytest.mark.parametrize('location', ['content', 'comment'])
def test_invalid_utf8(config_stub, download_stub, tmpdir, caplog, location):
"""Make sure invalid UTF-8 is handled correctly.
See https://github.com/qutebrowser/qutebrowser/issues/2301
"""
blocklist = tmpdir / 'blocklist'
if location == 'comment':
blocklist.write_binary(b'# nbsp: \xa0\n')
else:
assert location == 'content'
blocklist.write_binary(b'https://www.example.org/\xa0')
for url in BLOCKLIST_HOSTS:
blocklist.write(url + '\n', mode='a')
config_stub.data = {
'content': {
'host-block-lists': [QUrl(str(blocklist))],
'host-blocking-enabled': True,
'host-blocking-whitelist': None,
}
}
host_blocker = adblock.HostBlocker()
host_blocker.adblock_update()
finished_signal = host_blocker._in_progress[0].finished
if location == 'content':
with caplog.at_level(logging.ERROR):
finished_signal.emit()
expected = (r"Failed to decode: "
r"b'https://www.example.org/\xa0localhost\n'")
assert caplog.records[-2].message == expected
else:
finished_signal.emit()
host_blocker.read_hosts()
assert_urls(host_blocker, whitelisted=[])
def test_invalid_utf8_compiled(config_stub, tmpdir, monkeypatch, caplog):
"""Make sure invalid UTF-8 in the compiled file is handled."""
data_dir = tmpdir / 'data'
config_dir = tmpdir / 'config'
monkeypatch.setattr(adblock.standarddir, 'data', lambda: data_dir)
monkeypatch.setattr(adblock.standarddir, 'config', lambda: config_dir)
config_stub.data = {
'content': {
'host-block-lists': [],
}
}
(config_dir / 'blocked-hosts').write_binary(
b'https://www.example.org/\xa0')
(data_dir / 'blocked-hosts').ensure()
host_blocker = adblock.HostBlocker()
with caplog.at_level(logging.ERROR):
host_blocker.read_hosts()
assert caplog.records[-1].message == "Failed to read host blocklist!"
def test_blocking_with_whitelist(config_stub, basedir, download_stub,
data_tmpdir, tmpdir):
"""Ensure hosts in host-blocking-whitelist are never blocked."""