urlmatch: Start UrlPattern

This commit is contained in:
Florian Bruhin 2018-02-14 20:03:01 +01:00
parent 541abb2324
commit b93c0dad5a
2 changed files with 157 additions and 0 deletions

View File

@ -0,0 +1,99 @@
# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et:
# Copyright 2018 Florian Bruhin (The Compiler) <mail@qutebrowser.org>
#
# This file is part of qutebrowser.
#
# qutebrowser is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# qutebrowser is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with qutebrowser. If not, see <http://www.gnu.org/licenses/>.
"""A Chromium-like URL matching pattern.
See:
https://developer.chrome.com/apps/match_patterns
https://cs.chromium.org/chromium/src/extensions/common/url_pattern.cc
https://cs.chromium.org/chromium/src/extensions/common/url_pattern.h
"""
import contextlib
import urllib.parse
from qutebrowser.utils import utils
class UrlPattern:
"""A Chromium-like URL matching pattern."""
SCHEMES = ['https', 'http', 'ftp', 'file', 'chrome', 'qute', 'about']
def __init__(self, pattern):
# Make sure all attributes are initialized if we exit early.
self._pattern = pattern
self._match_all = False
self._match_subdomains = False
self._scheme = None
self._host = None
# > The special pattern <all_urls> matches any URL that starts with a
# > permitted scheme.
if pattern == '<all_urls>':
self._match_all = True
return
# > If the scheme is *, then it matches either http or https, and not
# > file, or ftp.
# Note we deviate from that, as per-URL settings aren't security
# relevant.
if pattern.startswith('*:'): # Any scheme
self._scheme = '*'
pattern = 'any:' + pattern[2:] # Make it parseable again
# We use urllib.parse instead of QUrl here because it can handle
# hosts with * in them.
parsed = urllib.parse.urlparse(pattern)
# "Changed in version 3.6: Out-of-range port numbers now raise
# ValueError, instead of returning None."
if parsed is None:
raise ValueError("Failed to parse {}".format(pattern))
self._init_scheme(parsed)
self._init_host(parsed)
self._init_path(parsed)
def _init_scheme(self, parsed):
if not parsed.scheme:
raise ValueError("No scheme given")
if parsed.scheme not in self.SCHEMES:
raise ValueError("Unknown scheme {}".format(parsed.scheme))
self._scheme = parsed.scheme
def _init_path(self, parsed):
# FIXME store somewhere
if self._scheme == 'about' and not parsed.path.strip():
raise ValueError("Pattern without path")
def _init_host(self, parsed):
if self._scheme != 'about' and not parsed.netloc.strip():
raise ValueError("Pattern without host")
host_parts = parsed.netloc.split('.')
if host_parts[0] == '*':
host_parts = host_parts[1:]
self._match_subdomains = True
self._host = '.'.join(host_parts)
if '*' in self._host:
# Only * or *.foo is allowed as host.
raise ValueError("Invalid host wildcard")
def __repr__(self):
return utils.get_repr(self, pattern=self._pattern, constructor=True)

View File

@ -0,0 +1,58 @@
# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et:
# Copyright 2018 Florian Bruhin (The Compiler) <mail@qutebrowser.org>
#
# This file is part of qutebrowser.
#
# qutebrowser is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# qutebrowser is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with qutebrowser. If not, see <http://www.gnu.org/licenses/>.
"""Tests for qutebrowser.utils.urlmatch.
Some data is inspired by Chromium's tests:
https://cs.chromium.org/chromium/src/extensions/common/url_pattern_unittest.cc
"""
import pytest
from qutebrowser.utils import urlmatch
@pytest.mark.parametrize('pattern, error', [
# Chromium: PARSE_ERROR_MISSING_SCHEME_SEPARATOR
("http", "No scheme given"),
("http:", "Pattern without host"),
("http:/", "Pattern without host"),
("about://", "Pattern without path"),
("http:/bar", "Pattern without host"),
# Chromium: PARSE_ERROR_EMPTY_HOST
("http://", "Pattern without host"),
("http:///", "Pattern without host"),
("http:// /", "Pattern without host"),
# Chromium: PARSE_ERROR_EMPTY_PATH
# FIXME: should we allow this or not?
# ("http://bar", "URLPattern::"),
# Chromium: PARSE_ERROR_INVALID_HOST_WILDCARD
("http://*foo/bar", "Invalid host wildcard"),
("http://foo.*.bar/baz", "Invalid host wildcard"),
("http://fo.*.ba:123/baz", "Invalid host wildcard"),
("http://foo.*/bar", "Invalid host wildcard"),
# Some more tests
])
def test_invalid_patterns(pattern, error):
with pytest.raises(ValueError, match=error):
urlmatch.UrlPattern(pattern)