294 lines
9.8 KiB
Python
294 lines
9.8 KiB
Python
# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et:
|
|
|
|
# Copyright 2018 Florian Bruhin (The Compiler) <mail@qutebrowser.org>
|
|
#
|
|
# This file is part of qutebrowser.
|
|
#
|
|
# qutebrowser is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# qutebrowser is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with qutebrowser. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
"""A Chromium-like URL matching pattern.
|
|
|
|
See:
|
|
https://developer.chrome.com/apps/match_patterns
|
|
https://cs.chromium.org/chromium/src/extensions/common/url_pattern.cc
|
|
https://cs.chromium.org/chromium/src/extensions/common/url_pattern.h
|
|
"""
|
|
|
|
import ipaddress
|
|
import fnmatch
|
|
import urllib.parse
|
|
|
|
from qutebrowser.utils import utils, qtutils
|
|
|
|
|
|
class ParseError(Exception):

    """Raised when a URL pattern string could not be parsed."""
|
|
|
|
|
|
class UrlPattern:

    """A Chromium-like URL matching pattern.

    Class attributes:
        _DEFAULT_PORTS: The default ports used for schemes which support ports.
        _SCHEMES_WITHOUT_HOST: Schemes which don't need a host.

    Attributes:
        _pattern: The given pattern as string.
        _match_all: Whether the pattern should match all URLs.
        _match_subdomains: Whether the pattern should match subdomains of the
                           given host.
        _scheme: The scheme to match to, or None to match any scheme.
                 Note that with Chromium, '*'/None only matches http/https and
                 not file/ftp. We deviate from that as per-URL settings aren't
                 security relevant.
        _host: The host to match to, or None for any host.
        _path: The path to match to, or None for any path.
        _port: The port to match to as integer, or None for any port.
    """

    _DEFAULT_PORTS = {'https': 443, 'http': 80, 'ftp': 21}
    _SCHEMES_WITHOUT_HOST = ['about', 'file', 'data', 'javascript']

    def __init__(self, pattern):
        """Parse the given pattern string.

        Args:
            pattern: The pattern as string, e.g. '*://*.example.com/*', or
                     the special value '<all_urls>'.

        Raises:
            ParseError: If the pattern contains a NUL byte or one of its
                        components can't be parsed.
        """
        # Make sure all attributes are initialized if we exit early.
        self._pattern = pattern
        self._match_all = False
        self._match_subdomains = False
        self._scheme = None
        self._host = None
        self._path = None
        self._port = None

        # > The special pattern <all_urls> matches any URL that starts with a
        # > permitted scheme.
        if pattern == '<all_urls>':
            self._match_all = True
            return

        if '\0' in pattern:
            raise ParseError("May not contain NUL byte")

        pattern = self._fixup_pattern(pattern)

        # We use urllib.parse instead of QUrl here because it can handle
        # hosts with * in them.
        try:
            parsed = urllib.parse.urlparse(pattern)
        except ValueError as e:
            # Chain the original error so the cause stays visible in
            # tracebacks.
            raise ParseError(str(e)) from e

        assert parsed is not None

        # The scheme needs to be initialized first, as the host/path/port
        # parsing below reads self._scheme.
        self._init_scheme(parsed)
        self._init_host(parsed)
        self._init_path(parsed)
        self._init_port(parsed)

    def _to_tuple(self):
        """Get a pattern with information used for __eq__/__hash__."""
        return (self._match_all, self._match_subdomains, self._scheme,
                self._host, self._path, self._port)

    def __hash__(self):
        """Hash the parsed components, consistent with __eq__."""
        return hash(self._to_tuple())

    def __eq__(self, other):
        """Compare the parsed components (not the raw pattern string)."""
        if not isinstance(other, UrlPattern):
            return NotImplemented
        # pylint: disable=protected-access
        return self._to_tuple() == other._to_tuple()

    def __repr__(self):
        return utils.get_repr(self, pattern=self._pattern, constructor=True)

    def __str__(self):
        """Get the pattern string exactly as it was passed to __init__."""
        return self._pattern

    def _fixup_pattern(self, pattern):
        """Make sure the given pattern is parseable by urllib.parse.

        Args:
            pattern: The pattern string as given by the user.

        Return:
            The adjusted pattern string; 'any' is used as a placeholder
            scheme for "any scheme".
        """
        if pattern.startswith('*:'):  # Any scheme, but *:// is unparseable
            pattern = 'any:' + pattern[2:]

        schemes = tuple(s + ':' for s in self._SCHEMES_WITHOUT_HOST)
        if '://' not in pattern and not pattern.startswith(schemes):
            pattern = 'any://' + pattern

        # Chromium handles file://foo like file:///foo
        # FIXME This doesn't actually strip the hostname correctly.
        if (pattern.startswith('file://') and
                not pattern.startswith('file:///')):
            pattern = 'file:///' + pattern[len("file://"):]

        return pattern

    def _init_scheme(self, parsed):
        """Parse the scheme from the given URL.

        Deviation from Chromium:
        - We assume * when no scheme has been given.
        """
        assert parsed.scheme, parsed
        if parsed.scheme == 'any':
            # Placeholder scheme inserted by _fixup_pattern.
            self._scheme = None
            return

        self._scheme = parsed.scheme

    def _init_path(self, parsed):
        """Parse the path from the given URL.

        Deviation from Chromium:
        - We assume * when no path has been given.
        """
        if self._scheme == 'about' and not parsed.path.strip():
            raise ParseError("Pattern without path")

        if parsed.path == '/*':
            self._path = None
        elif parsed.path == '':
            # When the user doesn't add a trailing slash, we assume the pattern
            # matches any path.
            self._path = None
        else:
            self._path = parsed.path

    def _init_host(self, parsed):
        """Parse the host from the given URL.

        Deviation from Chromium:
        - http://:1234/ is not a valid URL because it has no host.
        """
        if parsed.hostname is None or not parsed.hostname.strip():
            if self._scheme not in self._SCHEMES_WITHOUT_HOST:
                raise ParseError("Pattern without host")
            assert self._host is None
            return

        # FIXME what about multiple dots?
        host_parts = parsed.hostname.rstrip('.').split('.')
        if host_parts[0] == '*':
            host_parts = host_parts[1:]
            self._match_subdomains = True

        if not host_parts:
            # The host was just '*', i.e. any host matches.
            self._host = None
            return

        self._host = '.'.join(host_parts)

        if self._host.endswith('.*'):
            # Special case to have a nicer error
            raise ParseError("TLD wildcards are not implemented yet")
        elif '*' in self._host:
            # Only * or *.foo is allowed as host.
            raise ParseError("Invalid host wildcard")

    def _init_port(self, parsed):
        """Parse the port from the given URL.

        Deviation from Chromium:
        - We use None instead of "*" if there's no port filter.
        """
        if parsed.netloc.endswith(':*'):
            # We can't access parsed.port as it tries to run int()
            self._port = None
        elif parsed.netloc.endswith(':'):
            raise ParseError("Invalid port: Port is empty")
        else:
            try:
                self._port = parsed.port
            except ValueError as e:
                raise ParseError("Invalid port: {}".format(e)) from e

        # A port filter only makes sense for "any scheme" or for schemes we
        # know a default port for.
        scheme_has_port = (self._scheme is None or
                           self._scheme in self._DEFAULT_PORTS)
        if self._port is not None and not scheme_has_port:
            raise ParseError("Ports are unsupported with {} scheme".format(
                self._scheme))

    def _matches_scheme(self, scheme):
        """Check whether the given scheme string matches this pattern."""
        return self._scheme is None or self._scheme == scheme

    def _matches_host(self, host):
        """Check whether the given host string matches this pattern."""
        # FIXME what about multiple dots?
        host = host.rstrip('.')

        # If we have no host in the match pattern, that means that we're
        # matching all hosts, which means we have a match no matter what the
        # test host is.
        # Contrary to Chromium, we don't need to check for
        # self._match_subdomains, as we want to return True here for e.g.
        # file:// as well.
        if self._host is None:
            return True

        # If the hosts are exactly equal, we have a match.
        if host == self._host:
            return True

        # Otherwise, we can only match if our match pattern matches subdomains.
        if not self._match_subdomains:
            return False

        # We don't do subdomain matching against IP addresses, so we can give
        # up now if the test host is an IP address.
        if not utils.raises(ValueError, ipaddress.ip_address, host):
            return False

        # The test host is a subdomain of our host if it ends in
        # '.<our host>' and has at least one character in front of that dot.
        suffix = '.' + self._host
        return len(host) > len(suffix) and host.endswith(suffix)

    def _matches_port(self, scheme, port):
        """Check whether the given port matches this pattern.

        Args:
            scheme: The scheme of the URL being tested, used to look up a
                    default port.
            port: The port of the URL being tested, or -1 if it has none.
        """
        if port == -1 and scheme in self._DEFAULT_PORTS:
            port = self._DEFAULT_PORTS[scheme]
        return self._port is None or self._port == port

    def _matches_path(self, path):
        """Check whether the given path matches this pattern's path glob."""
        if self._path is None:
            return True

        # Match 'google.com' with 'google.com/'
        if path + '/*' == self._path:
            return True

        # FIXME Chromium seems to have a more optimized glob matching which
        # doesn't rely on regexes. Do we need that too?
        return fnmatch.fnmatchcase(path, self._path)

    def matches(self, qurl):
        """Check if the pattern matches the given QUrl."""
        qtutils.ensure_valid(qurl)

        if self._match_all:
            return True

        if not self._matches_scheme(qurl.scheme()):
            return False
        # FIXME ignore for file:// like Chromium?
        if not self._matches_host(qurl.host()):
            return False
        if not self._matches_port(qurl.scheme(), qurl.port()):
            return False
        if not self._matches_path(qurl.path()):
            return False

        return True
|