qutebrowser/qutebrowser/utils/urlmatch.py

# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et:

# Copyright 2018 Florian Bruhin (The Compiler) <mail@qutebrowser.org>
#
# This file is part of qutebrowser.
#
# qutebrowser is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# qutebrowser is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with qutebrowser.  If not, see <http://www.gnu.org/licenses/>.

"""A Chromium-like URL matching pattern.

See:
https://developer.chrome.com/apps/match_patterns
https://cs.chromium.org/chromium/src/extensions/common/url_pattern.cc
https://cs.chromium.org/chromium/src/extensions/common/url_pattern.h
"""

import ipaddress
import fnmatch
import urllib.parse

from qutebrowser.utils import utils, qtutils


class ParseError(Exception):

    """Raised when a pattern could not be parsed."""


class UrlPattern:

    """A Chromium-like URL matching pattern.

    Class attributes:
        _DEFAULT_PORTS: The default ports used for schemes which support ports.
        _SCHEMES_WITHOUT_HOST: Schemes which don't need a host.

    Attributes:
        _pattern: The given pattern as string.
        _match_all: Whether the pattern should match all URLs.
        _match_subdomains: Whether the pattern should match subdomains of the
                           given host.
        _scheme: The scheme to match to, or None to match any scheme.
                 Note that with Chromium, '*'/None only matches http/https and
                 not file/ftp. We deviate from that as per-URL settings aren't
                 security relevant.
        _host: The host to match to, or None for any host.
        _path: The path to match to, or None for any path.
        _port: The port to match to as integer, or None for any port.
    """

    _DEFAULT_PORTS = {'https': 443, 'http': 80, 'ftp': 21}
    _SCHEMES_WITHOUT_HOST = ['about', 'file', 'data', 'javascript']

    def __init__(self, pattern):
        # Make sure all attributes are initialized if we exit early.
        self._pattern = pattern
        self._match_all = False
        self._match_subdomains = False
        self._scheme = None
        self._host = None
        self._path = None
        self._port = None

        # > The special pattern <all_urls> matches any URL that starts with a
        # > permitted scheme.
        if pattern == '<all_urls>':
            self._match_all = True
            return

        if '\0' in pattern:
            raise ParseError("May not contain NUL byte")

        pattern = self._fixup_pattern(pattern)

        # We use urllib.parse instead of QUrl here because it can handle
        # hosts with * in them.
        try:
            parsed = urllib.parse.urlparse(pattern)
        except ValueError as e:
            raise ParseError(str(e))

        assert parsed is not None

        self._init_scheme(parsed)
        self._init_host(parsed)
        self._init_path(parsed)
        self._init_port(parsed)

    def _to_tuple(self):
        """Get a pattern with information used for __eq__/__hash__."""
        return (self._match_all, self._match_subdomains, self._scheme,
                self._host, self._path, self._port)

    def __hash__(self):
        return hash(self._to_tuple())

    def __eq__(self, other):
        if not isinstance(other, UrlPattern):
            return NotImplemented
        # pylint: disable=protected-access
        return self._to_tuple() == other._to_tuple()

    def __repr__(self):
        return utils.get_repr(self, pattern=self._pattern, constructor=True)

    def __str__(self):
        return self._pattern

    def _fixup_pattern(self, pattern):
        """Make sure the given pattern is parseable by urllib.parse."""
        if pattern.startswith('*:'):  # Any scheme, but *:// is unparseable
            pattern = 'any:' + pattern[2:]

        schemes = tuple(s + ':' for s in self._SCHEMES_WITHOUT_HOST)
        if '://' not in pattern and not pattern.startswith(schemes):
            pattern = 'any://' + pattern

        # Chromium handles file://foo like file:///foo
        # FIXME This doesn't actually strip the hostname correctly.
        if (pattern.startswith('file://') and
                not pattern.startswith('file:///')):
            pattern = 'file:///' + pattern[len("file://"):]

        return pattern

    def _init_scheme(self, parsed):
        """Parse the scheme from the given URL.

        Deviation from Chromium:
        - We assume * when no scheme has been given.
        """
        assert parsed.scheme, parsed
        if parsed.scheme == 'any':
            self._scheme = None
            return

        self._scheme = parsed.scheme

    def _init_path(self, parsed):
        """Parse the path from the given URL.

        Deviation from Chromium:
        - We assume * when no path has been given.
        """
        if self._scheme == 'about' and not parsed.path.strip():
            raise ParseError("Pattern without path")

        if parsed.path == '/*':
            self._path = None
        elif parsed.path == '':
            # When the user doesn't add a trailing slash, we assume the pattern
            # matches any path.
            self._path = None
        else:
            self._path = parsed.path

    def _init_host(self, parsed):
        """Parse the host from the given URL.

        Deviation from Chromium:
        - http://:1234/ is not a valid URL because it has no host.
        """
        if parsed.hostname is None or not parsed.hostname.strip():
            if self._scheme not in self._SCHEMES_WITHOUT_HOST:
                raise ParseError("Pattern without host")
            assert self._host is None
            return

        # FIXME what about multiple dots?
        host_parts = parsed.hostname.rstrip('.').split('.')
        if host_parts[0] == '*':
            host_parts = host_parts[1:]
            self._match_subdomains = True

        if not host_parts:
            self._host = None
            return

        self._host = '.'.join(host_parts)

        if self._host.endswith('.*'):
            # Special case to have a nicer error
            raise ParseError("TLD wildcards are not implemented yet")
        elif '*' in self._host:
            # Only * or *.foo is allowed as host.
            raise ParseError("Invalid host wildcard")

    def _init_port(self, parsed):
        """Parse the port from the given URL.

        Deviation from Chromium:
        - We use None instead of "*" if there's no port filter.
        """
        if parsed.netloc.endswith(':*'):
            # We can't access parsed.port as it tries to run int()
            self._port = None
        elif parsed.netloc.endswith(':'):
            raise ParseError("Invalid port: Port is empty")
        else:
            try:
                self._port = parsed.port
            except ValueError as e:
                raise ParseError("Invalid port: {}".format(e))

        if (self._scheme not in list(self._DEFAULT_PORTS) + [None] and
                self._port is not None):
            raise ParseError("Ports are unsupported with {} scheme".format(
                self._scheme))

    def _matches_scheme(self, scheme):
        return self._scheme is None or self._scheme == scheme

    def _matches_host(self, host):
        # FIXME what about multiple dots?
        host = host.rstrip('.')

        # If we have no host in the match pattern, that means that we're
        # matching all hosts, which means we have a match no matter what the
        # test host is.
        # Contrary to Chromium, we don't need to check for
        # self._match_subdomains, as we want to return True here for e.g.
        # file:// as well.
        if self._host is None:
            return True

        # If the hosts are exactly equal, we have a match.
        if host == self._host:
            return True

        # Otherwise, we can only match if our match pattern matches subdomains.
        if not self._match_subdomains:
            return False

        # We don't do subdomain matching against IP addresses, so we can give
        # up now if the test host is an IP address.
        if not utils.raises(ValueError, ipaddress.ip_address, host):
            return False

        # Check if the test host is a subdomain of our host.
        if len(host) <= (len(self._host) + 1):
            return False

        if not host.endswith(self._host):
            return False

        return host[len(host) - len(self._host) - 1] == '.'

    def _matches_port(self, scheme, port):
        if port == -1 and scheme in self._DEFAULT_PORTS:
            port = self._DEFAULT_PORTS[scheme]
        return self._port is None or self._port == port

    def _matches_path(self, path):
        if self._path is None:
            return True

        # Match 'google.com' with 'google.com/'
        if path + '/*' == self._path:
            return True

        # FIXME Chromium seems to have a more optimized glob matching which
        # doesn't rely on regexes. Do we need that too?
        return fnmatch.fnmatchcase(path, self._path)

    def matches(self, qurl):
        """Check if the pattern matches the given QUrl."""
        qtutils.ensure_valid(qurl)

        if self._match_all:
            return True

        if not self._matches_scheme(qurl.scheme()):
            return False
        # FIXME ignore for file:// like Chromium?
        if not self._matches_host(qurl.host()):
            return False
        if not self._matches_port(qurl.scheme(), qurl.port()):
            return False
        if not self._matches_path(qurl.path()):
            return False

        return True