urlmatch: Implement initial matching

This commit is contained in:
Florian Bruhin 2018-02-15 15:39:03 +01:00
parent 3d6cbcf396
commit 2b274f8e0b
2 changed files with 95 additions and 1 deletions

View File

@ -25,6 +25,7 @@ https://cs.chromium.org/chromium/src/extensions/common/url_pattern.cc
https://cs.chromium.org/chromium/src/extensions/common/url_pattern.h
"""
import fnmatch
import contextlib
import urllib.parse
@ -114,10 +115,12 @@ class UrlPattern:
assert self._host is None
return
host_parts = parsed.hostname.split('.')
# FIXME what about multiple dots?
host_parts = parsed.hostname.rstrip('.').split('.')
if host_parts[0] == '*':
host_parts = host_parts[1:]
self._match_subdomains = True
self._host = '.'.join(host_parts)
if self._host.endswith('.*'):
@ -153,3 +156,71 @@ class UrlPattern:
def __repr__(self):
    """Return a debug representation built from the raw pattern string."""
    pattern = self._pattern
    return utils.get_repr(self, pattern=pattern, constructor=True)
def _matches_scheme(self, scheme):
if scheme not in self.SCHEMES:
return False
return self._scheme == '*' or self._scheme == scheme
def _matches_host(self, host):
# FIXME what about multiple dots?
host = host.rstrip('.')
# If the hosts are exactly equal, we have a match.
if host == self._host:
return True
# If we're matching subdomains, and we have no host in the match pattern,
# that means that we're matching all hosts, which means we have a match no
# matter what the test host is.
if self._match_subdomains and not self._host:
return True
# Otherwise, we can only match if our match pattern matches subdomains.
if not self._match_subdomains:
return False
# FIXME
# We don't do subdomain matching against IP addresses, so we can give up now
# if the test host is an IP address.
# if (test.HostIsIPAddress())
# return false;
# Check if the test host is a subdomain of our host.
if len(host) <= (len(self._host) + 1):
return False
if not host.endswith(self._host):
return False
return host[len(host) - len(self._host) - 1] == '.'
def _matches_port(self, port):
if port == '-1': # QUrl
port = None
return self._port == '*' or self._port == port
def _matches_path(self, path):
# Match 'google.com' with 'google.com/'
# FIXME use the no-copy approach Chromium has in URLPattern::MatchesPath
# for performance?
if path + '/*' == self._path:
return True
# FIXME Chromium seems to have a more optimized glob matching which
# doesn't rely on regexes. Do we need that too?
return fnmatch.fnmatchcase(path, self._path)
def matches(self, qurl):
    """Check if the pattern matches the given QUrl.

    Args:
        qurl: The URL to test; its scheme(), host(), port() and path() are
              queried.

    Return:
        True if the scheme matches and either the pattern matches all URLs
        or host, port and path all match. False otherwise.
    """
    # FIXME do we need to check this early?
    scheme_ok = self._matches_scheme(qurl.scheme())
    if not scheme_ok:
        return False

    if self._match_all:
        return True

    # FIXME ignore host for file:// like Chromium?
    # Checked in order host -> port -> path, stopping at the first mismatch.
    if not self._matches_host(qurl.host()):
        return False
    if not self._matches_port(qurl.port()):
        return False
    return self._matches_path(qurl.path())

View File

@ -25,6 +25,8 @@ https://cs.chromium.org/chromium/src/extensions/common/url_pattern_unittest.cc
import pytest
from PyQt5.QtCore import QUrl
from qutebrowser.utils import urlmatch
@ -87,3 +89,24 @@ def test_invalid_patterns(pattern, error):
def test_port(pattern, port):
    """Check that parsing the pattern stores the expected port."""
    parsed_port = urlmatch.UrlPattern(pattern)._port
    assert parsed_port == port
def test_match_all_pages_for_given_scheme_attrs():
    """Check the attributes parsed from a pattern matching every http page."""
    pattern = urlmatch.UrlPattern("http://*/*")

    # Scheme and host parts
    assert pattern._scheme == 'http'
    assert pattern._host == ''  # FIXME '' or None?
    assert pattern._match_subdomains

    # Only the scheme is restricted, so this is not a match-all pattern.
    assert not pattern._match_all
    assert pattern._path == '/*'
@pytest.mark.parametrize('url, expected', [
    ("http://google.com", True),
    ("http://yahoo.com", True),
    ("http://google.com/foo", True),
    ("https://google.com", False),
    ("http://74.125.127.100/search", True),
])
def test_match_all_pages_for_given_scheme_urls(url, expected):
    """Check which URLs are matched by a pattern covering every http page."""
    pattern = urlmatch.UrlPattern("http://*/*")
    result = pattern.matches(QUrl(url))
    assert result == expected