urlmatch: Implement initial matching

2018-02-15 15:39:03 +01:00 · 2018-02-15 15:39:03 +01:00 · 2b274f8e0b
commit 2b274f8e0b
parent 3d6cbcf396
2 changed files with 95 additions and 1 deletions
--- a/qutebrowser/utils/urlmatch.py
+++ b/qutebrowser/utils/urlmatch.py
@ -25,6 +25,7 @@ https://cs.chromium.org/chromium/src/extensions/common/url_pattern.cc
 https://cs.chromium.org/chromium/src/extensions/common/url_pattern.h
 """

+import fnmatch
+import ipaddress
 import contextlib
 import urllib.parse

@ -114,10 +115,12 @@ class UrlPattern:
            assert self._host is None
            return

-        host_parts = parsed.hostname.split('.')
+        # FIXME what about multiple dots?
+        host_parts = parsed.hostname.rstrip('.').split('.')
        if host_parts[0] == '*':
            host_parts = host_parts[1:]
            self._match_subdomains = True
+
        self._host = '.'.join(host_parts)

        if self._host.endswith('.*'):
@ -153,3 +156,71 @@ class UrlPattern:

    def __repr__(self):
+        """Get a repr via utils.get_repr(), passing self._pattern."""
        return utils.get_repr(self, pattern=self._pattern, constructor=True)
+
+    def _matches_scheme(self, scheme):
+        """Check whether the given URL scheme is accepted by this pattern.
+
+        A scheme which is not in SCHEMES never matches. Otherwise, the
+        pattern's scheme has to be the '*' wildcard or equal to the given
+        scheme.
+        """
+        if scheme in self.SCHEMES:
+            return self._scheme in ('*', scheme)
+        return False
+
+    def _matches_host(self, host):
+        """Check whether the given host is matched by the pattern's host.
+
+        Args:
+            host: The host of the URL to test, as a string. Trailing dots
+                  are ignored.
+
+        Return:
+            True if the host equals the pattern's host, if the pattern
+            matches all hosts (subdomain matching with an empty host), or if
+            subdomain matching is enabled and the host is a subdomain of the
+            pattern's host. False otherwise - in particular, IP addresses
+            are never matched as "subdomains".
+        """
+        # FIXME what about multiple dots?
+        host = host.rstrip('.')
+
+        # If the hosts are exactly equal, we have a match.
+        if host == self._host:
+            return True
+
+        # If we're matching subdomains, and we have no host in the match pattern,
+        # that means that we're matching all hosts, which means we have a match no
+        # matter what the test host is.
+        if self._match_subdomains and not self._host:
+            return True
+
+        # Otherwise, we can only match if our match pattern matches subdomains.
+        if not self._match_subdomains:
+            return False
+
+        # We don't do subdomain matching against IP addresses (like Chromium),
+        # so we can give up now if the test host is an IP address. Without
+        # this, a pattern host like '0.0.1' would suffix-match '127.0.0.1'.
+        try:
+            ipaddress.ip_address(host)
+        except ValueError:
+            pass  # Not an IP address, continue with the subdomain check.
+        else:
+            return False
+
+        # Check if the test host is a subdomain of our host: It needs to end
+        # with '.<pattern host>' and have at least one character before that.
+        suffix = '.' + self._host
+        return len(host) > len(suffix) and host.endswith(suffix)
+
+    def _matches_port(self, port):
+        """Check whether the given port is matched by the pattern's port.
+
+        Args:
+            port: The port of the URL to test. QUrl.port() returns the
+                  integer -1 if the URL has no port set; that value (and,
+                  for backwards compatibility, the string '-1') is
+                  normalized to None before comparing.
+
+        Return:
+            True if the pattern's port is the '*' wildcard or equal to the
+            normalized port, False otherwise.
+        """
+        if port in (-1, '-1'):  # QUrl: no port set
+            port = None
+        return self._port == '*' or self._port == port
+
+    def _matches_path(self, path):
+        """Check whether the given path is matched by the pattern's path.
+
+        The pattern's path is used as a case-sensitive glob via
+        fnmatch.fnmatchcase. As a special case, a pattern path consisting of
+        the given path followed by '/*' matches as well, so that
+        'google.com' is matched like 'google.com/'.
+        """
+        glob = self._path
+
+        # FIXME use the no-copy approach Chromium has in URLPattern::MatchesPath
+        # for performance?
+        # FIXME Chromium seems to have a more optimized glob matching which
+        # doesn't rely on regexes. Do we need that too?
+        return (path + '/*' == glob or
+                fnmatch.fnmatchcase(path, glob))
+
+    def matches(self, qurl):
+        """Check if the pattern matches the given QUrl.
+
+        The scheme is always checked first. A pattern with _match_all set
+        then matches unconditionally; otherwise host, port and path all
+        need to match (checked in that order, stopping at the first
+        mismatch).
+        """
+        # FIXME do we need to check this early?
+        scheme_ok = self._matches_scheme(qurl.scheme())
+        if not scheme_ok:
+            return False
+
+        if self._match_all:
+            return True
+
+        # FIXME ignore host for file:// like Chromium?
+        if not self._matches_host(qurl.host()):
+            return False
+        if not self._matches_port(qurl.port()):
+            return False
+        return self._matches_path(qurl.path())
--- a/tests/unit/utils/test_urlmatch.py
+++ b/tests/unit/utils/test_urlmatch.py
@ -25,6 +25,8 @@ https://cs.chromium.org/chromium/src/extensions/common/url_pattern_unittest.cc

 import pytest

+from PyQt5.QtCore import QUrl
+
 from qutebrowser.utils import urlmatch


@ -87,3 +89,24 @@ def test_invalid_patterns(pattern, error):
 def test_port(pattern, port):
    up = urlmatch.UrlPattern(pattern)
    assert up._port == port
+
+
+def test_match_all_pages_for_given_scheme_attrs():
+    """Check the attributes parsed from a pattern for all http pages."""
+    pattern = urlmatch.UrlPattern("http://*/*")
+
+    # Scheme and path are taken over from the pattern as-is.
+    assert pattern._scheme == 'http'
+    assert pattern._path == '/*'
+
+    # The '*' host turns into an empty host with subdomain matching.
+    assert pattern._host == ''  # FIXME '' or None?
+    assert pattern._match_subdomains
+    assert not pattern._match_all
+
+
+@pytest.mark.parametrize('url, expected', [
+    ("http://google.com", True),
+    ("http://yahoo.com", True),
+    ("http://google.com/foo", True),
+    ("https://google.com", False),
+    ("http://74.125.127.100/search", True),
+])
+def test_match_all_pages_for_given_scheme_urls(url, expected):
+    """Check which URLs are matched by a pattern for all http pages."""
+    pattern = urlmatch.UrlPattern("http://*/*")
+    matched = pattern.matches(QUrl(url))
+    assert matched == expected