From 9ac2dbcc80330c6090ecdce656046931e8cf591b Mon Sep 17 00:00:00 2001
From: Florian Bruhin <git@the-compiler.org>
Date: Tue, 4 Jul 2017 10:16:27 +0200
Subject: [PATCH] Disallow surrogate escapes in dicts and lists in the config

In Dict.to_str() and List.to_str() we use json.dump to serialize the value.
However, json.dump escapes characters outside the BMP as UTF-16 surrogate
pairs, which breaks round trips when the result is loaded as YAML:

>>> yaml.load(json.dumps({'\U00010000': True}))
{'\ud800\udc00': True}

>>> yaml.load(json.dumps({'\U00010000': True}, ensure_ascii=False))
yaml.reader.ReaderError: unacceptable character #x10000: special characters are not allowed

See:
https://stackoverflow.com/a/38552626/2085149
https://news.ycombinator.com/item?id=12798032
---
 qutebrowser/config/configtypes.py     | 17 +++++++++++++++++
 tests/unit/config/test_configtypes.py |  6 +++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/qutebrowser/config/configtypes.py b/qutebrowser/config/configtypes.py
index e266bd38a..ca01cfd60 100644
--- a/qutebrowser/config/configtypes.py
+++ b/qutebrowser/config/configtypes.py
@@ -181,6 +181,17 @@ class BaseType:
             raise configexc.ValidationError(
                 value, "may not contain unprintable chars!")
 
+    def _validate_surrogate_escapes(self, full_value, value):
+        """Raise ValidationError for full_value if value has chars > U+FFFF.
+
+        Non-str values are skipped; used for values passed to json.dump.
+        """
+        if not isinstance(value, str):
+            return
+        if any(ord(c) > 0xFFFF for c in value):
+            raise configexc.ValidationError(
+                full_value, "may not contain surrogate escapes!")
+
     def _validate_valid_values(self, value):
         """Validate value against possible values.
 
@@ -418,6 +429,9 @@ class List(BaseType):
         if not value:
             return []
 
+        for val in value:
+            self._validate_surrogate_escapes(value, val)
+
         if self.length is not None and len(value) != self.length:
             raise configexc.ValidationError(value, "Exactly {} values need to "
                                             "be set!".format(self.length))
@@ -1089,6 +1103,9 @@ class Dict(BaseType):
             return self._fill_fixed_keys({})
 
         self._validate_keys(value)
+        for key, val in value.items():
+            self._validate_surrogate_escapes(value, key)
+            self._validate_surrogate_escapes(value, val)
 
         d = {self.keytype.to_py(key): self.valtype.to_py(val)
              for key, val in value.items()}
diff --git a/tests/unit/config/test_configtypes.py b/tests/unit/config/test_configtypes.py
index 05f0f1922..9aa91b372 100644
--- a/tests/unit/config/test_configtypes.py
+++ b/tests/unit/config/test_configtypes.py
@@ -429,6 +429,8 @@ class TestString:
         ({'minlen': 2, 'maxlen': 3}, 'abc'),
         # valid_values
         ({'valid_values': configtypes.ValidValues('abcd')}, 'abcd'),
+        # Surrogate escapes are allowed in strings
+        ({}, '\U00010000'),
     ])
     def test_to_py(self, klass, kwargs, val):
         assert klass(**kwargs).to_py(val) == val
@@ -535,7 +537,7 @@ class TestList:
     def test_to_py(self, klass, val):
         assert klass().to_py(val) == val
 
-    @pytest.mark.parametrize('val', [[42], '["foo"]'])
+    @pytest.mark.parametrize('val', [[42], '["foo"]', ['\U00010000']])
     def test_to_py_invalid(self, klass, val):
         with pytest.raises(configexc.ValidationError):
             klass().to_py(val)
@@ -1427,6 +1429,8 @@ class TestDict:
         assert klass(keytype=keytype, valtype=valtype).to_py(val) == val
 
     @pytest.mark.parametrize('val', [
+        {'\U00010000': 'foo'},  # UTF-16 surrogate in key
+        {'foo': '\U00010000'},  # UTF-16 surrogate in value
         {0: 'foo'},  # Invalid key type
         {'foo': 0},  # Invalid value type
     ])