Merge branch 'shlex'

Fixes #122. Fixes #232.
2014-11-09 18:25:51 +01:00 · 2014-11-09 18:25:51 +01:00 · f689c89316
commit f689c89316
parent 31ee458e56 61943b5a77
8 changed files with 332 additions and 134 deletions
--- a/qutebrowser/commands/runners.py
+++ b/qutebrowser/commands/runners.py
@ -24,7 +24,7 @@ from PyQt5.QtWebKitWidgets import QWebPage

 from qutebrowser.config import config
 from qutebrowser.commands import cmdexc, cmdutils
-from qutebrowser.utils import message, log, utils, objreg
+from qutebrowser.utils import message, log, utils, objreg, split


 def replace_variables(win_id, arglist):
@ -190,7 +190,8 @@ class CommandRunner(QObject):
            new_cmd += ' '
        return new_cmd

-    def parse(self, text, aliases=True, fallback=False, alias_no_args=True):
+    def parse(self, text, aliases=True, fallback=False, alias_no_args=True,
+              keep=False):
        """Split the commandline text into command and arguments.

        Args:
@ -199,18 +200,14 @@ class CommandRunner(QObject):
            fallback: Whether to do a fallback splitting when the command was
                      unknown.
            alias_no_args: Whether to apply an alias if there are no arguments.
+            keep: Whether to keep special chars and whitespace

        Return:
            A split string commandline, e.g ['open', 'www.google.com']
        """
-        parts = text.strip().split(maxsplit=1)
-        if not parts:
+        cmdstr, sep, argstr = text.partition(' ')
+        if not cmdstr:
            raise cmdexc.NoSuchCommandError("No command given")
-        elif len(parts) > 1:
-            cmdstr, argstr = parts
-        else:
-            cmdstr = parts[0]
-            argstr = None
        if aliases:
            new_cmd = self._get_alias(text, alias_no_args)
            if new_cmd is not None:
@ -220,25 +217,35 @@ class CommandRunner(QObject):
            self._cmd = cmdutils.cmd_dict[cmdstr]
        except KeyError:
            if fallback:
-                parts = text.split(' ')
-                if text.endswith(' '):
-                    parts.append('')
-                return parts
+                # FIXME test this
+                cmdstr, sep, argstr = text.partition(' ')
+                return [cmdstr, sep] + argstr.split(' ')
            else:
                raise cmdexc.NoSuchCommandError(
                    '{}: no such command'.format(cmdstr))
-        self._split_args(argstr)
+        self._split_args(argstr, keep)
        retargs = self._args[:]
-        if text.endswith(' '):
-            retargs.append('')
-        return [cmdstr] + retargs
+        if keep and retargs:
+            return [cmdstr, sep + retargs[0]] + retargs[1:]
+        elif keep:
+            return [cmdstr, sep]
+        else:
+            return [cmdstr] + retargs

-    def _split_args(self, argstr):
-        """Split the arguments from an arg string."""
-        if argstr is None:
+    def _split_args(self, argstr, keep):
+        """Split the arguments from an arg string.
+
+        Args:
+            argstr: An argument string.
+            keep: Whether to keep special chars and whitespace
+
+        Return:
+            A list containing the split strings.
+        """
+        if not argstr:
            self._args = []
        elif self._cmd.split:
-            self._args = utils.safe_shlex_split(argstr)
+            self._args = split.split(argstr, keep=keep)
        else:
            # If split=False, we still want to split the flags, but not
            # everything after that.
--- a/qutebrowser/test/utils/test_split.py
+++ b/qutebrowser/test/utils/test_split.py
@ -0,0 +1,131 @@
+# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et:
+
+# Copyright 2014 Florian Bruhin (The Compiler) <mail@qutebrowser.org>
+#
+# This file is part of qutebrowser.
+#
+# qutebrowser is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# qutebrowser is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with qutebrowser.  If not, see <http://www.gnu.org/licenses/>.
+
+"""Tests for qutebrowser.utils.split."""
+
+import unittest
+
+from qutebrowser.utils import split
+
+
+# Most tests copied from Python's shlex.
+# The original test data set was from shellwords, by Hartmut Goebel.
+
+# Format: input/split|output|without|keep/split|output|with|keep/
+
+test_data = r"""
+one two/one|two/one| two/
+one "two three" four/one|two three|four/one| "two three"| four/
+one 'two three' four/one|two three|four/one| 'two three'| four/
+one "two\" three" four/one|two" three|four/one| "two\" three"| four/
+one 'two'\'' three' four/one|two' three|four/one| 'two'\'' three'| four/
+one "two three/one|two three/one| "two three/
+one 'two three/one|two three/one| 'two three/
+one\/one\/one\/
+one "two\/one|two\/one| "two\/
+one /one/one| /
+foo bar/foo|bar/foo| bar/
+ foo bar/foo|bar/ foo| bar/
+ foo bar /foo|bar/ foo| bar| /
+foo   bar    bla     fasel/foo|bar|bla|fasel/foo|   bar|    bla|     fasel/
+x y  z              xxxx/x|y|z|xxxx/x| y|  z|              xxxx/
+\x bar/x|bar/\x| bar/
+\ x bar/ x|bar/\ x| bar/
+\ bar/ bar/\ bar/
+foo \x bar/foo|x|bar/foo| \x| bar/
+foo \ x bar/foo| x|bar/foo| \ x| bar/
+foo \ bar/foo| bar/foo| \ bar/
+foo "bar" bla/foo|bar|bla/foo| "bar"| bla/
+"foo" "bar" "bla"/foo|bar|bla/"foo"| "bar"| "bla"/
+"foo" bar "bla"/foo|bar|bla/"foo"| bar| "bla"/
+"foo" bar bla/foo|bar|bla/"foo"| bar| bla/
+foo 'bar' bla/foo|bar|bla/foo| 'bar'| bla/
+'foo' 'bar' 'bla'/foo|bar|bla/'foo'| 'bar'| 'bla'/
+'foo' bar 'bla'/foo|bar|bla/'foo'| bar| 'bla'/
+'foo' bar bla/foo|bar|bla/'foo'| bar| bla/
+blurb foo"bar"bar"fasel" baz/blurb|foobarbarfasel|baz/blurb| foo"bar"bar"fasel"| baz/
+blurb foo'bar'bar'fasel' baz/blurb|foobarbarfasel|baz/blurb| foo'bar'bar'fasel'| baz/
+""//""/
+''//''/
+foo "" bar/foo||bar/foo| ""| bar/
+foo '' bar/foo||bar/foo| ''| bar/
+foo "" "" "" bar/foo||||bar/foo| ""| ""| ""| bar/
+foo '' '' '' bar/foo||||bar/foo| ''| ''| ''| bar/
+\"/"/\"/
+"\""/"/"\""/
+"foo\ bar"/foo\ bar/"foo\ bar"/
+"foo\\ bar"/foo\ bar/"foo\\ bar"/
+"foo\\ bar\""/foo\ bar"/"foo\\ bar\""/
+"foo\\" bar\"/foo\|bar"/"foo\\"| bar\"/
+"foo\\ bar\" dfadf"/foo\ bar" dfadf/"foo\\ bar\" dfadf"/
+"foo\\\ bar\" dfadf"/foo\\ bar" dfadf/"foo\\\ bar\" dfadf"/
+"foo\\\x bar\" dfadf"/foo\\x bar" dfadf/"foo\\\x bar\" dfadf"/
+"foo\x bar\" dfadf"/foo\x bar" dfadf/"foo\x bar\" dfadf"/
+\'/'/\'/
+'foo\ bar'/foo\ bar/'foo\ bar'/
+'foo\\ bar'/foo\\ bar/'foo\\ bar'/
+"foo\\\x bar\" df'a\ 'df"/foo\\x bar" df'a\ 'df/"foo\\\x bar\" df'a\ 'df"/
+\"foo/"foo/\"foo/
+\"foo\x/"foox/\"foo\x/
+"foo\x"/foo\x/"foo\x"/
+"foo\ "/foo\ /"foo\ "/
+foo\ xx/foo xx/foo\ xx/
+foo\ x\x/foo xx/foo\ x\x/
+foo\ x\x\"/foo xx"/foo\ x\x\"/
+"foo\ x\x"/foo\ x\x/"foo\ x\x"/
+"foo\ x\x\\"/foo\ x\x\/"foo\ x\x\\"/
+"foo\ x\x\\""foobar"/foo\ x\x\foobar/"foo\ x\x\\""foobar"/
+"foo\ x\x\\"\'"foobar"/foo\ x\x\'foobar/"foo\ x\x\\"\'"foobar"/
+"foo\ x\x\\"\'"fo'obar"/foo\ x\x\'fo'obar/"foo\ x\x\\"\'"fo'obar"/
+"foo\ x\x\\"\'"fo'obar" 'don'\''t'/foo\ x\x\'fo'obar|don't/"foo\ x\x\\"\'"fo'obar"| 'don'\''t'/
+"foo\ x\x\\"\'"fo'obar" 'don'\''t' \\/foo\ x\x\'fo'obar|don't|\/"foo\ x\x\\"\'"fo'obar"| 'don'\''t'| \\/
+'foo\ bar'/foo\ bar/'foo\ bar'/
+'foo\\ bar'/foo\\ bar/'foo\\ bar'/
+foo\ bar/foo bar/foo\ bar/
+:-) ;-)/:-)|;-)/:-)| ;-)/
+áéíóú/áéíóú/áéíóú/
+"""
+
+class SplitTests(unittest.TestCase):
+
+    """Data-driven tests for split.split based on test_data."""
+
+    def test_split(self):
+        """Check the result of splitting without keep."""
+        for line in test_data.strip().splitlines():
+            text, *expected = line.split('/')[:-1]
+            with self.subTest(cmd=text):
+                self.assertEqual(split.split(text),
+                                 expected[0].split('|'))
+
+    def test_split_keep_original(self):
+        """Check that joining the keep=True output restores the input."""
+        for line in test_data.strip().splitlines():
+            text, *_expected = line.split('/')[:-1]
+            with self.subTest(cmd=text):
+                joined = ''.join(split.split(text, keep=True))
+                self.assertEqual(joined, text)
+
+    def test_split_keep(self):
+        """Check the result of splitting with keep=True."""
+        for line in test_data.strip().splitlines():
+            text, *expected = line.split('/')[:-1]
+            with self.subTest(cmd=text):
+                self.assertEqual(split.split(text, keep=True),
+                                 expected[1].split('|'))
--- a/qutebrowser/test/utils/test_utils.py
+++ b/qutebrowser/test/utils/test_utils.py
@ -110,56 +110,6 @@ class DottedGetattrTests(unittest.TestCase):
            _ = utils.dotted_getattr(self, 'test.foo.baz')


-class SafeShlexSplitTests(unittest.TestCase):
-
-    """Test safe_shlex_split."""
-
-    def test_normal(self):
-        """Test safe_shlex_split with a simple string."""
-        items = utils.safe_shlex_split('one two')
-        self.assertEqual(items, ['one', 'two'])
-
-    def test_quoted(self):
-        """Test safe_shlex_split with a normally quoted string."""
-        items = utils.safe_shlex_split('one "two three" four')
-        self.assertEqual(items, ['one', 'two three', 'four'])
-
-    def test_single_quoted(self):
-        """Test safe_shlex_split with a single quoted string."""
-        items = utils.safe_shlex_split("one 'two three' four")
-        self.assertEqual(items, ['one', 'two three', 'four'])
-
-    def test_escaped(self):
-        """Test safe_shlex_split with a normal escaped string."""
-        items = utils.safe_shlex_split(r'one "two\" three" four')
-        self.assertEqual(items, ['one', 'two" three', 'four'])
-
-    def test_escaped_single(self):
-        """Test safe_shlex_split with a single escaped string."""
-        items = utils.safe_shlex_split(r"one 'two'\'' three' four")
-        self.assertEqual(items, ['one', "two' three", 'four'])
-
-    def test_unbalanced_quotes(self):
-        """Test safe_shlex_split with unbalanded quotes."""
-        items = utils.safe_shlex_split(r'one "two three')
-        self.assertEqual(items, ['one', 'two three'])
-
-    def test_unbalanced_single_quotes(self):
-        """Test safe_shlex_split with unbalanded single quotes."""
-        items = utils.safe_shlex_split(r"one 'two three")
-        self.assertEqual(items, ['one', "two three"])
-
-    def test_unfinished_escape(self):
-        """Test safe_shlex_split with an unfinished escape."""
-        items = utils.safe_shlex_split('one\\')
-        self.assertEqual(items, ['one\\'])
-
-    def test_both(self):
-        """Test safe_shlex_split with an unfinished escape and quotes.."""
-        items = utils.safe_shlex_split('one "two\\')
-        self.assertEqual(items, ['one', 'two\\'])
-
-
 class InterpolateColorTests(unittest.TestCase):

    """Tests for interpolate_color.
--- a/qutebrowser/utils/completer.py
+++ b/qutebrowser/utils/completer.py
@ -174,9 +174,12 @@ class Completer(QObject):
        Return:
            A completion model.
        """
-        if parts[cursor_part].startswith('-'):
-            # cursor on a flag
-            return
+        try:
+            if parts[cursor_part].startswith('-'):
+                # cursor on a flag
+                return
+        except IndexError:
+            pass
        parts, cursor_part = self._filter_cmdline_parts(parts, cursor_part)
        if cursor_part == 0:
            # '|' or 'set|'
@ -302,7 +305,10 @@ class Completer(QObject):
                self._parts))
            return

-        pattern = self._parts[self._cursor_part] if self._parts else ''
+        try:
+            pattern = self._parts[self._cursor_part] if self._parts else ''
+        except IndexError:
+            pattern = ''
        self._model().set_pattern(pattern)

        log.completion.debug(
--- a/qutebrowser/utils/log.py
+++ b/qutebrowser/utils/log.py
@ -125,6 +125,7 @@ qt = logging.getLogger('qt')  # Warnings produced by Qt
 style = logging.getLogger('style')
 rfc6266 = logging.getLogger('rfc6266')
 ipc = logging.getLogger('ipc')
+shlexer = logging.getLogger('shlexer')


 ram_handler = None
--- a/qutebrowser/utils/split.py
+++ b/qutebrowser/utils/split.py
@ -0,0 +1,146 @@
+# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et:
+
+# Copyright 2014 Florian Bruhin (The Compiler) <mail@qutebrowser.org>
+#
+# This file is part of qutebrowser.
+#
+# qutebrowser is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# qutebrowser is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with qutebrowser.  If not, see <http://www.gnu.org/licenses/>.
+
+"""Our own fork of shlex.split with some added and removed features."""
+
+from qutebrowser.utils import log
+
+
+class ShellLexer:
+
+    """A lexical analyzer class for simple shell-like syntaxes.
+
+    Based on Python's shlex, but cleaned up, removed some features, and added
+    some features useful for qutebrowser.
+
+    Attributes:
+        string: The string to tokenize.
+        whitespace: The characters which separate tokens.
+        quotes: The characters which start/end a quoted section.
+        escape: The characters which escape the following character.
+        escapedquotes: The quote characters inside which escapes are handled.
+        keep: Whether quotes, escape characters and whitespace are kept in
+              the yielded tokens.
+        quoted: Whether a quote was seen while reading the current token.
+        escapedstate: The state to go back to after an escaped character.
+        token: The token which is currently being built.
+        state: The current state: ' ' between tokens, 'a' inside a word, a
+               quote character inside quotes or the escape character directly
+               after an escape.
+
+    quoted, escapedstate, token and state are only set by reset(), which is
+    called when iteration starts.
+    """
+
+    def __init__(self, s):
+        """Store the string to tokenize and set the default characters.
+
+        Args:
+            s: The string to tokenize.
+        """
+        self.string = s
+        self.whitespace = ' \t\r'
+        self.quotes = '\'"'
+        self.escape = '\\'
+        self.escapedquotes = '"'
+        self.keep = False
+
+    def reset(self):
+        """Reset the per-token state so a new token can be read."""
+        self.quoted = False
+        self.escapedstate = ' '
+        self.token = ''
+        self.state = ' '
+
+    def __iter__(self):
+        """Yield the tokens of the string one after another.
+
+        An unfinished quote or escape at the end of the string does not
+        raise, whatever was read so far is yielded as the last token.
+        """
+        self.reset()
+        for nextchar in self.string:
+            log.shlexer.vdebug("in state {!r} I see character: {!r}".format(
+                self.state, nextchar))
+            if self.state == ' ':
+                # Between two tokens.
+                if self.keep:
+                    self.token += nextchar
+                if nextchar in self.whitespace:
+                    log.shlexer.vdebug("I see whitespace in whitespace state")
+                    # The token is only non-empty here if keep is set, in
+                    # which case the whitespace char is yielded on its own.
+                    if self.token or self.quoted:
+                        yield self.token
+                        self.reset()
+                elif nextchar in self.escape:
+                    self.escapedstate = 'a'
+                    self.state = nextchar
+                elif nextchar in self.quotes:
+                    self.state = nextchar
+                else:
+                    self.token = nextchar
+                    self.state = 'a'
+            elif self.state in self.quotes:
+                # Inside a quoted section, self.state is the opening quote.
+                # Setting quoted makes sure an empty quoted string still
+                # yields an (empty) token.
+                self.quoted = True
+                if nextchar == self.state:
+                    # Closing quote, continue in word state.
+                    if self.keep:
+                        self.token += nextchar
+                    self.state = 'a'
+                elif (nextchar in self.escape and
+                        self.state in self.escapedquotes):
+                    if self.keep:
+                        self.token += nextchar
+                    self.escapedstate = self.state
+                    self.state = nextchar
+                else:
+                    self.token += nextchar
+            elif self.state in self.escape:
+                # In posix shells, only the quote itself or the escape
+                # character may be escaped within quotes.
+                # For any other char the escape char is taken literally. With
+                # keep set it was already added when it was read.
+                if (self.escapedstate in self.quotes and
+                        nextchar != self.state and
+                        nextchar != self.escapedstate and not self.keep):
+                    self.token += self.state
+                self.token += nextchar
+                self.state = self.escapedstate
+            elif self.state == 'a':
+                # Inside a word.
+                if nextchar in self.whitespace:
+                    log.shlexer.vdebug("shlex: I see whitespace in word state")
+                    self.state = ' '
+                    if self.token or self.quoted:
+                        yield self.token
+                        self.reset()
+                    # With keep set the separator is yielded as a token of
+                    # its own.
+                    if self.keep:
+                        yield nextchar
+                elif nextchar in self.quotes:
+                    if self.keep:
+                        self.token += nextchar
+                    self.state = nextchar
+                elif nextchar in self.escape:
+                    if self.keep:
+                        self.token += nextchar
+                    self.escapedstate = 'a'
+                    self.state = nextchar
+                else:
+                    self.token += nextchar
+        # The string ended directly after an escape char: take it literally.
+        # With keep set it already is part of the token.
+        if self.state in self.escape and not self.keep:
+            self.token += self.state
+        if self.token or self.quoted:
+            yield self.token
+
+
+def split(s, keep=False):
+    """Split a string via ShellLexer.
+
+    Args:
+        s: The string to split.
+        keep: Whether to keep special chars (quotes, escape chars) and
+              whitespace in the split output.
+
+    Return:
+        A list of strings. With keep=True, whitespace between two parts is
+        prepended to the part following it and trailing whitespace becomes
+        a part of its own, so the parts are meant to join back to s.
+    """
+    lexer = ShellLexer(s)
+    lexer.keep = keep
+    tokens = list(lexer)
+    if not tokens:
+        return []
+
+    log.shlexer.vdebug("{!r} -> {!r}".format(s, tokens))
+
+    if not keep:
+        # Without keep the lexer never yields separators, so a token which
+        # consists only of whitespace was quoted or escaped by the user
+        # (e.g. '" " foo') and has to stay an argument of its own.
+        return tokens
+
+    # With keep the lexer yields every separator as a token of its own; glue
+    # those to the token following them.
+    out = []
+    spaces = ""
+    for t in tokens:
+        if t.isspace():
+            spaces += t
+        else:
+            out.append(spaces + t)
+            spaces = ""
+    if spaces:
+        out.append(spaces)
+
+    return out
--- a/qutebrowser/utils/utils.py
+++ b/qutebrowser/utils/utils.py
@ -22,7 +22,6 @@
 import io
 import sys
 import enum
-import shlex
 import inspect
 import os.path
 import urllib.request
@ -99,54 +98,6 @@ def dotted_getattr(obj, path):
    return functools.reduce(getattr, path.split('.'), obj)


-def _get_lexer(s):
-    """Get an shlex lexer for safe_shlex_split."""
-    if s is None:
-        raise TypeError("Refusing to create a lexer with s=None!")
-    lexer = shlex.shlex(s, posix=True)
-    lexer.whitespace_split = True
-    lexer.commenters = ''
-    return lexer
-
-
-def safe_shlex_split(s):
-    r"""Split a string via shlex safely (don't bail out on unbalanced quotes).
-
-    We split while the user is typing (for completion), and as
-    soon as ", ' or \ is typed, the string is invalid for shlex,
-    because it encounters EOF while in quote/escape state.
-
-    Here we fix this error temporarily so shlex doesn't blow up,
-    and then retry splitting again.
-
-    Since shlex raises ValueError in both cases we unfortunately
-    have to parse the exception string...
-
-    We try 3 times so multiple errors can be fixed.
-    """
-    orig_s = s
-    for i in range(3):
-        lexer = _get_lexer(s)
-        try:
-            tokens = list(lexer)
-        except ValueError as e:
-            if str(e) not in ("No closing quotation", "No escaped character"):
-                raise
-            # eggs "bacon ham -> eggs "bacon ham"
-            # eggs\ -> eggs\\
-            if lexer.state not in lexer.escape + lexer.quotes:
-                raise AssertionError(
-                    "Lexer state is >{}< while parsing >{}< (attempted fixup: "
-                    ">{}<)".format(lexer.state, orig_s, s))
-            s += lexer.state
-        else:
-            return tokens
-    # We should never arrive here.
-    raise AssertionError(
-        "Gave up splitting >{}< after {} tries. Attempted fixup: >{}<.".format(
-            orig_s, i, s))  # pylint: disable=undefined-loop-variable
-
-
 def pastebin(name, title, text, parent=None):
    """Paste the text into a pastebin and return the URL.

--- a/qutebrowser/widgets/statusbar/command.py
+++ b/qutebrowser/widgets/statusbar/command.py
@ -87,8 +87,12 @@ class Command(misc.MinimalLineEditMixin, misc.CommandLineEdit):
        else:
            return ''

-    def split(self):
-        """Get the text split up in parts."""
+    def split(self, keep=False):
+        """Get the text split up in parts.
+
+        Args:
+            keep: Whether to keep special chars and whitespace.
+        """
        text = self.text()[len(self.prefix()):]
        if not text:
            # When only ":" is entered, we already have one imaginary part,
@ -99,7 +103,8 @@ class Command(misc.MinimalLineEditMixin, misc.CommandLineEdit):
            # the whitespace.
            return [text]
        runner = runners.CommandRunner(self._win_id)
-        parts = runner.parse(text, fallback=True, alias_no_args=False)
+        parts = runner.parse(text, fallback=True, alias_no_args=False,
+                             keep=keep)
        if self._empty_item_idx is not None:
            log.completion.debug("Empty element queued at {}, "
                                 "inserting.".format(self._empty_item_idx))
@ -117,7 +122,7 @@ class Command(misc.MinimalLineEditMixin, misc.CommandLineEdit):
        else:
            spaces = False
        cursor_pos -= len(self.prefix())
-        parts = self.split()
+        parts = self.split(keep=True)
        log.completion.vdebug(
            "text: {}, parts: {}, cursor_pos after removing prefix '{}': "
            "{}".format(self.text(), parts, self.prefix(), cursor_pos))
@ -135,12 +140,10 @@ class Command(misc.MinimalLineEditMixin, misc.CommandLineEdit):
                                      "{}".format(cursor_pos, len(part), i,
                                                  self._empty_item_idx))
                break
-            # FIXME are spaces always 1 char?
-            # https://github.com/The-Compiler/qutebrowser/issues/122
-            cursor_pos -= (len(part) + 1)
+            cursor_pos -= len(part)
            log.completion.vdebug(
-                "Removing len({!r}) + 1 -> {} from cursor_pos -> {}".format(
-                    part, len(part) + 1, cursor_pos))
+                "Removing len({!r}) -> {} from cursor_pos -> {}".format(
+                    part, len(part), cursor_pos))
        log.completion.debug("cursor_part {}, spaces {}".format(
            self._cursor_part, spaces))
        return
@ -211,7 +214,10 @@ class Command(misc.MinimalLineEditMixin, misc.CommandLineEdit):
        parts = self.split()
        log.completion.debug("changing part {} to '{}'".format(
            self._cursor_part, newtext))
-        parts[self._cursor_part] = newtext
+        try:
+            parts[self._cursor_part] = newtext
+        except IndexError:
+            parts.append(newtext)
        # We want to place the cursor directly after the part we just changed.
        cursor_str = self.prefix() + ' '.join(parts[:self._cursor_part + 1])
        if immediate: