qutebrowser/qutebrowser/misc/split.py

# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et:

# Copyright 2014-2016 Florian Bruhin (The Compiler) <mail@qutebrowser.org>
#
# This file is part of qutebrowser.
#
# qutebrowser is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# qutebrowser is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with qutebrowser.  If not, see <http://www.gnu.org/licenses/>.

"""Our own fork of shlex.split with some added and removed features."""

import re

from qutebrowser.utils import log


class ShellLexer:

    """A lexical analyzer class for simple shell-like syntaxes.

    Based on Python's shlex, but cleaned up, removed some features, and added
    some features useful for qutebrowser.

    Iterating over an instance runs a character-by-character state machine
    over the input string and yields the tokens found.

    Attributes:
        string: The input string which gets tokenized.
        whitespace: The characters which separate tokens.
        quotes: The characters which start/end a quoted section.
        escape: The character(s) which escape the following character.
        escapedquotes: The quote characters inside which the escape character
                       is honored (only double quotes).
        keep: Whether quotes, escape characters and whitespace are kept in the
              output instead of being dropped.
        quoted: Whether the current token went through a quoted section. Used
                so an empty quoted token (e.g. '') still gets yielded.
        escapedstate: The state to return to after the character following an
                      escape character has been handled.
        token: The token collected so far.
        state: The current state of the state machine:
               ' ': Between tokens (initial state).
               'a': Inside an unquoted word.
               A char from quotes: Inside a section quoted with that char.
               A char from escape: Directly after an escape character.
    """

    def __init__(self, s):
        self.string = s
        self.whitespace = ' \t\r'
        self.quotes = '\'"'
        self.escape = '\\'
        self.escapedquotes = '"'
        self.keep = False
        # The state attributes are declared here and get their real initial
        # values from reset().
        self.quoted = None
        self.escapedstate = None
        self.token = None
        self.state = None
        self.reset()

    def reset(self):
        """Reset the state machine state to the defaults."""
        self.quoted = False
        self.escapedstate = ' '
        self.token = ''
        self.state = ' '

    def __iter__(self):  # pragma: no mccabe
        """Tokenize self.string and yield the tokens one by one.

        With keep set, whitespace characters are yielded as tokens of their
        own, and quotes/escape characters stay part of the tokens.

        An unterminated quote is not an error; whatever was collected until
        the end of the string is yielded as the last token.
        """
        # pylint: disable=too-many-branches,too-many-statements
        self.reset()
        for nextchar in self.string:
            if self.state == ' ':
                # Between tokens. In keep mode every character is preserved,
                # including whitespace, quotes and escape characters.
                if self.keep:
                    self.token += nextchar
                if nextchar in self.whitespace:
                    # Without keep the token is still empty here, so nothing
                    # is yielded and the whitespace is skipped. With keep the
                    # token is the whitespace char itself.
                    if self.token or self.quoted:
                        yield self.token
                        self.reset()
                elif nextchar in self.escape:
                    # Escape at the start of a word: continue as unquoted word
                    # after the escaped character.
                    self.escapedstate = 'a'
                    self.state = nextchar
                elif nextchar in self.quotes:
                    # The state is the quote char, so the matching closing
                    # quote can be found.
                    self.state = nextchar
                else:
                    # Plain assignment: in keep mode nextchar was already
                    # appended to the (empty) token above.
                    self.token = nextchar
                    self.state = 'a'
            elif self.state in self.quotes:
                # Inside a quoted section.
                self.quoted = True
                if nextchar == self.state:
                    # Closing quote, the word continues unquoted.
                    if self.keep:
                        self.token += nextchar
                    self.state = 'a'
                elif (nextchar in self.escape and
                      self.state in self.escapedquotes):
                    # Escape character inside double quotes; remember the
                    # quote state so we return to it afterwards.
                    if self.keep:
                        self.token += nextchar
                    self.escapedstate = self.state
                    self.state = nextchar
                else:
                    self.token += nextchar
            elif self.state in self.escape:
                # In posix shells, only the quote itself or the escape
                # character may be escaped within quotes.
                # For any other character the escape char is kept literally.
                # In keep mode it was already appended when it was read.
                if (self.escapedstate in self.quotes and
                        nextchar != self.state and
                        nextchar != self.escapedstate and not self.keep):
                    self.token += self.state
                self.token += nextchar
                self.state = self.escapedstate
            elif self.state == 'a':
                # Inside an unquoted word.
                if nextchar in self.whitespace:
                    # End of the word.
                    self.state = ' '
                    assert self.token or self.quoted
                    yield self.token
                    self.reset()
                    # In keep mode the separating whitespace is yielded as a
                    # token of its own.
                    if self.keep:
                        yield nextchar
                elif nextchar in self.quotes:
                    # A quoted section starting in the middle of a word.
                    if self.keep:
                        self.token += nextchar
                    self.state = nextchar
                elif nextchar in self.escape:
                    if self.keep:
                        self.token += nextchar
                    self.escapedstate = 'a'
                    self.state = nextchar
                else:
                    self.token += nextchar
            else:
                raise AssertionError("Invalid state {!r}!".format(self.state))
        # The string ended directly after an escape character: keep it as a
        # literal character (in keep mode it already is part of the token).
        if self.state in self.escape and not self.keep:
            self.token += self.state
        # Yield what is left over, including an empty but quoted token.
        if self.token or self.quoted:
            yield self.token


def split(s, keep=False):
    """Tokenize a string with ShellLexer and return the tokens as a list.

    Tokens consisting only of whitespace are not returned on their own, but
    get prepended to the token following them. Whitespace with nothing
    following it ends up as the last element.

    Args:
        s: The string to split.
        keep: Whether to keep special chars in the split output.

    Return:
        A list of strings.
    """
    lexer = ShellLexer(s)
    lexer.keep = keep
    tokens = list(lexer)
    if not tokens:
        return []

    log.shlexer.vdebug("{!r} -> {!r}".format(s, tokens))

    result = []
    # Whitespace tokens seen since the last non-whitespace token.
    pending = []
    for token in tokens:
        pending.append(token)
        if token.isspace():
            continue
        result.append(''.join(pending))
        pending = []
    if pending:
        # Only whitespace is left over at the end of the input.
        result.append(''.join(pending))

    return result


def _combine_ws(parts, whitespace):
    """Combine whitespace in a list with the element following it.

    Empty parts are dropped. A part counts as whitespace if all of its
    characters are contained in the given whitespace string.

    Args:
        parts: A list of strings.
        whitespace: A string containing what's considered whitespace.

    Return:
        A new list with the combined parts. Whitespace at the end with nothing
        following it is kept as the last element.
    """
    out = []
    ws = ''
    for part in parts:
        if not part:
            continue
        # Check every character instead of using "part in whitespace", which
        # is a substring test and would misclassify e.g. ' \t' depending on
        # the order of the characters in whitespace.
        if all(char in whitespace for char in part):
            ws += part
        else:
            out.append(ws + part)
            ws = ''
    if ws:
        out.append(ws)
    return out


def simple_split(s, keep=False, maxsplit=None):
    """Split a string on whitespace, optionally keeping the whitespace.

    Newlines, tabs and spaces are treated as separators, no quoting or
    escaping is handled. Empty parts are dropped from the result.

    Args:
        s: The string to split.
        keep: Whether to keep whitespace. Kept whitespace is prepended to the
              part following it.
        maxsplit: The maximum count of splits. None means no limit, 0 means
                  the string is not split at all.

    Return:
        A list of split strings.
    """
    whitespace = '\n\t '
    if maxsplit == 0:
        # re.split with maxsplit=0 splits everything, while str.split splits
        # nothing (which is the behavior we want).
        if keep:
            return [s]
        else:
            return [s.strip(whitespace)]
    elif maxsplit is None:
        # For re.split, 0 means "no limit".
        maxsplit = 0

    # maxsplit is passed as keyword argument, as passing it positionally to
    # re.split is deprecated since Python 3.13.
    if keep:
        # The capturing group makes re.split return the separators as well.
        pattern = '([' + whitespace + '])'
        parts = re.split(pattern, s, maxsplit=maxsplit)
        return _combine_ws(parts, whitespace)
    else:
        pattern = '[' + whitespace + ']'
        parts = re.split(pattern, s, maxsplit=maxsplit)
        # With a maxsplit, the last part is the unsplit rest of the string and
        # might still have trailing whitespace.
        parts[-1] = parts[-1].rstrip()
        return [p for p in parts if p]
Initial shlex fork 2014-11-03 21:27:07 +01:00			`# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et:`

Adjust copyright years. 2016-01-04 07:12:39 +01:00			`# Copyright 2014-2016 Florian Bruhin (The Compiler) <mail@qutebrowser.org>`
Initial shlex fork 2014-11-03 21:27:07 +01:00			`#`
			`# This file is part of qutebrowser.`
			`#`
			`# qutebrowser is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# qutebrowser is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with qutebrowser. If not, see <http://www.gnu.org/licenses/>.`

			`"""Our own fork of shlex.split with some added and removed features."""`

Split commands with split=False correctly. Fixes #231. 2014-11-09 20:46:21 +01:00			`import re`

Use logging for debug 2014-11-04 20:06:58 +01:00			`from qutebrowser.utils import log`

Initial shlex fork 2014-11-03 21:27:07 +01:00
			`class ShellLexer:`
shlex: Some lint cleanup 2014-11-03 21:43:34 +01:00
			`"""A lexical analyzer class for simple shell-like syntaxes.`

			`Based on Python's shlex, but cleaned up, removed some features, and added`
			`some features useful for qutebrowser.`

			`Attributes:`
			`FIXME`
			`"""`

			`def __init__(self, s):`
Use a for loop 2014-11-06 09:02:21 +01:00			`self.string = s`
Remove newline test 2014-11-05 21:44:52 +01:00			`self.whitespace = ' \t\r'`
Initial shlex fork 2014-11-03 21:27:07 +01:00			`self.quotes = '\'"'`
			`self.escape = '\\'`
			`self.escapedquotes = '"'`
Add a keep-mode to shlexer. 2014-11-05 07:41:17 +01:00			`self.keep = False`
Fix lint in split.py 2014-11-09 22:15:44 +01:00			`self.quoted = None`
			`self.escapedstate = None`
			`self.token = None`
			`self.state = None`
			`self.reset()`
Initial shlex fork 2014-11-03 21:27:07 +01:00
Use yield 2014-11-06 06:44:23 +01:00			`def reset(self):`
Various spelling fixes. 2015-03-31 20:49:29 +02:00			`"""Reset the state machine state to the defaults."""`
Use yield 2014-11-06 06:44:23 +01:00			`self.quoted = False`
			`self.escapedstate = ' '`
			`self.token = ''`
			`self.state = ' '`

Switch from flake8 to pytest-{mccabe,flakes,pep8}. 2015-06-02 20:51:06 +02:00			`def __iter__(self): # pragma: no mccabe`
shlex: Some lint cleanup 2014-11-03 21:43:34 +01:00			`"""Read a raw token from the input stream."""`
Fix lint in split.py 2014-11-09 22:15:44 +01:00			`# pylint: disable=too-many-branches,too-many-statements`
shlexer cleanup 2014-11-06 07:13:58 +01:00			`self.reset()`
Use a for loop 2014-11-06 09:02:21 +01:00			`for nextchar in self.string:`
shlexer cleanup 2014-11-06 07:13:58 +01:00			`if self.state == ' ':`
Try splitting with whitespace at next token. 2014-11-06 07:14:36 +01:00			`if self.keep:`
			`self.token += nextchar`
Use yield 2014-11-06 06:44:23 +01:00			`if nextchar in self.whitespace:`
			`if self.token or self.quoted:`
			`yield self.token`
			`self.reset()`
shlex: Assume posix=True 2014-11-03 21:35:47 +01:00			`elif nextchar in self.escape:`
Use yield 2014-11-06 06:44:23 +01:00			`self.escapedstate = 'a'`
			`self.state = nextchar`
Initial shlex fork 2014-11-03 21:27:07 +01:00			`elif nextchar in self.quotes:`
Use yield 2014-11-06 06:44:23 +01:00			`self.state = nextchar`
Initial shlex fork 2014-11-03 21:27:07 +01:00			`else:`
Use yield 2014-11-06 06:44:23 +01:00			`self.token = nextchar`
			`self.state = 'a'`
			`elif self.state in self.quotes:`
			`self.quoted = True`
			`if nextchar == self.state:`
Add a keep-mode to shlexer. 2014-11-05 07:41:17 +01:00			`if self.keep:`
Use yield 2014-11-06 06:44:23 +01:00			`self.token += nextchar`
			`self.state = 'a'`
shlex: Assume posix=True 2014-11-03 21:35:47 +01:00			`elif (nextchar in self.escape and`
Various code style improvements 2016-04-27 18:30:54 +02:00			`self.state in self.escapedquotes):`
Add a keep-mode to shlexer. 2014-11-05 07:41:17 +01:00			`if self.keep:`
Use yield 2014-11-06 06:44:23 +01:00			`self.token += nextchar`
			`self.escapedstate = self.state`
			`self.state = nextchar`
Initial shlex fork 2014-11-03 21:27:07 +01:00			`else:`
Use yield 2014-11-06 06:44:23 +01:00			`self.token += nextchar`
			`elif self.state in self.escape:`
Initial shlex fork 2014-11-03 21:27:07 +01:00			`# In posix shells, only the quote itself or the escape`
			`# character may be escaped within quotes.`
Use yield 2014-11-06 06:44:23 +01:00			`if (self.escapedstate in self.quotes and`
			`nextchar != self.state and`
			`nextchar != self.escapedstate and not self.keep):`
			`self.token += self.state`
			`self.token += nextchar`
			`self.state = self.escapedstate`
			`elif self.state == 'a':`
			`if nextchar in self.whitespace:`
			`self.state = ' '`
Remove dead ShellLexer code. 2015-08-02 13:05:19 +02:00			`assert self.token or self.quoted`
			`yield self.token`
			`self.reset()`
Try splitting with whitespace at next token. 2014-11-06 07:14:36 +01:00			`if self.keep:`
			`yield nextchar`
shlex: Assume posix=True 2014-11-03 21:35:47 +01:00			`elif nextchar in self.quotes:`
Add a keep-mode to shlexer. 2014-11-05 07:41:17 +01:00			`if self.keep:`
Use yield 2014-11-06 06:44:23 +01:00			`self.token += nextchar`
			`self.state = nextchar`
shlex: Assume posix=True 2014-11-03 21:35:47 +01:00			`elif nextchar in self.escape:`
Add a keep-mode to shlexer. 2014-11-05 07:41:17 +01:00			`if self.keep:`
Use yield 2014-11-06 06:44:23 +01:00			`self.token += nextchar`
			`self.escapedstate = 'a'`
			`self.state = nextchar`
Initial shlex fork 2014-11-03 21:27:07 +01:00			`else:`
Use yield 2014-11-06 06:44:23 +01:00			`self.token += nextchar`
Make sure state is valid in ShellLexer. 2015-08-02 12:44:54 +02:00			`else:`
			`raise AssertionError("Invalid state {!r}!".format(self.state))`
Use a for loop 2014-11-06 09:02:21 +01:00			`if self.state in self.escape and not self.keep:`
			`self.token += self.state`
			`if self.token or self.quoted:`
			`yield self.token`
Initial shlex fork 2014-11-03 21:27:07 +01:00

Add a keep-mode to shlexer. 2014-11-05 07:41:17 +01:00			`def split(s, keep=False):`
			`"""Split a string via ShellLexer.`

			`Args:`
Fix small typo in docstring 2015-04-09 23:47:25 +02:00			`keep: Whether to keep special chars in the split output.`
Add a keep-mode to shlexer. 2014-11-05 07:41:17 +01:00			`"""`
Handle safe parsing directly in ShellLexer 2014-11-04 20:24:42 +01:00			`lexer = ShellLexer(s)`
Add a keep-mode to shlexer. 2014-11-05 07:41:17 +01:00			`lexer.keep = keep`
Fix some tests 2014-11-05 21:42:27 +01:00			`tokens = list(lexer)`
Use yield 2014-11-06 06:44:23 +01:00			`if not tokens:`
			`return []`
Fix some tests 2014-11-05 21:42:27 +01:00			`out = []`
Try splitting with whitespace at next token. 2014-11-06 07:14:36 +01:00			`spaces = ""`

			`log.shlexer.vdebug("{!r} -> {!r}".format(s, tokens))`

Fix some tests 2014-11-05 21:42:27 +01:00			`for t in tokens:`
			`if t.isspace():`
Try splitting with whitespace at next token. 2014-11-06 07:14:36 +01:00			`spaces += t`
Fix some tests 2014-11-05 21:42:27 +01:00			`else:`
Try splitting with whitespace at next token. 2014-11-06 07:14:36 +01:00			`out.append(spaces + t)`
			`spaces = ""`
Fix splitting of whitespace at EOL 2014-11-06 08:25:46 +01:00			`if spaces:`
			`out.append(spaces)`

Fix some tests 2014-11-05 21:42:27 +01:00			`return out`
Split commands with split=False correctly. Fixes #231. 2014-11-09 20:46:21 +01:00

Clean up simple_split. 2014-11-10 23:02:34 +01:00			`def _combine_ws(parts, whitespace):`
			`"""Combine whitespace in a list with the element following it.`

			`Args:`
			`parts: A list of strings.`
			`whitespace: A string containing what's considered whitespace.`

			`Return:`
			`The modified list.`
			`"""`
			`out = []`
			`ws = ''`
			`for part in parts:`
			`if not part:`
			`continue`
			`elif part in whitespace:`
			`ws += part`
			`else:`
			`out.append(ws + part)`
			`ws = ''`
			`if ws:`
			`out.append(ws)`
			`return out`


Fix splitting with split=False and maxsplit=0. Since re.split behaves differently from str.split with maxsplit=0, 4e9b9baeab425f5313db1c50063b4b9d9f58c89e broke things like ":open foo bar". 2014-11-10 10:38:19 +01:00			`def simple_split(s, keep=False, maxsplit=None):`
Split commands with split=False correctly. Fixes #231. 2014-11-09 20:46:21 +01:00			`"""Split a string on whitespace, optionally keeping the whitespace.`

			`Args:`
			`s: The string to split.`
			`keep: Whether to keep whitespace.`
			`maxsplit: The maximum count of splits.`

			`Return:`
			`A list of split strings.`
			`"""`
			`whitespace = '\n\t '`
Fix splitting with split=False and maxsplit=0. Since re.split behaves differently from str.split with maxsplit=0, 4e9b9baeab425f5313db1c50063b4b9d9f58c89e broke things like ":open foo bar". 2014-11-10 10:38:19 +01:00			`if maxsplit == 0:`
			`# re.split with maxsplit=0 splits everything, while str.split splits`
Various spelling fixes. 2015-03-31 20:49:29 +02:00			`# nothing (which is the behavior we want).`
Fix splitting with split=False and maxsplit=0. Since re.split behaves differently from str.split with maxsplit=0, 4e9b9baeab425f5313db1c50063b4b9d9f58c89e broke things like ":open foo bar". 2014-11-10 10:38:19 +01:00			`if keep:`
			`return [s]`
			`else:`
Fix ws stripping with simple_split and no flags. Fixes #279. 2014-12-12 17:29:01 +01:00			`return [s.strip(whitespace)]`
Fix splitting with split=False and maxsplit=0. Since re.split behaves differently from str.split with maxsplit=0, 4e9b9baeab425f5313db1c50063b4b9d9f58c89e broke things like ":open foo bar". 2014-11-10 10:38:19 +01:00			`elif maxsplit is None:`
			`maxsplit = 0`

Split commands with split=False correctly. Fixes #231. 2014-11-09 20:46:21 +01:00			`if keep:`
			`pattern = '([' + whitespace + '])'`
Clean up simple_split. 2014-11-10 23:02:34 +01:00			`parts = re.split(pattern, s, maxsplit)`
			`return _combine_ws(parts, whitespace)`
Split commands with split=False correctly. Fixes #231. 2014-11-09 20:46:21 +01:00			`else:`
			`pattern = '[' + whitespace + ']'`
Clean up simple_split. 2014-11-10 23:02:34 +01:00			`parts = re.split(pattern, s, maxsplit)`
Remove trailing spaces in simple_split with keep=False. Fixes #279. 2014-11-30 18:59:51 +01:00			`parts[-1] = parts[-1].rstrip()`
Clean up simple_split. 2014-11-10 23:02:34 +01:00			`return [p for p in parts if p]`