qutebrowser/qutebrowser/utils/split.py

# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et:

# Copyright 2014 Florian Bruhin (The Compiler) <mail@qutebrowser.org>
#
# This file is part of qutebrowser.
#
# qutebrowser is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# qutebrowser is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with qutebrowser.  If not, see <http://www.gnu.org/licenses/>.

"""Our own fork of shlex.split with some added and removed features."""

from qutebrowser.utils import log


class ShellLexer:

    """A lexical analyzer class for simple shell-like syntaxes.

    Based on Python's shlex, but cleaned up, removed some features, and added
    some features useful for qutebrowser.

    Attributes:
        iterator: Iterator over the input (character by character for a
                  str).
        whitespace: Characters which separate tokens (space, tab, CR).
        quotes: Characters which open/close a quoted section.
        escape: The escape character(s) (backslash).
        escapedquotes: Quote characters inside which escapes are processed.
        keep: Whether special characters (quotes, escapes, whitespace) are
              kept in the emitted tokens instead of being dropped.
    """

    def __init__(self, s):
        """Set up the lexer to read from s with the default character sets."""
        self.iterator = iter(s)
        self.whitespace = ' \t\r'
        self.quotes = '\'"'
        self.escape = '\\'
        self.escapedquotes = '"'
        self.keep = False

    def read_token(self):
        """Read a raw token from the input stream.

        The lexer is a small state machine; the state is stored as a
        one-character string:

        - ' ': initial state, no character of the token consumed yet.
        - 'a': inside an unquoted word.
        - a character from self.quotes: inside a section opened by that quote.
        - a character from self.escape: right after an escape character;
          escapedstate holds the state to go back to afterwards.

        Return:
            The token as a string, or None if the input ended without
            producing a token. An empty quoted section yields ''.
        """
        quoted = False
        escapedstate = ' '
        token = ''
        state = ' '
        while True:
            # None is used as the end-of-input marker.
            try:
                nextchar = next(self.iterator)
            except StopIteration:
                nextchar = None
            log.shlexer.vdebug("in state {!r} I see character: {!r}".format(
                state, nextchar))
            if state == ' ':
                if nextchar is None:
                    break
                elif nextchar in self.whitespace:
                    log.shlexer.vdebug("I see whitespace in whitespace state")
                    # In keep mode the whitespace character is added to the
                    # token and therefore emitted; otherwise it is skipped.
                    if self.keep:
                        token += nextchar
                    if token or quoted:
                        # emit current token
                        break
                    else:
                        continue
                elif nextchar in self.escape:
                    if self.keep:
                        token += nextchar
                    # After the escaped character we continue in word state.
                    escapedstate = 'a'
                    state = nextchar
                elif nextchar in self.quotes:
                    if self.keep:
                        token += nextchar
                    # The quote character itself is used as the state.
                    state = nextchar
                else:
                    token = nextchar
                    state = 'a'
            elif state in self.quotes:
                # Remember that we saw quotes so an empty quoted section is
                # still returned as '' rather than None.
                quoted = True
                if nextchar is None:
                    # Unterminated quote: return what was collected so far.
                    log.shlexer.vdebug("I see EOF in quotes state")
                    break
                if nextchar == state:
                    # Closing quote, go on in word state.
                    if self.keep:
                        token += nextchar
                    state = 'a'
                elif (nextchar in self.escape and
                        state in self.escapedquotes):
                    # Escapes are only processed inside quotes listed in
                    # escapedquotes; elsewhere the character is taken
                    # literally by the else branch below.
                    if self.keep:
                        token += nextchar
                    escapedstate = state
                    state = nextchar
                else:
                    token += nextchar
            elif state in self.escape:
                if nextchar is None:
                    log.shlexer.vdebug("I see EOF in escape state")
                    # Trailing escape character: keep it literally. In keep
                    # mode it has already been added to the token.
                    if not self.keep:
                        token += state
                    break
                # In posix shells, only the quote itself or the escape
                # character may be escaped within quotes.
                # For any other character the escape character is kept
                # literally (in keep mode it is already part of the token).
                if (escapedstate in self.quotes and nextchar != state and
                        nextchar != escapedstate and not self.keep):
                    token += state
                token += nextchar
                state = escapedstate
            elif state == 'a':
                if nextchar is None:
                    break
                elif nextchar in self.whitespace:
                    log.shlexer.vdebug("shlex: I see whitespace in word state")
                    state = ' '
                    # In keep mode the terminating whitespace character stays
                    # attached to the end of the token.
                    if self.keep:
                        token += nextchar
                    if token or quoted:
                        break   # emit current token
                    else:
                        continue
                elif nextchar in self.quotes:
                    if self.keep:
                        token += nextchar
                    state = nextchar
                elif nextchar in self.escape:
                    if self.keep:
                        token += nextchar
                    escapedstate = 'a'
                    state = nextchar
                else:
                    token += nextchar
        # Nothing was read and no quotes were seen: signal the end of input.
        if not quoted and token == '':
            token = None
        log.shlexer.vdebug("token={!r}".format(token))
        return token

    def __iter__(self):
        """Yield tokens from read_token until it returns None."""
        while True:
            token = self.read_token()
            if token is None:
                return
            else:
                yield token


def split(s, keep=False):
    """Split a string via ShellLexer.

    Tokens consisting only of whitespace are not returned on their own: a
    whitespace token at the very start is glued in front of the token
    following it, any other whitespace token is appended to the token before
    it.

    Args:
        s: The string to split.
        keep: Whether to keep special chars in the split output.

    Return:
        A list of strings; empty if the lexer did not produce any token.
    """
    lexer = ShellLexer(s)
    lexer.keep = keep
    tokens = list(lexer)
    out = []
    # Guard against an empty token list (e.g. empty input), which would
    # otherwise raise an IndexError on tokens[0].
    if tokens and tokens[0].isspace():
        # Joining the slice instead of indexing tokens[1] also copes with a
        # lone whitespace token having nothing after it.
        out.append(''.join(tokens[:2]))
        tokens = tokens[2:]
    for t in tokens:
        if t.isspace():
            # out is never empty here: either the branch above filled it, or
            # the first token was not whitespace and has been appended below.
            out[-1] += t
        else:
            out.append(t)
    return out
Initial shlex fork 2014-11-03 21:27:07 +01:00			`# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et:`

			`# Copyright 2014 Florian Bruhin (The Compiler) <mail@qutebrowser.org>`
			`#`
			`# This file is part of qutebrowser.`
			`#`
			`# qutebrowser is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# qutebrowser is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with qutebrowser. If not, see <http://www.gnu.org/licenses/>.`

			`"""Our own fork of shlex.split with some added and removed features."""`

Use logging for debug 2014-11-04 20:06:58 +01:00			`from qutebrowser.utils import log`

Initial shlex fork 2014-11-03 21:27:07 +01:00
			`class ShellLexer:`
shlex: Some lint cleanup 2014-11-03 21:43:34 +01:00
			`"""A lexical analyzer class for simple shell-like syntaxes.`

			`Based on Python's shlex, but cleaned up, removed some features, and added`
			`some features useful for qutebrowser.`

			`Attributes:`
			`FIXME`
			`"""`

			`def __init__(self, s):`
Use iterator to read chars 2014-11-04 20:41:29 +01:00			`self.iterator = iter(s)`
Remove newline test 2014-11-05 21:44:52 +01:00			`self.whitespace = ' \t\r'`
Initial shlex fork 2014-11-03 21:27:07 +01:00			`self.quotes = '\'"'`
			`self.escape = '\\'`
			`self.escapedquotes = '"'`
Add a keep-mode to shlexer. 2014-11-05 07:41:17 +01:00			`self.keep = False`
Initial shlex fork 2014-11-03 21:27:07 +01:00
			`def read_token(self):`
shlex: Some lint cleanup 2014-11-03 21:43:34 +01:00			`"""Read a raw token from the input stream."""`
Initial shlex fork 2014-11-03 21:27:07 +01:00			`quoted = False`
			`escapedstate = ' '`
cleanup 2014-11-05 23:47:48 +01:00			`token = ''`
			`state = ' '`
Initial shlex fork 2014-11-03 21:27:07 +01:00			`while True:`
Use iterator to read chars 2014-11-04 20:41:29 +01:00			`try:`
			`nextchar = next(self.iterator)`
			`except StopIteration:`
			`nextchar = None`
Use logging for debug 2014-11-04 20:06:58 +01:00			`log.shlexer.vdebug("in state {!r} I see character: {!r}".format(`
cleanup 2014-11-05 23:47:48 +01:00			`state, nextchar))`
cleanup state=None 2014-11-05 23:48:57 +01:00			`if state == ' ':`
Use iterator to read chars 2014-11-04 20:41:29 +01:00			`if nextchar is None:`
Initial shlex fork 2014-11-03 21:27:07 +01:00			`break`
			`elif nextchar in self.whitespace:`
Use logging for debug 2014-11-04 20:06:58 +01:00			`log.shlexer.vdebug("I see whitespace in whitespace state")`
Add a keep-mode to shlexer. 2014-11-05 07:41:17 +01:00			`if self.keep:`
cleanup 2014-11-05 23:47:48 +01:00			`token += nextchar`
			`if token or quoted:`
cleanup 2014-11-04 21:06:43 +01:00			`# emit current token`
			`break`
Initial shlex fork 2014-11-03 21:27:07 +01:00			`else:`
			`continue`
shlex: Assume posix=True 2014-11-03 21:35:47 +01:00			`elif nextchar in self.escape:`
Add a keep-mode to shlexer. 2014-11-05 07:41:17 +01:00			`if self.keep:`
cleanup 2014-11-05 23:47:48 +01:00			`token += nextchar`
Initial shlex fork 2014-11-03 21:27:07 +01:00			`escapedstate = 'a'`
cleanup 2014-11-05 23:47:48 +01:00			`state = nextchar`
Initial shlex fork 2014-11-03 21:27:07 +01:00			`elif nextchar in self.quotes:`
Add a keep-mode to shlexer. 2014-11-05 07:41:17 +01:00			`if self.keep:`
cleanup 2014-11-05 23:47:48 +01:00			`token += nextchar`
			`state = nextchar`
Initial shlex fork 2014-11-03 21:27:07 +01:00			`else:`
cleanup 2014-11-05 23:47:48 +01:00			`token = nextchar`
			`state = 'a'`
			`elif state in self.quotes:`
Initial shlex fork 2014-11-03 21:27:07 +01:00			`quoted = True`
cleanup 2014-11-04 21:06:43 +01:00			`if nextchar is None:`
Use logging for debug 2014-11-04 20:06:58 +01:00			`log.shlexer.vdebug("I see EOF in quotes state")`
Handle safe parsing directly in ShellLexer 2014-11-04 20:24:42 +01:00			`break`
cleanup 2014-11-05 23:47:48 +01:00			`if nextchar == state:`
Add a keep-mode to shlexer. 2014-11-05 07:41:17 +01:00			`if self.keep:`
cleanup 2014-11-05 23:47:48 +01:00			`token += nextchar`
			`state = 'a'`
shlex: Assume posix=True 2014-11-03 21:35:47 +01:00			`elif (nextchar in self.escape and`
cleanup 2014-11-05 23:47:48 +01:00			`state in self.escapedquotes):`
Add a keep-mode to shlexer. 2014-11-05 07:41:17 +01:00			`if self.keep:`
cleanup 2014-11-05 23:47:48 +01:00			`token += nextchar`
			`escapedstate = state`
			`state = nextchar`
Initial shlex fork 2014-11-03 21:27:07 +01:00			`else:`
cleanup 2014-11-05 23:47:48 +01:00			`token += nextchar`
			`elif state in self.escape:`
cleanup 2014-11-04 21:06:43 +01:00			`if nextchar is None:`
Use logging for debug 2014-11-04 20:06:58 +01:00			`log.shlexer.vdebug("I see EOF in escape state")`
Add a keep-mode to shlexer. 2014-11-05 07:41:17 +01:00			`if not self.keep:`
cleanup 2014-11-05 23:47:48 +01:00			`token += state`
Handle safe parsing directly in ShellLexer 2014-11-04 20:24:42 +01:00			`break`
Initial shlex fork 2014-11-03 21:27:07 +01:00			`# In posix shells, only the quote itself or the escape`
			`# character may be escaped within quotes.`
cleanup 2014-11-05 23:47:48 +01:00			`if (escapedstate in self.quotes and nextchar != state and`
Add a keep-mode to shlexer. 2014-11-05 07:41:17 +01:00			`nextchar != escapedstate and not self.keep):`
cleanup 2014-11-05 23:47:48 +01:00			`token += state`
			`token += nextchar`
			`state = escapedstate`
			`elif state == 'a':`
Use iterator to read chars 2014-11-04 20:41:29 +01:00			`if nextchar is None:`
Initial shlex fork 2014-11-03 21:27:07 +01:00			`break`
			`elif nextchar in self.whitespace:`
Use logging for debug 2014-11-04 20:06:58 +01:00			`log.shlexer.vdebug("shlex: I see whitespace in word state")`
cleanup 2014-11-05 23:47:48 +01:00			`state = ' '`
Add a keep-mode to shlexer. 2014-11-05 07:41:17 +01:00			`if self.keep:`
cleanup 2014-11-05 23:47:48 +01:00			`token += nextchar`
			`if token or quoted:`
Initial shlex fork 2014-11-03 21:27:07 +01:00			`break # emit current token`
			`else:`
			`continue`
shlex: Assume posix=True 2014-11-03 21:35:47 +01:00			`elif nextchar in self.quotes:`
Add a keep-mode to shlexer. 2014-11-05 07:41:17 +01:00			`if self.keep:`
cleanup 2014-11-05 23:47:48 +01:00			`token += nextchar`
			`state = nextchar`
shlex: Assume posix=True 2014-11-03 21:35:47 +01:00			`elif nextchar in self.escape:`
Add a keep-mode to shlexer. 2014-11-05 07:41:17 +01:00			`if self.keep:`
cleanup 2014-11-05 23:47:48 +01:00			`token += nextchar`
Initial shlex fork 2014-11-03 21:27:07 +01:00			`escapedstate = 'a'`
cleanup 2014-11-05 23:47:48 +01:00			`state = nextchar`
Initial shlex fork 2014-11-03 21:27:07 +01:00			`else:`
cleanup 2014-11-05 23:47:48 +01:00			`token += nextchar`
			`if not quoted and token == '':`
			`token = None`
			`log.shlexer.vdebug("token={!r}".format(token))`
			`return token`
Initial shlex fork 2014-11-03 21:27:07 +01:00
			`def __iter__(self):`
cleanup 2014-11-05 23:47:48 +01:00			`while True:`
			`token = self.read_token()`
			`if token is None:`
			`return`
			`else:`
			`yield token`
Initial shlex fork 2014-11-03 21:27:07 +01:00

Add a keep-mode to shlexer. 2014-11-05 07:41:17 +01:00			`def split(s, keep=False):`
			`"""Split a string via ShellLexer.`

			`Args:`
			`keep: Whether to keep are special chars in the split output.`
			`"""`
Handle safe parsing directly in ShellLexer 2014-11-04 20:24:42 +01:00			`lexer = ShellLexer(s)`
Add a keep-mode to shlexer. 2014-11-05 07:41:17 +01:00			`lexer.keep = keep`
Fix some tests 2014-11-05 21:42:27 +01:00			`tokens = list(lexer)`
			`out = []`
			`if tokens[0].isspace():`
			`out.append(tokens[0] + tokens[1])`
			`tokens = tokens[2:]`
			`for t in tokens:`
			`if t.isspace():`
			`out[-1] += t`
			`else:`
			`out.append(t)`
			`return out`