From 3fa8efc34b089ef6a4b89012eda4bba714d1f776 Mon Sep 17 00:00:00 2001 From: Florian Bruhin Date: Mon, 3 Nov 2014 21:27:07 +0100 Subject: [PATCH] Initial shlex fork --- qutebrowser/commands/runners.py | 4 +- qutebrowser/test/utils/test_split.py | 76 ++++++ qutebrowser/test/utils/test_utils.py | 50 ---- qutebrowser/utils/split.py | 331 +++++++++++++++++++++++++++ qutebrowser/utils/utils.py | 49 ---- 5 files changed, 409 insertions(+), 101 deletions(-) create mode 100644 qutebrowser/test/utils/test_split.py create mode 100644 qutebrowser/utils/split.py diff --git a/qutebrowser/commands/runners.py b/qutebrowser/commands/runners.py index c28c0b96f..f053a710b 100644 --- a/qutebrowser/commands/runners.py +++ b/qutebrowser/commands/runners.py @@ -24,7 +24,7 @@ from PyQt5.QtWebKitWidgets import QWebPage from qutebrowser.config import config from qutebrowser.commands import cmdexc, cmdutils -from qutebrowser.utils import message, log, utils, objreg +from qutebrowser.utils import message, log, utils, objreg, split def replace_variables(win_id, arglist): @@ -238,7 +238,7 @@ class CommandRunner(QObject): if argstr is None: self._args = [] elif self._cmd.split: - self._args = utils.safe_shlex_split(argstr) + self._args = split.split(argstr) else: # If split=False, we still want to split the flags, but not # everything after that. diff --git a/qutebrowser/test/utils/test_split.py b/qutebrowser/test/utils/test_split.py new file mode 100644 index 000000000..139af6c86 --- /dev/null +++ b/qutebrowser/test/utils/test_split.py @@ -0,0 +1,76 @@ +# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et: + +# Copyright 2014 Florian Bruhin (The Compiler) +# +# This file is part of qutebrowser. +# +# qutebrowser is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# qutebrowser is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with qutebrowser. If not, see <http://www.gnu.org/licenses/>. + +"""Tests for qutebrowser.utils.split.""" + +import unittest + +from qutebrowser.utils import split + + +class SplitTests(unittest.TestCase): + + """Test split.""" + + def test_normal(self): + """Test split with a simple string.""" + items = split.split('one two') + self.assertEqual(items, ['one', 'two']) + + def test_quoted(self): + """Test split with a normally quoted string.""" + items = split.split('one "two three" four') + self.assertEqual(items, ['one', 'two three', 'four']) + + def test_single_quoted(self): + """Test split with a single quoted string.""" + items = split.split("one 'two three' four") + self.assertEqual(items, ['one', 'two three', 'four']) + + def test_escaped(self): + """Test split with a normal escaped string.""" + items = split.split(r'one "two\" three" four') + self.assertEqual(items, ['one', 'two" three', 'four']) + + def test_escaped_single(self): + """Test split with a single escaped string.""" + items = split.split(r"one 'two'\'' three' four") + self.assertEqual(items, ['one', "two' three", 'four']) + + def test_unbalanced_quotes(self): + """Test split with unbalanced quotes.""" + items = split.split(r'one "two three') + self.assertEqual(items, ['one', 'two three']) + + def test_unbalanced_single_quotes(self): + """Test split with unbalanced single quotes.""" + items = split.split(r"one 'two three") + self.assertEqual(items, ['one', "two three"]) + + def test_unfinished_escape(self): + """Test split with an unfinished escape.""" + items = split.split('one\\') + self.assertEqual(items, ['one\\']) + + def test_both(self): + """Test split with an unfinished escape and quotes.""" + 
items = split.split('one "two\\') + self.assertEqual(items, ['one', 'two\\']) + + diff --git a/qutebrowser/test/utils/test_utils.py b/qutebrowser/test/utils/test_utils.py index 470cd449a..6b8ad1553 100644 --- a/qutebrowser/test/utils/test_utils.py +++ b/qutebrowser/test/utils/test_utils.py @@ -110,56 +110,6 @@ class DottedGetattrTests(unittest.TestCase): _ = utils.dotted_getattr(self, 'test.foo.baz') -class SafeShlexSplitTests(unittest.TestCase): - - """Test safe_shlex_split.""" - - def test_normal(self): - """Test safe_shlex_split with a simple string.""" - items = utils.safe_shlex_split('one two') - self.assertEqual(items, ['one', 'two']) - - def test_quoted(self): - """Test safe_shlex_split with a normally quoted string.""" - items = utils.safe_shlex_split('one "two three" four') - self.assertEqual(items, ['one', 'two three', 'four']) - - def test_single_quoted(self): - """Test safe_shlex_split with a single quoted string.""" - items = utils.safe_shlex_split("one 'two three' four") - self.assertEqual(items, ['one', 'two three', 'four']) - - def test_escaped(self): - """Test safe_shlex_split with a normal escaped string.""" - items = utils.safe_shlex_split(r'one "two\" three" four') - self.assertEqual(items, ['one', 'two" three', 'four']) - - def test_escaped_single(self): - """Test safe_shlex_split with a single escaped string.""" - items = utils.safe_shlex_split(r"one 'two'\'' three' four") - self.assertEqual(items, ['one', "two' three", 'four']) - - def test_unbalanced_quotes(self): - """Test safe_shlex_split with unbalanded quotes.""" - items = utils.safe_shlex_split(r'one "two three') - self.assertEqual(items, ['one', 'two three']) - - def test_unbalanced_single_quotes(self): - """Test safe_shlex_split with unbalanded single quotes.""" - items = utils.safe_shlex_split(r"one 'two three") - self.assertEqual(items, ['one', "two three"]) - - def test_unfinished_escape(self): - """Test safe_shlex_split with an unfinished escape.""" - items = 
utils.safe_shlex_split('one\\') - self.assertEqual(items, ['one\\']) - - def test_both(self): - """Test safe_shlex_split with an unfinished escape and quotes..""" - items = utils.safe_shlex_split('one "two\\') - self.assertEqual(items, ['one', 'two\\']) - - class InterpolateColorTests(unittest.TestCase): """Tests for interpolate_color. diff --git a/qutebrowser/utils/split.py b/qutebrowser/utils/split.py new file mode 100644 index 000000000..b201aeef2 --- /dev/null +++ b/qutebrowser/utils/split.py @@ -0,0 +1,331 @@ +# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et: + +# Copyright 2014 Florian Bruhin (The Compiler) +# +# This file is part of qutebrowser. +# +# qutebrowser is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# qutebrowser is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with qutebrowser. If not, see <http://www.gnu.org/licenses/>. + +"""Our own fork of shlex.split with some added and removed features.""" + +import os +import re +import sys +from collections import deque + +from io import StringIO + + +class ShellLexer: + "A lexical analyzer class for simple shell-like syntaxes." 
+ def __init__(self, instream=None, infile=None, posix=False): + if isinstance(instream, str): + instream = StringIO(instream) + if instream is not None: + self.instream = instream + self.infile = infile + else: + self.instream = sys.stdin + self.infile = None + self.posix = posix + if posix: + self.eof = None + else: + self.eof = '' + self.commenters = '#' + self.wordchars = ('abcdfeghijklmnopqrstuvwxyz' + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_') + if self.posix: + self.wordchars += ('ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ' + 'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ') + self.whitespace = ' \t\r\n' + self.whitespace_split = False + self.quotes = '\'"' + self.escape = '\\' + self.escapedquotes = '"' + self.state = ' ' + self.pushback = deque() + self.lineno = 1 + self.debug = 0 + self.token = '' + self.filestack = deque() + self.source = None + if self.debug: + print('shlex: reading from %s, line %d' \ + % (self.instream, self.lineno)) + + def push_token(self, tok): + "Push a token onto the stack popped by the get_token method" + if self.debug >= 1: + print("shlex: pushing token " + repr(tok)) + self.pushback.appendleft(tok) + + def push_source(self, newstream, newfile=None): + "Push an input source onto the lexer's input source stack." + if isinstance(newstream, str): + newstream = StringIO(newstream) + self.filestack.appendleft((self.infile, self.instream, self.lineno)) + self.infile = newfile + self.instream = newstream + self.lineno = 1 + if self.debug: + if newfile is not None: + print('shlex: pushing to file %s' % (self.infile,)) + else: + print('shlex: pushing to stream %s' % (self.instream,)) + + def pop_source(self): + "Pop the input source stack." 
+ self.instream.close() + (self.infile, self.instream, self.lineno) = self.filestack.popleft() + if self.debug: + print('shlex: popping to %s, line %d' \ + % (self.instream, self.lineno)) + self.state = ' ' + + def get_token(self): + "Get a token from the input stream (or from stack if it's nonempty)" + if self.pushback: + tok = self.pushback.popleft() + if self.debug >= 1: + print("shlex: popping token " + repr(tok)) + return tok + # No pushback. Get a token. + raw = self.read_token() + # Handle inclusions + if self.source is not None: + while raw == self.source: + spec = self.sourcehook(self.read_token()) + if spec: + (newfile, newstream) = spec + self.push_source(newstream, newfile) + raw = self.get_token() + # Maybe we got EOF instead? + while raw == self.eof: + if not self.filestack: + return self.eof + else: + self.pop_source() + raw = self.get_token() + # Neither inclusion nor EOF + if self.debug >= 1: + if raw != self.eof: + print("shlex: token=" + repr(raw)) + else: + print("shlex: token=EOF") + return raw + + def read_token(self): + quoted = False + escapedstate = ' ' + while True: + nextchar = self.instream.read(1) + if nextchar == '\n': + self.lineno = self.lineno + 1 + if self.debug >= 3: + print("shlex: in state", repr(self.state), \ + "I see character:", repr(nextchar)) + if self.state is None: + self.token = '' # past end of file + break + elif self.state == ' ': + if not nextchar: + self.state = None # end of file + break + elif nextchar in self.whitespace: + if self.debug >= 2: + print("shlex: I see whitespace in whitespace state") + if self.token or (self.posix and quoted): + break # emit current token + else: + continue + elif nextchar in self.commenters: + self.instream.readline() + self.lineno = self.lineno + 1 + elif self.posix and nextchar in self.escape: + escapedstate = 'a' + self.state = nextchar + elif nextchar in self.wordchars: + self.token = nextchar + self.state = 'a' + elif nextchar in self.quotes: + if not self.posix: + self.token 
= nextchar + self.state = nextchar + elif self.whitespace_split: + self.token = nextchar + self.state = 'a' + else: + self.token = nextchar + if self.token or (self.posix and quoted): + break # emit current token + else: + continue + elif self.state in self.quotes: + quoted = True + if not nextchar: # end of file + if self.debug >= 2: + print("shlex: I see EOF in quotes state") + # XXX what error should be raised here? + raise ValueError("No closing quotation") + if nextchar == self.state: + if not self.posix: + self.token = self.token + nextchar + self.state = ' ' + break + else: + self.state = 'a' + elif self.posix and nextchar in self.escape and \ + self.state in self.escapedquotes: + escapedstate = self.state + self.state = nextchar + else: + self.token = self.token + nextchar + elif self.state in self.escape: + if not nextchar: # end of file + if self.debug >= 2: + print("shlex: I see EOF in escape state") + # XXX what error should be raised here? + raise ValueError("No escaped character") + # In posix shells, only the quote itself or the escape + # character may be escaped within quotes. 
+ if escapedstate in self.quotes and \ + nextchar != self.state and nextchar != escapedstate: + self.token = self.token + self.state + self.token = self.token + nextchar + self.state = escapedstate + elif self.state == 'a': + if not nextchar: + self.state = None # end of file + break + elif nextchar in self.whitespace: + if self.debug >= 2: + print("shlex: I see whitespace in word state") + self.state = ' ' + if self.token or (self.posix and quoted): + break # emit current token + else: + continue + elif nextchar in self.commenters: + self.instream.readline() + self.lineno = self.lineno + 1 + if self.posix: + self.state = ' ' + if self.token or (self.posix and quoted): + break # emit current token + else: + continue + elif self.posix and nextchar in self.quotes: + self.state = nextchar + elif self.posix and nextchar in self.escape: + escapedstate = 'a' + self.state = nextchar + elif nextchar in self.wordchars or nextchar in self.quotes \ + or self.whitespace_split: + self.token = self.token + nextchar + else: + self.pushback.appendleft(nextchar) + if self.debug >= 2: + print("shlex: I see punctuation in word state") + self.state = ' ' + if self.token: + break # emit current token + else: + continue + result = self.token + self.token = '' + if self.posix and not quoted and result == '': + result = None + if self.debug > 1: + if result: + print("shlex: raw token=" + repr(result)) + else: + print("shlex: raw token=EOF") + return result + + def sourcehook(self, newfile): + "Hook called on a filename to be sourced." + if newfile[0] == '"': + newfile = newfile[1:-1] + # This implements cpp-like semantics for relative-path inclusion. + if isinstance(self.infile, str) and not os.path.isabs(newfile): + newfile = os.path.join(os.path.dirname(self.infile), newfile) + return (newfile, open(newfile, "r")) + + def error_leader(self, infile=None, lineno=None): + "Emit a C-compiler-like, Emacs-friendly error-message leader." 
+ if infile is None: + infile = self.infile + if lineno is None: + lineno = self.lineno + return "\"%s\", line %d: " % (infile, lineno) + + def __iter__(self): + return self + + def __next__(self): + token = self.get_token() + if token == self.eof: + raise StopIteration + return token + + +def _get_lexer(s): + """Get an shlex lexer for split.""" + if s is None: + raise TypeError("Refusing to create a lexer with s=None!") + lexer = ShellLexer(s, posix=True) + lexer.whitespace_split = True + lexer.commenters = '' + return lexer + + +def split(s): + r"""Split a string via shlex safely (don't bail out on unbalanced quotes). + + We split while the user is typing (for completion), and as + soon as ", ' or \ is typed, the string is invalid for shlex, + because it encounters EOF while in quote/escape state. + + Here we fix this error temporarily so shlex doesn't blow up, + and then retry splitting again. + + Since shlex raises ValueError in both cases we unfortunately + have to parse the exception string... + + We try 3 times so multiple errors can be fixed. + """ + orig_s = s + for i in range(3): + lexer = _get_lexer(s) + try: + tokens = list(lexer) + except ValueError as e: + if str(e) not in ("No closing quotation", "No escaped character"): + raise + # eggs "bacon ham -> eggs "bacon ham" + # eggs\ -> eggs\\ + if lexer.state not in lexer.escape + lexer.quotes: + raise AssertionError( + "Lexer state is >{}< while parsing >{}< (attempted fixup: " + ">{}<)".format(lexer.state, orig_s, s)) + s += lexer.state + else: + return tokens + # We should never arrive here. + raise AssertionError( + "Gave up splitting >{}< after {} tries. 
Attempted fixup: >{}<.".format( + orig_s, i, s)) # pylint: disable=undefined-loop-variable + + diff --git a/qutebrowser/utils/utils.py b/qutebrowser/utils/utils.py index 3a96bc09d..1dd13ee77 100644 --- a/qutebrowser/utils/utils.py +++ b/qutebrowser/utils/utils.py @@ -22,7 +22,6 @@ import io import sys import enum -import shlex import inspect import os.path import urllib.request @@ -99,54 +98,6 @@ def dotted_getattr(obj, path): return functools.reduce(getattr, path.split('.'), obj) -def _get_lexer(s): - """Get an shlex lexer for safe_shlex_split.""" - if s is None: - raise TypeError("Refusing to create a lexer with s=None!") - lexer = shlex.shlex(s, posix=True) - lexer.whitespace_split = True - lexer.commenters = '' - return lexer - - -def safe_shlex_split(s): - r"""Split a string via shlex safely (don't bail out on unbalanced quotes). - - We split while the user is typing (for completion), and as - soon as ", ' or \ is typed, the string is invalid for shlex, - because it encounters EOF while in quote/escape state. - - Here we fix this error temporarily so shlex doesn't blow up, - and then retry splitting again. - - Since shlex raises ValueError in both cases we unfortunately - have to parse the exception string... - - We try 3 times so multiple errors can be fixed. - """ - orig_s = s - for i in range(3): - lexer = _get_lexer(s) - try: - tokens = list(lexer) - except ValueError as e: - if str(e) not in ("No closing quotation", "No escaped character"): - raise - # eggs "bacon ham -> eggs "bacon ham" - # eggs\ -> eggs\\ - if lexer.state not in lexer.escape + lexer.quotes: - raise AssertionError( - "Lexer state is >{}< while parsing >{}< (attempted fixup: " - ">{}<)".format(lexer.state, orig_s, s)) - s += lexer.state - else: - return tokens - # We should never arrive here. - raise AssertionError( - "Gave up splitting >{}< after {} tries. 
Attempted fixup: >{}<.".format( - orig_s, i, s)) # pylint: disable=undefined-loop-variable - - def pastebin(name, title, text, parent=None): """Paste the text into a pastebin and return the URL.