[kupfer] desktop_parse: Use shlex for command-line parser

From: Ulrik Sverdrup <usverdrup src gnome org>
To: commits-list gnome org
Cc:
Subject: [kupfer] desktop_parse: Use shlex for command-line parser
Date: Wed, 6 Apr 2011 03:37:18 +0000 (UTC)
commit a85daef026dbaca26292c00c78d57766a42d2d36
Author: Ulrik Sverdrup <ulrik sverdrup gmail com>
Date:   Wed Apr 6 05:37:03 2011 +0200

    desktop_parse: Use shlex for command-line parser
    
    Adapting to reality (awww) we use shlex to read .desktop Exec= files. The
    amount of horribly spec-violating .desktop files reported has been pretty
    large, including the '/usr'/bin/'Crazy Stuff' style quoting or unquoted
    backslashes galore.  Unsurprisingly these files come from rather evil
    sources (Adobe air, Wine, ..).

 kupfer/desktop_parse.py |  179 +++++++++++++++++-----------------------------
 1 files changed, 66 insertions(+), 113 deletions(-)
---
diff --git a/kupfer/desktop_parse.py b/kupfer/desktop_parse.py
index a3c07bb..58d29d4 100644
--- a/kupfer/desktop_parse.py
+++ b/kupfer/desktop_parse.py
@@ -9,7 +9,7 @@ The unescaping we are doing is only one way.. so we unescape according to the
 rules, but we accept everything, if validly quoted or not.
 """
 
-import warnings
+import shlex
 
 # This is the "string" type encoding escapes
 # this is unescaped before we process anything..
@@ -40,12 +40,6 @@ reserved = r""" " ' \ > < ~ | & ; $ * ? # ( ) ` """.split()
 reserved.extend([' ', '\t', '\n'])
 '''
 
-def rmquotes(s):
-	"remove first and last char if we can"
-	if len(s) > 1 and s[0] == s[-1] and s[0] in '"\'':
-		return s[1:-1]
-	return s
-
 def two_part_unescaper(s, reptable):
 	"Scan @s two characters at a time and replace using @reptable"
 	if not s:
@@ -65,92 +59,36 @@ def two_part_unescaper(s, reptable):
 		yield s[-1]
 	return ''.join(_inner())
 
-def quote_scanner(s, reptable):
-	"Scan @s two characters at a time and replace using @reptable"
-	qstr = r'"'
-	eqstr = '\\' + qstr
-
-	parts = []  # A list of arguments
-	preceding_space = False
-	# true if quoted arg is sticky on previous arg
-	should_join_arg = False
-
-	if not s:
-		return parts
-
-	def add_part(is_quoted, part):
-		_ps = "".join(part)
-		if is_quoted:
-			parts.append(two_part_unescaper(rmquotes(_ps), reptable))
-		elif '\\' in _ps:
-			## Here we handle out-of-spec things
-			warnings.warn(RuntimeWarning("Broken unquoted Exec= %s" % repr(s)))
-			## try to split by whitespace, ignore backslash-escaped spaces
-			## insert NUL instead of '\ ' and then split, then reverse
-			space_escaped = two_part_unescaper(_ps, {r'\ ': '\x00'})
-			space_esc_split = space_escaped.split()
-			ps_split = [x.replace('\x00', ' ') for x in space_esc_split]
-			parts.extend([two_part_unescaper(_ps_part, reptable) for _ps_part
-			              in ps_split])
-			## end out-of spec
-		else:
-			parts.extend(_ps.split())
-
-	def merge_last_parts():
-		"merge last two argv parts into one"
-		parts[:] = parts[:-2] + ["".join(parts[-2:])]
-
-	is_quoted = False
-	it = iter(zip(s, s[1:]))
-	part = []
-	for cur, nex in it:
-		part.append(cur)
-		if cur+nex == eqstr:
-			# Skip along if we see an escaped quote (\")
-			part.append(nex)
-			try:
-				it.next()
-			except StopIteration:
-				break
-		elif cur == qstr:
-			if is_quoted:
-				add_part(is_quoted, part)
-				if should_join_arg:
-					merge_last_parts()
-				part = []
-				is_quoted = not is_quoted
-			else:
-				head = part[:-1]
-				if head:
-					add_part(is_quoted, head)
-					part = [part[-1]]
-				is_quoted = not is_quoted
-				## if a quoted string begins without preceding whitespace
-				## we must sticky it on the preceding arg
-				should_join_arg = not preceding_space
-		else:
-			pass
-		preceding_space = cur.isspace()
+def custom_shlex_split(s, comments=False, posix=True):
+	"""
+	Wrapping shlex.split
+	"""
+	if isinstance(s, unicode):
+		is_unicode = True
+		s = s.encode("UTF-8")
 	else:
-		# This is a for-else: we did not 'break'
-		# Emit the last if it wasn't already
-		part.append(s[-1])
-	add_part(is_quoted, part)
-	return parts
-
+		is_unicode = False
+	lex = shlex.shlex(s, posix=posix)
+	lex.whitespace_split = True
+	if not comments:
+		lex.commenters = ''
+	try:
+		lex_output = list(lex)
+	except ValueError:
+		lex_output = [s]
+
+	## extra-unescape  ` and $ that are not handled by shlex
+	quoted_shlex = {r'\`': '`', r'\$':'$'}
+	lex_output[:] = [two_part_unescaper(x, quoted_shlex) for x in lex_output]
+	if is_unicode:
+		return [x.decode("UTF-8") for x in lex_output]
+	else:
+		return lex_output
 
 def unescape(s):
 	"Primary unescape of control sequences"
 	return two_part_unescaper(s, escape_table)
 
-def unquote_inside(s):
-	"unquote reserved chars inside a quoted string"
-	t = {}
-	slash = '\\'
-	for rep in quoted:
-		t[slash+rep] = rep
-	return two_part_unescaper(s, t)
-
 def test_unescape():
 	r"""
 	>>> t = r'"This \\$ \\\\ \s\\\\"'
@@ -161,25 +99,25 @@ def test_unescape():
 	"""
 	pass
 
-def test_unquote_inside():
-	r"""
-	>>> unquote_inside(r'\$ \\ \" \`')
-	'$ \\ " `'
-	>>> unquote_inside(r'abc \q')
-	'abc \\q'
-	"""
-	pass
-
 def parse_argv(instr):
 	r"""
 	Parse quoted @instr into an argv
 
+	This is according to the spec
 	>>> parse_argv('env "VAR=is good" ./program')
 	['env', 'VAR=is good', './program']
 	>>> parse_argv('env "VAR=\\\\ \\$ @ x" ./program')
 	['env', 'VAR=\\ $ @ x', './program']
+	>>> parse_argv('"\\$" "\\`"  "\\""')
+	['$', '`', '"']
+	>>> parse_argv('/usr/bin/x-prog -q %F')
+	['/usr/bin/x-prog', '-q', '%F']
+	>>> parse_argv('env LANG=en_US.UTF-8 freeciv-gtk2')
+	['env', 'LANG=en_US.UTF-8', 'freeciv-gtk2']
+
+	== Below this we need quirks mode ==
 
-	The following style is common but unspecified
+	The following style is common but not supported in spec
 	>>> parse_argv('env VAR="is broken" ./program')
 	['env', 'VAR=is broken', './program']
 
@@ -190,30 +128,45 @@ def parse_argv(instr):
 	The following is just completely broken
 	>>> parse_argv('./program No\\ Space')
 	['./program', 'No Space']
+
+	The following is just insanely broken
+	>>> parse_argv("'/opt'/now/'This is broken/'")
+	['/opt/now/This is broken/']
+
+	This is broken
+	#>>> parse_argv('\\$')
+	#['$']
+	#>>> parse_argv('\\$ \\`  \\"')
+	#['$', '`', '"']
+
+	Unmatched quote, normal mode (just testing that it does not raise)
+	>>> parse_argv('"hi there')
+	['"hi there']
+
+	Unmatched quote, quirks mode (just testing that it does not raise)
+	>>> parse_argv('A\\\\BC "hi there')
+	['A\\\\BC "hi there']
+
 	"""
-	return quote_scanner(instr, quoted_table)
+	return custom_shlex_split(instr)
 
 def parse_unesc_argv(instr):
-	"Parse quoted @instr into an argv after unescaping it"
-	return quote_scanner(unescape(instr), quoted_table)
+	r"""
+	Parse quoted @instr into an argv after unescaping it
 
-'''
-print escaped
-print reserved
+	>>> parse_unesc_argv(r'stuff "C:\\\\suck\\\\start.exe"')
+	['stuff', 'C:\\suck\\start.exe']
 
-t = r'"This \\$ \\\\ \s\\\\"'
-print repr(t)
-print t
-print unescape(t)
-print unquote_inside(rmquotes(unescape(t)))
+	== Below this we need quirks mode ==
 
-print two_part_unescaper(t, escape_table)
+	>>> parse_unesc_argv(r'stuff C:\\\\suck\\\\start.exe')
+	['stuff', 'C:\\suck\\start.exe']
 
-print quote_scanner(r'"hi \"there" I am you\"', inside_table)
-print quote_scanner(r'Now "\"this\" will be interesting"""', inside_table)
-print quote_scanner(unescape(r'"\\$"'), inside_table)
+	>>> parse_unesc_argv("'/usr'/bin/gnome-terminal -x gvim 'Insanely Broken'Yes")
+	['/usr/bin/gnome-terminal', '-x', 'gvim', 'Insanely BrokenYes']
+	"""
+	return custom_shlex_split(unescape(instr))
 
-'''
 
 if __name__ == "__main__":
 	import doctest
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]