# shlex.py
"""A lexical analyzer class for simple shell-like syntaxes."""
# Module and documentation by Eric S. Raymond, 21 Dec 1998
# Input stacking and error message cleanup added by ESR, March 2000
# push_source() and pop_source() made explicit by ESR, January 2001.
# Posix compliance, split(), string arguments, and
# iterator interface by Gustavo Niemeyer, April 2003.
import os
import re
import sys
from collections import deque
from io import StringIO

__all__ = ["shlex", "split", "quote"]
class shlex:
    "A lexical analyzer class for simple shell-like syntaxes."

    def __init__(self, instream=None, infile=None, posix=False):
        # A plain string argument is wrapped in StringIO so the lexer
        # always reads from a file-like object.
        if isinstance(instream, str):
            instream = StringIO(instream)
        if instream is not None:
            self.instream = instream
            self.infile = infile
        else:
            # No input source given: read from standard input.
            self.instream = sys.stdin
            self.infile = None
        self.posix = posix
        # In POSIX mode EOF is signalled by None so that '' can be a
        # legitimate (quoted-empty) token; otherwise '' marks EOF.
        if posix:
            self.eof = None
        else:
            self.eof = ''
        self.commenters = '#'
        self.wordchars = ('abcdfeghijklmnopqrstuvwxyz'
                          'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_')
        if self.posix:
            # POSIX mode additionally treats Latin-1 accented letters
            # as word characters.
            self.wordchars += ('ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'
                               'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ')
        self.whitespace = ' \t\r\n'
        self.whitespace_split = False   # if True, split on whitespace only
        self.quotes = '\'"'
        self.escape = '\\'
        self.escapedquotes = '"'        # quotes inside which escapes apply
        self.state = ' '                # tokenizer state (see read_token)
        self.pushback = deque()         # tokens pushed back by push_token()
        self.lineno = 1                 # current input line, for messages
        self.debug = 0                  # >0 enables progressively noisier output
        self.token = ''                 # token currently being accumulated
        self.filestack = deque()        # saved sources for push_source()
        self.source = None              # keyword that triggers file inclusion

    def push_token(self, tok):
        "Push a token onto the stack popped by the get_token method"
        if self.debug >= 1:
            print("shlex: pushing token " + repr(tok))
        self.pushback.appendleft(tok)

    def push_source(self, newstream, newfile=None):
        "Push an input source onto the lexer's input source stack."
        if isinstance(newstream, str):
            newstream = StringIO(newstream)
        # Save the current source so pop_source() can restore it later.
        self.filestack.appendleft((self.infile, self.instream, self.lineno))
        self.infile = newfile
        self.instream = newstream
        self.lineno = 1
        if self.debug:
            if newfile is not None:
                print('shlex: pushing to file %s' % (self.infile,))
            else:
                print('shlex: pushing to stream %s' % (self.instream,))

    def pop_source(self):
        "Pop the input source stack."
        self.instream.close()
        (self.infile, self.instream, self.lineno) = self.filestack.popleft()
        if self.debug:
            print('shlex: popping to %s, line %d' \
                  % (self.instream, self.lineno))
        # Restart tokenizing from a clean (between-tokens) state.
        self.state = ' '

    def get_token(self):
        "Get a token from the input stream (or from stack if it's nonempty)"
        if self.pushback:
            tok = self.pushback.popleft()
            if self.debug >= 1:
                print("shlex: popping token " + repr(tok))
            return tok
        # No pushback.  Get a token.
        raw = self.read_token()
        # Handle inclusions: a token equal to self.source introduces the
        # contents of the file named by the following token.
        if self.source is not None:
            while raw == self.source:
                spec = self.sourcehook(self.read_token())
                if spec:
                    (newfile, newstream) = spec
                    self.push_source(newstream, newfile)
                raw = self.get_token()
        # Maybe we got EOF instead?  Pop back to the including source,
        # if any; otherwise report EOF to the caller.
        while raw == self.eof:
            if not self.filestack:
                return self.eof
            else:
                self.pop_source()
                raw = self.get_token()
        # Neither inclusion nor EOF
        if self.debug >= 1:
            if raw != self.eof:
                print("shlex: token=" + repr(raw))
            else:
                print("shlex: token=EOF")
        return raw

    def read_token(self):
        # Core tokenizer: a character-at-a-time state machine.
        # self.state is one of:
        #   ' '   between tokens (whitespace state)
        #   'a'   accumulating a word
        #   a quote character   inside that kind of quotes
        #   an escape character just after a backslash
        #   None  past end of file
        quoted = False          # True once any part of the token was quoted
        escapedstate = ' '      # state to return to after an escape
        while True:
            nextchar = self.instream.read(1)
            if nextchar == '\n':
                self.lineno = self.lineno + 1
            if self.debug >= 3:
                print("shlex: in state", repr(self.state), \
                      "I see character:", repr(nextchar))
            if self.state is None:
                self.token = ''        # past end of file
                break
            elif self.state == ' ':
                if not nextchar:
                    self.state = None  # end of file
                    break
                elif nextchar in self.whitespace:
                    if self.debug >= 2:
                        print("shlex: I see whitespace in whitespace state")
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
                elif nextchar in self.commenters:
                    # Discard the rest of the comment line.
                    self.instream.readline()
                    self.lineno = self.lineno + 1
                elif self.posix and nextchar in self.escape:
                    escapedstate = 'a'
                    self.state = nextchar
                elif nextchar in self.wordchars:
                    self.token = nextchar
                    self.state = 'a'
                elif nextchar in self.quotes:
                    # Non-POSIX mode keeps the quote characters in the token.
                    if not self.posix:
                        self.token = nextchar
                    self.state = nextchar
                elif self.whitespace_split:
                    self.token = nextchar
                    self.state = 'a'
                else:
                    # Punctuation is emitted as a one-character token.
                    self.token = nextchar
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
            elif self.state in self.quotes:
                quoted = True
                if not nextchar:      # end of file
                    if self.debug >= 2:
                        print("shlex: I see EOF in quotes state")
                    # XXX what error should be raised here?
                    raise ValueError("No closing quotation")
                if nextchar == self.state:
                    # Matching close quote.
                    if not self.posix:
                        self.token = self.token + nextchar
                        self.state = ' '
                        break
                    else:
                        self.state = 'a'
                elif self.posix and nextchar in self.escape and \
                     self.state in self.escapedquotes:
                    escapedstate = self.state
                    self.state = nextchar
                else:
                    self.token = self.token + nextchar
            elif self.state in self.escape:
                if not nextchar:      # end of file
                    if self.debug >= 2:
                        print("shlex: I see EOF in escape state")
                    # XXX what error should be raised here?
                    raise ValueError("No escaped character")
                # In posix shells, only the quote itself or the escape
                # character may be escaped within quotes.
                if escapedstate in self.quotes and \
                   nextchar != self.state and nextchar != escapedstate:
                    self.token = self.token + self.state
                self.token = self.token + nextchar
                self.state = escapedstate
            elif self.state == 'a':
                if not nextchar:
                    self.state = None  # end of file
                    break
                elif nextchar in self.whitespace:
                    if self.debug >= 2:
                        print("shlex: I see whitespace in word state")
                    self.state = ' '
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
                elif nextchar in self.commenters:
                    self.instream.readline()
                    self.lineno = self.lineno + 1
                    if self.posix:
                        self.state = ' '
                        if self.token or (self.posix and quoted):
                            break   # emit current token
                        else:
                            continue
                elif self.posix and nextchar in self.quotes:
                    self.state = nextchar
                elif self.posix and nextchar in self.escape:
                    escapedstate = 'a'
                    self.state = nextchar
                elif nextchar in self.wordchars or nextchar in self.quotes \
                     or self.whitespace_split:
                    self.token = self.token + nextchar
                else:
                    # Punctuation ends the word; save the character for the
                    # next call via the pushback queue.
                    self.pushback.appendleft(nextchar)
                    if self.debug >= 2:
                        print("shlex: I see punctuation in word state")
                    self.state = ' '
                    if self.token:
                        break   # emit current token
                    else:
                        continue
        result = self.token
        self.token = ''
        # In POSIX mode an unquoted empty result means EOF, not a token.
        if self.posix and not quoted and result == '':
            result = None
        if self.debug > 1:
            if result:
                print("shlex: raw token=" + repr(result))
            else:
                print("shlex: raw token=EOF")
        return result

    def sourcehook(self, newfile):
        "Hook called on a filename to be sourced."
        if newfile[0] == '"':
            newfile = newfile[1:-1]
        # This implements cpp-like semantics for relative-path inclusion.
        if isinstance(self.infile, str) and not os.path.isabs(newfile):
            newfile = os.path.join(os.path.dirname(self.infile), newfile)
        return (newfile, open(newfile, "r"))

    def error_leader(self, infile=None, lineno=None):
        "Emit a C-compiler-like, Emacs-friendly error-message leader."
        if infile is None:
            infile = self.infile
        if lineno is None:
            lineno = self.lineno
        return "\"%s\", line %d: " % (infile, lineno)

    def __iter__(self):
        return self

    def __next__(self):
        # Iteration yields tokens until the EOF marker is seen.
        token = self.get_token()
        if token == self.eof:
            raise StopIteration
        return token
  253. def split(s, comments=False, posix=True):
  254. lex = shlex(s, posix=posix)
  255. lex.whitespace_split = True
  256. if not comments:
  257. lex.commenters = ''
  258. return list(lex)
  259. _find_unsafe = re.compile(r'[^\w@%+=:,./-]', re.ASCII).search
  260. def quote(s):
  261. """Return a shell-escaped version of the string *s*."""
  262. if not s:
  263. return "''"
  264. if _find_unsafe(s) is None:
  265. return s
  266. # use single quotes, and put single quotes into double quotes
  267. # the string $'b is then quoted as '$'"'"'b'
  268. return "'" + s.replace("'", "'\"'\"'") + "'"
  269. def _print_tokens(lexer):
  270. while 1:
  271. tt = lexer.get_token()
  272. if not tt:
  273. break
  274. print("Token: " + repr(tt))
  275. if __name__ == '__main__':
  276. if len(sys.argv) == 1:
  277. _print_tokens(shlex())
  278. else:
  279. fn = sys.argv[1]
  280. with open(fn) as f:
  281. _print_tokens(shlex(f, fn))