sre_parse.py 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941
  1. #
  2. # Secret Labs' Regular Expression Engine
  3. #
  4. # convert re-style regular expression to sre pattern
  5. #
  6. # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
  7. #
  8. # See the sre.py file for information on usage and redistribution.
  9. #
  10. """Internal support module for sre"""
  11. # XXX: show string offset and offending character for all errors
  12. from sre_constants import *
# Characters with special meaning in a pattern; _parse() treats any other
# single character as a literal.
SPECIAL_CHARS = ".\\[{()*+?^$|"
# Characters that start a repeat qualifier applying to the preceding item.
REPEAT_CHARS = "*+?{"

# Character sets consulted while scanning numeric escapes, group numbers and
# repeat counts.
DIGITS = frozenset("0123456789")
OCTDIGITS = frozenset("01234567")
HEXDIGITS = frozenset("0123456789abcdefABCDEF")
ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
# Characters _parse() skips between tokens when the VERBOSE flag is set.
WHITESPACE = frozenset(" \t\n\r\v\f")

# Opcodes of an item that already carries a repeat; used for the
# "multiple repeat" check and by SubPattern.getwidth().
_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
# Opcodes that SubPattern.getwidth() counts as exactly one character wide.
_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})

# Simple character escapes: escape text -> (LITERAL, code point).
ESCAPES = {
    r"\a": (LITERAL, ord("\a")),
    r"\b": (LITERAL, ord("\b")),
    r"\f": (LITERAL, ord("\f")),
    r"\n": (LITERAL, ord("\n")),
    r"\r": (LITERAL, ord("\r")),
    r"\t": (LITERAL, ord("\t")),
    r"\v": (LITERAL, ord("\v")),
    r"\\": (LITERAL, ord("\\"))
}

# Category and position escapes: escape text -> (AT, position) or
# (IN, [(CATEGORY, category)]).  Note that r"\b" appears in both tables:
# _escape() looks here first, _class_escape() looks in ESCAPES first and
# only accepts the IN entries from this table.
CATEGORIES = {
    r"\A": (AT, AT_BEGINNING_STRING), # start of string
    r"\b": (AT, AT_BOUNDARY),
    r"\B": (AT, AT_NON_BOUNDARY),
    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
    r"\Z": (AT, AT_END_STRING), # end of string
}

# Inline flag letters, as written in "(?...)", -> SRE flag bit.
FLAGS = {
    # standard flags
    "i": SRE_FLAG_IGNORECASE,
    "L": SRE_FLAG_LOCALE,
    "m": SRE_FLAG_MULTILINE,
    "s": SRE_FLAG_DOTALL,
    "x": SRE_FLAG_VERBOSE,
    # extensions
    "a": SRE_FLAG_ASCII,
    "t": SRE_FLAG_TEMPLATE,
    "u": SRE_FLAG_UNICODE,
}
  56. class Pattern:
  57. # master pattern object. keeps track of global attributes
  58. def __init__(self):
  59. self.flags = 0
  60. self.groupdict = {}
  61. self.groupwidths = [None] # group 0
  62. self.lookbehindgroups = None
  63. @property
  64. def groups(self):
  65. return len(self.groupwidths)
  66. def opengroup(self, name=None):
  67. gid = self.groups
  68. self.groupwidths.append(None)
  69. if self.groups > MAXGROUPS:
  70. raise error("too many groups")
  71. if name is not None:
  72. ogid = self.groupdict.get(name, None)
  73. if ogid is not None:
  74. raise error("redefinition of group name %r as group %d; "
  75. "was group %d" % (name, gid, ogid))
  76. self.groupdict[name] = gid
  77. return gid
  78. def closegroup(self, gid, p):
  79. self.groupwidths[gid] = p.getwidth()
  80. def checkgroup(self, gid):
  81. return gid < self.groups and self.groupwidths[gid] is not None
  82. def checklookbehindgroup(self, gid, source):
  83. if self.lookbehindgroups is not None:
  84. if not self.checkgroup(gid):
  85. raise source.error('cannot refer to an open group')
  86. if gid >= self.lookbehindgroups:
  87. raise source.error('cannot refer to group defined in the same '
  88. 'lookbehind subpattern')
  89. class SubPattern:
  90. # a subpattern, in intermediate form
  91. def __init__(self, pattern, data=None):
  92. self.pattern = pattern
  93. if data is None:
  94. data = []
  95. self.data = data
  96. self.width = None
  97. def dump(self, level=0):
  98. nl = True
  99. seqtypes = (tuple, list)
  100. for op, av in self.data:
  101. print(level*" " + str(op), end='')
  102. if op is IN:
  103. # member sublanguage
  104. print()
  105. for op, a in av:
  106. print((level+1)*" " + str(op), a)
  107. elif op is BRANCH:
  108. print()
  109. for i, a in enumerate(av[1]):
  110. if i:
  111. print(level*" " + "OR")
  112. a.dump(level+1)
  113. elif op is GROUPREF_EXISTS:
  114. condgroup, item_yes, item_no = av
  115. print('', condgroup)
  116. item_yes.dump(level+1)
  117. if item_no:
  118. print(level*" " + "ELSE")
  119. item_no.dump(level+1)
  120. elif isinstance(av, seqtypes):
  121. nl = False
  122. for a in av:
  123. if isinstance(a, SubPattern):
  124. if not nl:
  125. print()
  126. a.dump(level+1)
  127. nl = True
  128. else:
  129. if not nl:
  130. print(' ', end='')
  131. print(a, end='')
  132. nl = False
  133. if not nl:
  134. print()
  135. else:
  136. print('', av)
  137. def __repr__(self):
  138. return repr(self.data)
  139. def __len__(self):
  140. return len(self.data)
  141. def __delitem__(self, index):
  142. del self.data[index]
  143. def __getitem__(self, index):
  144. if isinstance(index, slice):
  145. return SubPattern(self.pattern, self.data[index])
  146. return self.data[index]
  147. def __setitem__(self, index, code):
  148. self.data[index] = code
  149. def insert(self, index, code):
  150. self.data.insert(index, code)
  151. def append(self, code):
  152. self.data.append(code)
  153. def getwidth(self):
  154. # determine the width (min, max) for this subpattern
  155. if self.width is not None:
  156. return self.width
  157. lo = hi = 0
  158. for op, av in self.data:
  159. if op is BRANCH:
  160. i = MAXREPEAT - 1
  161. j = 0
  162. for av in av[1]:
  163. l, h = av.getwidth()
  164. i = min(i, l)
  165. j = max(j, h)
  166. lo = lo + i
  167. hi = hi + j
  168. elif op is CALL:
  169. i, j = av.getwidth()
  170. lo = lo + i
  171. hi = hi + j
  172. elif op is SUBPATTERN:
  173. i, j = av[1].getwidth()
  174. lo = lo + i
  175. hi = hi + j
  176. elif op in _REPEATCODES:
  177. i, j = av[2].getwidth()
  178. lo = lo + i * av[0]
  179. hi = hi + j * av[1]
  180. elif op in _UNITCODES:
  181. lo = lo + 1
  182. hi = hi + 1
  183. elif op is GROUPREF:
  184. i, j = self.pattern.groupwidths[av]
  185. lo = lo + i
  186. hi = hi + j
  187. elif op is GROUPREF_EXISTS:
  188. i, j = av[1].getwidth()
  189. if av[2] is not None:
  190. l, h = av[2].getwidth()
  191. i = min(i, l)
  192. j = max(j, h)
  193. else:
  194. i = 0
  195. lo = lo + i
  196. hi = hi + j
  197. elif op is SUCCESS:
  198. break
  199. self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
  200. return self.width
  201. class Tokenizer:
  202. def __init__(self, string):
  203. self.istext = isinstance(string, str)
  204. self.string = string
  205. if not self.istext:
  206. string = str(string, 'latin1')
  207. self.decoded_string = string
  208. self.index = 0
  209. self.next = None
  210. self.__next()
  211. def __next(self):
  212. index = self.index
  213. try:
  214. char = self.decoded_string[index]
  215. except IndexError:
  216. self.next = None
  217. return
  218. if char == "\\":
  219. index += 1
  220. try:
  221. char += self.decoded_string[index]
  222. except IndexError:
  223. raise error("bad escape (end of pattern)",
  224. self.string, len(self.string) - 1) from None
  225. self.index = index + 1
  226. self.next = char
  227. def match(self, char):
  228. if char == self.next:
  229. self.__next()
  230. return True
  231. return False
  232. def get(self):
  233. this = self.next
  234. self.__next()
  235. return this
  236. def getwhile(self, n, charset):
  237. result = ''
  238. for _ in range(n):
  239. c = self.next
  240. if c not in charset:
  241. break
  242. result += c
  243. self.__next()
  244. return result
  245. def getuntil(self, terminator):
  246. result = ''
  247. while True:
  248. c = self.next
  249. self.__next()
  250. if c is None:
  251. if not result:
  252. raise self.error("missing group name")
  253. raise self.error("missing %s, unterminated name" % terminator,
  254. len(result))
  255. if c == terminator:
  256. if not result:
  257. raise self.error("missing group name", 1)
  258. break
  259. result += c
  260. return result
  261. def tell(self):
  262. return self.index - len(self.next or '')
  263. def seek(self, index):
  264. self.index = index
  265. self.__next()
  266. def error(self, msg, offset=0):
  267. return error(msg, self.string, self.tell() - offset)
  268. # The following three functions are not used in this module anymore, but we keep
  269. # them here (with DeprecationWarnings) for backwards compatibility.
  270. def isident(char):
  271. import warnings
  272. warnings.warn('sre_parse.isident() will be removed in 3.5',
  273. DeprecationWarning, stacklevel=2)
  274. return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"
  275. def isdigit(char):
  276. import warnings
  277. warnings.warn('sre_parse.isdigit() will be removed in 3.5',
  278. DeprecationWarning, stacklevel=2)
  279. return "0" <= char <= "9"
  280. def isname(name):
  281. import warnings
  282. warnings.warn('sre_parse.isname() will be removed in 3.5',
  283. DeprecationWarning, stacklevel=2)
  284. # check that group name is a valid string
  285. if not isident(name[0]):
  286. return False
  287. for char in name[1:]:
  288. if not isident(char) and not isdigit(char):
  289. return False
  290. return True
  291. def _class_escape(source, escape):
  292. # handle escape code inside character class
  293. code = ESCAPES.get(escape)
  294. if code:
  295. return code
  296. code = CATEGORIES.get(escape)
  297. if code and code[0] is IN:
  298. return code
  299. try:
  300. c = escape[1:2]
  301. if c == "x":
  302. # hexadecimal escape (exactly two digits)
  303. escape += source.getwhile(2, HEXDIGITS)
  304. if len(escape) != 4:
  305. raise source.error("incomplete escape %s" % escape, len(escape))
  306. return LITERAL, int(escape[2:], 16)
  307. elif c == "u" and source.istext:
  308. # unicode escape (exactly four digits)
  309. escape += source.getwhile(4, HEXDIGITS)
  310. if len(escape) != 6:
  311. raise source.error("incomplete escape %s" % escape, len(escape))
  312. return LITERAL, int(escape[2:], 16)
  313. elif c == "U" and source.istext:
  314. # unicode escape (exactly eight digits)
  315. escape += source.getwhile(8, HEXDIGITS)
  316. if len(escape) != 10:
  317. raise source.error("incomplete escape %s" % escape, len(escape))
  318. c = int(escape[2:], 16)
  319. chr(c) # raise ValueError for invalid code
  320. return LITERAL, c
  321. elif c in OCTDIGITS:
  322. # octal escape (up to three digits)
  323. escape += source.getwhile(2, OCTDIGITS)
  324. c = int(escape[1:], 8)
  325. if c > 0o377:
  326. raise source.error('octal escape value %s outside of '
  327. 'range 0-0o377' % escape, len(escape))
  328. return LITERAL, c
  329. elif c in DIGITS:
  330. raise ValueError
  331. if len(escape) == 2:
  332. if c in ASCIILETTERS:
  333. import warnings
  334. warnings.warn('bad escape %s' % escape,
  335. DeprecationWarning, stacklevel=8)
  336. return LITERAL, ord(escape[1])
  337. except ValueError:
  338. pass
  339. raise source.error("bad escape %s" % escape, len(escape))
  340. def _escape(source, escape, state):
  341. # handle escape code in expression
  342. code = CATEGORIES.get(escape)
  343. if code:
  344. return code
  345. code = ESCAPES.get(escape)
  346. if code:
  347. return code
  348. try:
  349. c = escape[1:2]
  350. if c == "x":
  351. # hexadecimal escape
  352. escape += source.getwhile(2, HEXDIGITS)
  353. if len(escape) != 4:
  354. raise source.error("incomplete escape %s" % escape, len(escape))
  355. return LITERAL, int(escape[2:], 16)
  356. elif c == "u" and source.istext:
  357. # unicode escape (exactly four digits)
  358. escape += source.getwhile(4, HEXDIGITS)
  359. if len(escape) != 6:
  360. raise source.error("incomplete escape %s" % escape, len(escape))
  361. return LITERAL, int(escape[2:], 16)
  362. elif c == "U" and source.istext:
  363. # unicode escape (exactly eight digits)
  364. escape += source.getwhile(8, HEXDIGITS)
  365. if len(escape) != 10:
  366. raise source.error("incomplete escape %s" % escape, len(escape))
  367. c = int(escape[2:], 16)
  368. chr(c) # raise ValueError for invalid code
  369. return LITERAL, c
  370. elif c == "0":
  371. # octal escape
  372. escape += source.getwhile(2, OCTDIGITS)
  373. return LITERAL, int(escape[1:], 8)
  374. elif c in DIGITS:
  375. # octal escape *or* decimal group reference (sigh)
  376. if source.next in DIGITS:
  377. escape += source.get()
  378. if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
  379. source.next in OCTDIGITS):
  380. # got three octal digits; this is an octal escape
  381. escape += source.get()
  382. c = int(escape[1:], 8)
  383. if c > 0o377:
  384. raise source.error('octal escape value %s outside of '
  385. 'range 0-0o377' % escape,
  386. len(escape))
  387. return LITERAL, c
  388. # not an octal escape, so this is a group reference
  389. group = int(escape[1:])
  390. if group < state.groups:
  391. if not state.checkgroup(group):
  392. raise source.error("cannot refer to an open group",
  393. len(escape))
  394. state.checklookbehindgroup(group, source)
  395. return GROUPREF, group
  396. raise source.error("invalid group reference", len(escape))
  397. if len(escape) == 2:
  398. if c in ASCIILETTERS:
  399. import warnings
  400. warnings.warn('bad escape %s' % escape,
  401. DeprecationWarning, stacklevel=8)
  402. return LITERAL, ord(escape[1])
  403. except ValueError:
  404. pass
  405. raise source.error("bad escape %s" % escape, len(escape))
  406. def _parse_sub(source, state, nested=True):
  407. # parse an alternation: a|b|c
  408. items = []
  409. itemsappend = items.append
  410. sourcematch = source.match
  411. start = source.tell()
  412. while True:
  413. itemsappend(_parse(source, state))
  414. if not sourcematch("|"):
  415. break
  416. if len(items) == 1:
  417. return items[0]
  418. subpattern = SubPattern(state)
  419. subpatternappend = subpattern.append
  420. # check if all items share a common prefix
  421. while True:
  422. prefix = None
  423. for item in items:
  424. if not item:
  425. break
  426. if prefix is None:
  427. prefix = item[0]
  428. elif item[0] != prefix:
  429. break
  430. else:
  431. # all subitems start with a common "prefix".
  432. # move it out of the branch
  433. for item in items:
  434. del item[0]
  435. subpatternappend(prefix)
  436. continue # check next one
  437. break
  438. # check if the branch can be replaced by a character set
  439. for item in items:
  440. if len(item) != 1 or item[0][0] is not LITERAL:
  441. break
  442. else:
  443. # we can store this as a character set instead of a
  444. # branch (the compiler may optimize this even more)
  445. subpatternappend((IN, [item[0] for item in items]))
  446. return subpattern
  447. subpattern.append((BRANCH, (None, items)))
  448. return subpattern
  449. def _parse_sub_cond(source, state, condgroup):
  450. item_yes = _parse(source, state)
  451. if source.match("|"):
  452. item_no = _parse(source, state)
  453. if source.next == "|":
  454. raise source.error("conditional backref with more than two branches")
  455. else:
  456. item_no = None
  457. subpattern = SubPattern(state)
  458. subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
  459. return subpattern
  460. def _parse(source, state):
  461. # parse a simple pattern
  462. subpattern = SubPattern(state)
  463. # precompute constants into local variables
  464. subpatternappend = subpattern.append
  465. sourceget = source.get
  466. sourcematch = source.match
  467. _len = len
  468. _ord = ord
  469. verbose = state.flags & SRE_FLAG_VERBOSE
  470. while True:
  471. this = source.next
  472. if this is None:
  473. break # end of pattern
  474. if this in "|)":
  475. break # end of subpattern
  476. sourceget()
  477. if verbose:
  478. # skip whitespace and comments
  479. if this in WHITESPACE:
  480. continue
  481. if this == "#":
  482. while True:
  483. this = sourceget()
  484. if this is None or this == "\n":
  485. break
  486. continue
  487. if this[0] == "\\":
  488. code = _escape(source, this, state)
  489. subpatternappend(code)
  490. elif this not in SPECIAL_CHARS:
  491. subpatternappend((LITERAL, _ord(this)))
  492. elif this == "[":
  493. here = source.tell() - 1
  494. # character set
  495. set = []
  496. setappend = set.append
  497. ## if sourcematch(":"):
  498. ## pass # handle character classes
  499. if sourcematch("^"):
  500. setappend((NEGATE, None))
  501. # check remaining characters
  502. start = set[:]
  503. while True:
  504. this = sourceget()
  505. if this is None:
  506. raise source.error("unterminated character set",
  507. source.tell() - here)
  508. if this == "]" and set != start:
  509. break
  510. elif this[0] == "\\":
  511. code1 = _class_escape(source, this)
  512. else:
  513. code1 = LITERAL, _ord(this)
  514. if sourcematch("-"):
  515. # potential range
  516. that = sourceget()
  517. if that is None:
  518. raise source.error("unterminated character set",
  519. source.tell() - here)
  520. if that == "]":
  521. if code1[0] is IN:
  522. code1 = code1[1][0]
  523. setappend(code1)
  524. setappend((LITERAL, _ord("-")))
  525. break
  526. if that[0] == "\\":
  527. code2 = _class_escape(source, that)
  528. else:
  529. code2 = LITERAL, _ord(that)
  530. if code1[0] != LITERAL or code2[0] != LITERAL:
  531. msg = "bad character range %s-%s" % (this, that)
  532. raise source.error(msg, len(this) + 1 + len(that))
  533. lo = code1[1]
  534. hi = code2[1]
  535. if hi < lo:
  536. msg = "bad character range %s-%s" % (this, that)
  537. raise source.error(msg, len(this) + 1 + len(that))
  538. setappend((RANGE, (lo, hi)))
  539. else:
  540. if code1[0] is IN:
  541. code1 = code1[1][0]
  542. setappend(code1)
  543. # XXX: <fl> should move set optimization to compiler!
  544. if _len(set)==1 and set[0][0] is LITERAL:
  545. subpatternappend(set[0]) # optimization
  546. elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
  547. subpatternappend((NOT_LITERAL, set[1][1])) # optimization
  548. else:
  549. # XXX: <fl> should add charmap optimization here
  550. subpatternappend((IN, set))
  551. elif this in REPEAT_CHARS:
  552. # repeat previous item
  553. here = source.tell()
  554. if this == "?":
  555. min, max = 0, 1
  556. elif this == "*":
  557. min, max = 0, MAXREPEAT
  558. elif this == "+":
  559. min, max = 1, MAXREPEAT
  560. elif this == "{":
  561. if source.next == "}":
  562. subpatternappend((LITERAL, _ord(this)))
  563. continue
  564. min, max = 0, MAXREPEAT
  565. lo = hi = ""
  566. while source.next in DIGITS:
  567. lo += sourceget()
  568. if sourcematch(","):
  569. while source.next in DIGITS:
  570. hi += sourceget()
  571. else:
  572. hi = lo
  573. if not sourcematch("}"):
  574. subpatternappend((LITERAL, _ord(this)))
  575. source.seek(here)
  576. continue
  577. if lo:
  578. min = int(lo)
  579. if min >= MAXREPEAT:
  580. raise OverflowError("the repetition number is too large")
  581. if hi:
  582. max = int(hi)
  583. if max >= MAXREPEAT:
  584. raise OverflowError("the repetition number is too large")
  585. if max < min:
  586. raise source.error("min repeat greater than max repeat",
  587. source.tell() - here)
  588. else:
  589. raise AssertionError("unsupported quantifier %r" % (char,))
  590. # figure out which item to repeat
  591. if subpattern:
  592. item = subpattern[-1:]
  593. else:
  594. item = None
  595. if not item or (_len(item) == 1 and item[0][0] is AT):
  596. raise source.error("nothing to repeat",
  597. source.tell() - here + len(this))
  598. if item[0][0] in _REPEATCODES:
  599. raise source.error("multiple repeat",
  600. source.tell() - here + len(this))
  601. if sourcematch("?"):
  602. subpattern[-1] = (MIN_REPEAT, (min, max, item))
  603. else:
  604. subpattern[-1] = (MAX_REPEAT, (min, max, item))
  605. elif this == ".":
  606. subpatternappend((ANY, None))
  607. elif this == "(":
  608. start = source.tell() - 1
  609. group = True
  610. name = None
  611. condgroup = None
  612. if sourcematch("?"):
  613. # options
  614. char = sourceget()
  615. if char is None:
  616. raise source.error("unexpected end of pattern")
  617. if char == "P":
  618. # python extensions
  619. if sourcematch("<"):
  620. # named group: skip forward to end of name
  621. name = source.getuntil(">")
  622. if not name.isidentifier():
  623. msg = "bad character in group name %r" % name
  624. raise source.error(msg, len(name) + 1)
  625. elif sourcematch("="):
  626. # named backreference
  627. name = source.getuntil(")")
  628. if not name.isidentifier():
  629. msg = "bad character in group name %r" % name
  630. raise source.error(msg, len(name) + 1)
  631. gid = state.groupdict.get(name)
  632. if gid is None:
  633. msg = "unknown group name %r" % name
  634. raise source.error(msg, len(name) + 1)
  635. if not state.checkgroup(gid):
  636. raise source.error("cannot refer to an open group",
  637. len(name) + 1)
  638. state.checklookbehindgroup(gid, source)
  639. subpatternappend((GROUPREF, gid))
  640. continue
  641. else:
  642. char = sourceget()
  643. if char is None:
  644. raise source.error("unexpected end of pattern")
  645. raise source.error("unknown extension ?P" + char,
  646. len(char) + 2)
  647. elif char == ":":
  648. # non-capturing group
  649. group = None
  650. elif char == "#":
  651. # comment
  652. while True:
  653. if source.next is None:
  654. raise source.error("missing ), unterminated comment",
  655. source.tell() - start)
  656. if sourceget() == ")":
  657. break
  658. continue
  659. elif char in "=!<":
  660. # lookahead assertions
  661. dir = 1
  662. if char == "<":
  663. char = sourceget()
  664. if char is None:
  665. raise source.error("unexpected end of pattern")
  666. if char not in "=!":
  667. raise source.error("unknown extension ?<" + char,
  668. len(char) + 2)
  669. dir = -1 # lookbehind
  670. lookbehindgroups = state.lookbehindgroups
  671. if lookbehindgroups is None:
  672. state.lookbehindgroups = state.groups
  673. p = _parse_sub(source, state)
  674. if dir < 0:
  675. if lookbehindgroups is None:
  676. state.lookbehindgroups = None
  677. if not sourcematch(")"):
  678. raise source.error("missing ), unterminated subpattern",
  679. source.tell() - start)
  680. if char == "=":
  681. subpatternappend((ASSERT, (dir, p)))
  682. else:
  683. subpatternappend((ASSERT_NOT, (dir, p)))
  684. continue
  685. elif char == "(":
  686. # conditional backreference group
  687. condname = source.getuntil(")")
  688. group = None
  689. if condname.isidentifier():
  690. condgroup = state.groupdict.get(condname)
  691. if condgroup is None:
  692. msg = "unknown group name %r" % condname
  693. raise source.error(msg, len(condname) + 1)
  694. else:
  695. try:
  696. condgroup = int(condname)
  697. if condgroup < 0:
  698. raise ValueError
  699. except ValueError:
  700. msg = "bad character in group name %r" % condname
  701. raise source.error(msg, len(condname) + 1) from None
  702. if not condgroup:
  703. raise source.error("bad group number",
  704. len(condname) + 1)
  705. if condgroup >= MAXGROUPS:
  706. raise source.error("invalid group reference",
  707. len(condname) + 1)
  708. state.checklookbehindgroup(condgroup, source)
  709. elif char in FLAGS:
  710. # flags
  711. while True:
  712. state.flags |= FLAGS[char]
  713. char = sourceget()
  714. if char is None:
  715. raise source.error("missing )")
  716. if char == ")":
  717. break
  718. if char not in FLAGS:
  719. raise source.error("unknown flag", len(char))
  720. verbose = state.flags & SRE_FLAG_VERBOSE
  721. continue
  722. else:
  723. raise source.error("unknown extension ?" + char,
  724. len(char) + 1)
  725. # parse group contents
  726. if group is not None:
  727. try:
  728. group = state.opengroup(name)
  729. except error as err:
  730. raise source.error(err.msg, len(name) + 1) from None
  731. if condgroup:
  732. p = _parse_sub_cond(source, state, condgroup)
  733. else:
  734. p = _parse_sub(source, state)
  735. if not source.match(")"):
  736. raise source.error("missing ), unterminated subpattern",
  737. source.tell() - start)
  738. if group is not None:
  739. state.closegroup(group, p)
  740. subpatternappend((SUBPATTERN, (group, p)))
  741. elif this == "^":
  742. subpatternappend((AT, AT_BEGINNING))
  743. elif this == "$":
  744. subpattern.append((AT, AT_END))
  745. else:
  746. raise AssertionError("unsupported special character %r" % (char,))
  747. return subpattern
  748. def fix_flags(src, flags):
  749. # Check and fix flags according to the type of pattern (str or bytes)
  750. if isinstance(src, str):
  751. if flags & SRE_FLAG_LOCALE:
  752. import warnings
  753. warnings.warn("LOCALE flag with a str pattern is deprecated. "
  754. "Will be an error in 3.6",
  755. DeprecationWarning, stacklevel=6)
  756. if not flags & SRE_FLAG_ASCII:
  757. flags |= SRE_FLAG_UNICODE
  758. elif flags & SRE_FLAG_UNICODE:
  759. raise ValueError("ASCII and UNICODE flags are incompatible")
  760. else:
  761. if flags & SRE_FLAG_UNICODE:
  762. raise ValueError("cannot use UNICODE flag with a bytes pattern")
  763. if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII:
  764. import warnings
  765. warnings.warn("ASCII and LOCALE flags are incompatible. "
  766. "Will be an error in 3.6",
  767. DeprecationWarning, stacklevel=6)
  768. return flags
  769. def parse(str, flags=0, pattern=None):
  770. # parse 're' pattern into list of (opcode, argument) tuples
  771. source = Tokenizer(str)
  772. if pattern is None:
  773. pattern = Pattern()
  774. pattern.flags = flags
  775. pattern.str = str
  776. p = _parse_sub(source, pattern, 0)
  777. p.pattern.flags = fix_flags(str, p.pattern.flags)
  778. if source.next is not None:
  779. assert source.next == ")"
  780. raise source.error("unbalanced parenthesis")
  781. if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
  782. # the VERBOSE flag was switched on inside the pattern. to be
  783. # on the safe side, we'll parse the whole thing again...
  784. return parse(str, p.pattern.flags)
  785. if flags & SRE_FLAG_DEBUG:
  786. p.dump()
  787. return p
  788. def parse_template(source, pattern):
  789. # parse 're' replacement string into list of literals and
  790. # group references
  791. s = Tokenizer(source)
  792. sget = s.get
  793. groups = []
  794. literals = []
  795. literal = []
  796. lappend = literal.append
  797. def addgroup(index):
  798. if literal:
  799. literals.append(''.join(literal))
  800. del literal[:]
  801. groups.append((len(literals), index))
  802. literals.append(None)
  803. groupindex = pattern.groupindex
  804. while True:
  805. this = sget()
  806. if this is None:
  807. break # end of replacement string
  808. if this[0] == "\\":
  809. # group
  810. c = this[1]
  811. if c == "g":
  812. name = ""
  813. if not s.match("<"):
  814. raise s.error("missing <")
  815. name = s.getuntil(">")
  816. if name.isidentifier():
  817. try:
  818. index = groupindex[name]
  819. except KeyError:
  820. raise IndexError("unknown group name %r" % name)
  821. else:
  822. try:
  823. index = int(name)
  824. if index < 0:
  825. raise ValueError
  826. except ValueError:
  827. raise s.error("bad character in group name %r" % name,
  828. len(name) + 1) from None
  829. if index >= MAXGROUPS:
  830. raise s.error("invalid group reference",
  831. len(name) + 1)
  832. addgroup(index)
  833. elif c == "0":
  834. if s.next in OCTDIGITS:
  835. this += sget()
  836. if s.next in OCTDIGITS:
  837. this += sget()
  838. lappend(chr(int(this[1:], 8) & 0xff))
  839. elif c in DIGITS:
  840. isoctal = False
  841. if s.next in DIGITS:
  842. this += sget()
  843. if (c in OCTDIGITS and this[2] in OCTDIGITS and
  844. s.next in OCTDIGITS):
  845. this += sget()
  846. isoctal = True
  847. c = int(this[1:], 8)
  848. if c > 0o377:
  849. raise s.error('octal escape value %s outside of '
  850. 'range 0-0o377' % this, len(this))
  851. lappend(chr(c))
  852. if not isoctal:
  853. addgroup(int(this[1:]))
  854. else:
  855. try:
  856. this = chr(ESCAPES[this][1])
  857. except KeyError:
  858. if c in ASCIILETTERS:
  859. import warnings
  860. warnings.warn('bad escape %s' % this,
  861. DeprecationWarning, stacklevel=4)
  862. lappend(this)
  863. else:
  864. lappend(this)
  865. if literal:
  866. literals.append(''.join(literal))
  867. if not isinstance(source, str):
  868. # The tokenizer implicitly decodes bytes objects as latin-1, we must
  869. # therefore re-encode the final representation.
  870. literals = [None if s is None else s.encode('latin-1') for s in literals]
  871. return groups, literals
  872. def expand_template(template, match):
  873. g = match.group
  874. empty = match.string[:0]
  875. groups, literals = template
  876. literals = literals[:]
  877. try:
  878. for index, group in groups:
  879. literals[index] = g(group) or empty
  880. except IndexError:
  881. raise error("invalid group reference")
  882. return empty.join(literals)