btm_utils.py 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283
  1. "Utility functions used by the btm_matcher module"
  2. from . import pytree
  3. from .pgen2 import grammar, token
  4. from .pygram import pattern_symbols, python_symbols
  5. syms = pattern_symbols
  6. pysyms = python_symbols
  7. tokens = grammar.opmap
  8. token_labels = token
  9. TYPE_ANY = -1
  10. TYPE_ALTERNATIVES = -2
  11. TYPE_GROUP = -3
  12. class MinNode(object):
  13. """This class serves as an intermediate representation of the
  14. pattern tree during the conversion to sets of leaf-to-root
  15. subpatterns"""
  16. def __init__(self, type=None, name=None):
  17. self.type = type
  18. self.name = name
  19. self.children = []
  20. self.leaf = False
  21. self.parent = None
  22. self.alternatives = []
  23. self.group = []
  24. def __repr__(self):
  25. return str(self.type) + ' ' + str(self.name)
  26. def leaf_to_root(self):
  27. """Internal method. Returns a characteristic path of the
  28. pattern tree. This method must be run for all leaves until the
  29. linear subpatterns are merged into a single"""
  30. node = self
  31. subp = []
  32. while node:
  33. if node.type == TYPE_ALTERNATIVES:
  34. node.alternatives.append(subp)
  35. if len(node.alternatives) == len(node.children):
  36. #last alternative
  37. subp = [tuple(node.alternatives)]
  38. node.alternatives = []
  39. node = node.parent
  40. continue
  41. else:
  42. node = node.parent
  43. subp = None
  44. break
  45. if node.type == TYPE_GROUP:
  46. node.group.append(subp)
  47. #probably should check the number of leaves
  48. if len(node.group) == len(node.children):
  49. subp = get_characteristic_subpattern(node.group)
  50. node.group = []
  51. node = node.parent
  52. continue
  53. else:
  54. node = node.parent
  55. subp = None
  56. break
  57. if node.type == token_labels.NAME and node.name:
  58. #in case of type=name, use the name instead
  59. subp.append(node.name)
  60. else:
  61. subp.append(node.type)
  62. node = node.parent
  63. return subp
  64. def get_linear_subpattern(self):
  65. """Drives the leaf_to_root method. The reason that
  66. leaf_to_root must be run multiple times is because we need to
  67. reject 'group' matches; for example the alternative form
  68. (a | b c) creates a group [b c] that needs to be matched. Since
  69. matching multiple linear patterns overcomes the automaton's
  70. capabilities, leaf_to_root merges each group into a single
  71. choice based on 'characteristic'ity,
  72. i.e. (a|b c) -> (a|b) if b more characteristic than c
  73. Returns: The most 'characteristic'(as defined by
  74. get_characteristic_subpattern) path for the compiled pattern
  75. tree.
  76. """
  77. for l in self.leaves():
  78. subp = l.leaf_to_root()
  79. if subp:
  80. return subp
  81. def leaves(self):
  82. "Generator that returns the leaves of the tree"
  83. for child in self.children:
  84. for x in child.leaves():
  85. yield x
  86. if not self.children:
  87. yield self
  88. def reduce_tree(node, parent=None):
  89. """
  90. Internal function. Reduces a compiled pattern tree to an
  91. intermediate representation suitable for feeding the
  92. automaton. This also trims off any optional pattern elements(like
  93. [a], a*).
  94. """
  95. new_node = None
  96. #switch on the node type
  97. if node.type == syms.Matcher:
  98. #skip
  99. node = node.children[0]
  100. if node.type == syms.Alternatives :
  101. #2 cases
  102. if len(node.children) <= 2:
  103. #just a single 'Alternative', skip this node
  104. new_node = reduce_tree(node.children[0], parent)
  105. else:
  106. #real alternatives
  107. new_node = MinNode(type=TYPE_ALTERNATIVES)
  108. #skip odd children('|' tokens)
  109. for child in node.children:
  110. if node.children.index(child)%2:
  111. continue
  112. reduced = reduce_tree(child, new_node)
  113. if reduced is not None:
  114. new_node.children.append(reduced)
  115. elif node.type == syms.Alternative:
  116. if len(node.children) > 1:
  117. new_node = MinNode(type=TYPE_GROUP)
  118. for child in node.children:
  119. reduced = reduce_tree(child, new_node)
  120. if reduced:
  121. new_node.children.append(reduced)
  122. if not new_node.children:
  123. # delete the group if all of the children were reduced to None
  124. new_node = None
  125. else:
  126. new_node = reduce_tree(node.children[0], parent)
  127. elif node.type == syms.Unit:
  128. if (isinstance(node.children[0], pytree.Leaf) and
  129. node.children[0].value == '('):
  130. #skip parentheses
  131. return reduce_tree(node.children[1], parent)
  132. if ((isinstance(node.children[0], pytree.Leaf) and
  133. node.children[0].value == '[')
  134. or
  135. (len(node.children)>1 and
  136. hasattr(node.children[1], "value") and
  137. node.children[1].value == '[')):
  138. #skip whole unit if its optional
  139. return None
  140. leaf = True
  141. details_node = None
  142. alternatives_node = None
  143. has_repeater = False
  144. repeater_node = None
  145. has_variable_name = False
  146. for child in node.children:
  147. if child.type == syms.Details:
  148. leaf = False
  149. details_node = child
  150. elif child.type == syms.Repeater:
  151. has_repeater = True
  152. repeater_node = child
  153. elif child.type == syms.Alternatives:
  154. alternatives_node = child
  155. if hasattr(child, 'value') and child.value == '=': # variable name
  156. has_variable_name = True
  157. #skip variable name
  158. if has_variable_name:
  159. #skip variable name, '='
  160. name_leaf = node.children[2]
  161. if hasattr(name_leaf, 'value') and name_leaf.value == '(':
  162. # skip parenthesis
  163. name_leaf = node.children[3]
  164. else:
  165. name_leaf = node.children[0]
  166. #set node type
  167. if name_leaf.type == token_labels.NAME:
  168. #(python) non-name or wildcard
  169. if name_leaf.value == 'any':
  170. new_node = MinNode(type=TYPE_ANY)
  171. else:
  172. if hasattr(token_labels, name_leaf.value):
  173. new_node = MinNode(type=getattr(token_labels, name_leaf.value))
  174. else:
  175. new_node = MinNode(type=getattr(pysyms, name_leaf.value))
  176. elif name_leaf.type == token_labels.STRING:
  177. #(python) name or character; remove the apostrophes from
  178. #the string value
  179. name = name_leaf.value.strip("'")
  180. if name in tokens:
  181. new_node = MinNode(type=tokens[name])
  182. else:
  183. new_node = MinNode(type=token_labels.NAME, name=name)
  184. elif name_leaf.type == syms.Alternatives:
  185. new_node = reduce_tree(alternatives_node, parent)
  186. #handle repeaters
  187. if has_repeater:
  188. if repeater_node.children[0].value == '*':
  189. #reduce to None
  190. new_node = None
  191. elif repeater_node.children[0].value == '+':
  192. #reduce to a single occurrence i.e. do nothing
  193. pass
  194. else:
  195. #TODO: handle {min, max} repeaters
  196. raise NotImplementedError
  197. pass
  198. #add children
  199. if details_node and new_node is not None:
  200. for child in details_node.children[1:-1]:
  201. #skip '<', '>' markers
  202. reduced = reduce_tree(child, new_node)
  203. if reduced is not None:
  204. new_node.children.append(reduced)
  205. if new_node:
  206. new_node.parent = parent
  207. return new_node
  208. def get_characteristic_subpattern(subpatterns):
  209. """Picks the most characteristic from a list of linear patterns
  210. Current order used is:
  211. names > common_names > common_chars
  212. """
  213. if not isinstance(subpatterns, list):
  214. return subpatterns
  215. if len(subpatterns)==1:
  216. return subpatterns[0]
  217. # first pick out the ones containing variable names
  218. subpatterns_with_names = []
  219. subpatterns_with_common_names = []
  220. common_names = ['in', 'for', 'if' , 'not', 'None']
  221. subpatterns_with_common_chars = []
  222. common_chars = "[]().,:"
  223. for subpattern in subpatterns:
  224. if any(rec_test(subpattern, lambda x: type(x) is str)):
  225. if any(rec_test(subpattern,
  226. lambda x: isinstance(x, str) and x in common_chars)):
  227. subpatterns_with_common_chars.append(subpattern)
  228. elif any(rec_test(subpattern,
  229. lambda x: isinstance(x, str) and x in common_names)):
  230. subpatterns_with_common_names.append(subpattern)
  231. else:
  232. subpatterns_with_names.append(subpattern)
  233. if subpatterns_with_names:
  234. subpatterns = subpatterns_with_names
  235. elif subpatterns_with_common_names:
  236. subpatterns = subpatterns_with_common_names
  237. elif subpatterns_with_common_chars:
  238. subpatterns = subpatterns_with_common_chars
  239. # of the remaining subpatterns pick out the longest one
  240. return max(subpatterns, key=len)
  241. def rec_test(sequence, test_func):
  242. """Tests test_func on all items of sequence and items of included
  243. sub-iterables"""
  244. for x in sequence:
  245. if isinstance(x, (list, tuple)):
  246. for y in rec_test(x, test_func):
  247. yield y
  248. else:
  249. yield test_func(x)