counter.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305
  1. # -*- Mode: Python -*-
  2. # GObject-Introspection - a framework for introspecting GObject libraries
  3. # Copyright (C) 2013 Dieter Verfaillie <dieterv@optionexplicit.be>
  4. #
  5. # This program is free software; you can redistribute it and/or
  6. # modify it under the terms of the GNU General Public License
  7. # as published by the Free Software Foundation; either version 2
  8. # of the License, or (at your option) any later version.
  9. #
  10. # This program is distributed in the hope that it will be useful,
  11. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. # GNU General Public License for more details.
  14. #
  15. # You should have received a copy of the GNU General Public License
  16. # along with this program; if not, write to the Free Software
  17. # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  18. # 02110-1301, USA.
  19. #
  20. from __future__ import absolute_import
  21. try:
  22. from collections import Counter
  23. except ImportError:
  24. # collections.Counter for Python 2.6, backported from
  25. # http://hg.python.org/cpython/file/d047928ae3f6/Lib/collections/__init__.py#l402
  26. from operator import itemgetter
  27. from heapq import nlargest
  28. from itertools import repeat, ifilter
  29. class Counter(dict):
  30. '''Dict subclass for counting hashable items. Sometimes called a bag
  31. or multiset. Elements are stored as dictionary keys and their counts
  32. are stored as dictionary values.
  33. >>> c = Counter('abcdeabcdabcaba') # count elements from a string
  34. >>> c.most_common(3) # three most common elements
  35. [('a', 5), ('b', 4), ('c', 3)]
  36. >>> sorted(c) # list all unique elements
  37. ['a', 'b', 'c', 'd', 'e']
  38. >>> ''.join(sorted(c.elements())) # list elements with repetitions
  39. 'aaaaabbbbcccdde'
  40. >>> sum(c.values()) # total of all counts
  41. 15
  42. >>> c['a'] # count of letter 'a'
  43. 5
  44. >>> for elem in 'shazam': # update counts from an iterable
  45. ... c[elem] += 1 # by adding 1 to each element's count
  46. >>> c['a'] # now there are seven 'a'
  47. 7
  48. >>> del c['b'] # remove all 'b'
  49. >>> c['b'] # now there are zero 'b'
  50. 0
  51. >>> d = Counter('simsalabim') # make another counter
  52. >>> c.update(d) # add in the second counter
  53. >>> c['a'] # now there are nine 'a'
  54. 9
  55. >>> c.clear() # empty the counter
  56. >>> c
  57. Counter()
  58. Note: If a count is set to zero or reduced to zero, it will remain
  59. in the counter until the entry is deleted or the counter is cleared:
  60. >>> c = Counter('aaabbc')
  61. >>> c['b'] -= 2 # reduce the count of 'b' by two
  62. >>> c.most_common() # 'b' is still in, but its count is zero
  63. [('a', 3), ('c', 1), ('b', 0)]
  64. '''
  65. # References:
  66. # http://en.wikipedia.org/wiki/Multiset
  67. # http://www.gnu.org/software/smalltalk/manual-base/html_node/Bag.html
  68. # http://www.demo2s.com/Tutorial/Cpp/0380__set-multiset/Catalog0380__set-multiset.htm
  69. # http://code.activestate.com/recipes/259174/
  70. # Knuth, TAOCP Vol. II section 4.6.3
  71. def __init__(self, iterable=None, **kwds):
  72. '''Create a new, empty Counter object. And if given, count elements
  73. from an input iterable. Or, initialize the count from another mapping
  74. of elements to their counts.
  75. >>> c = Counter() # a new, empty counter
  76. >>> c = Counter('gallahad') # a new counter from an iterable
  77. >>> c = Counter({'a': 4, 'b': 2}) # a new counter from a mapping
  78. >>> c = Counter(a=4, b=2) # a new counter from keyword args
  79. '''
  80. self.update(iterable, **kwds)
  81. def __missing__(self, key):
  82. 'The count of elements not in the Counter is zero.'
  83. # Needed so that self[missing_item] does not raise KeyError
  84. return 0
  85. def most_common(self, n=None):
  86. '''List the n most common elements and their counts from the most
  87. common to the least. If n is None, then list all element counts.
  88. >>> Counter('abcdeabcdabcaba').most_common(3)
  89. [('a', 5), ('b', 4), ('c', 3)]
  90. '''
  91. # Emulate Bag.sortedByCount from Smalltalk
  92. if n is None:
  93. return sorted(self.iteritems(), key=itemgetter(1), reverse=True)
  94. return nlargest(n, self.iteritems(), key=itemgetter(1))
  95. def elements(self):
  96. '''Iterator over elements repeating each as many times as its count.
  97. >>> c = Counter('ABCABC')
  98. >>> sorted(c.elements())
  99. ['A', 'A', 'B', 'B', 'C', 'C']
  100. # Knuth's example for prime factors of 1836: 2**2 * 3**3 * 17**1
  101. >>> prime_factors = Counter({2: 2, 3: 3, 17: 1})
  102. >>> product = 1
  103. >>> for factor in prime_factors.elements(): # loop over factors
  104. ... product *= factor # and multiply them
  105. >>> product
  106. 1836
  107. Note, if an element's count has been set to zero or is a negative
  108. number, elements() will ignore it.
  109. '''
  110. # Emulate Bag.do from Smalltalk and Multiset.begin from C++.
  111. for elem, count in self.iteritems():
  112. for _ in repeat(None, count):
  113. yield elem
  114. # Override dict methods where necessary
  115. @classmethod
  116. def fromkeys(cls, iterable, v=None):
  117. # There is no equivalent method for counters because setting v=1
  118. # means that no element can have a count greater than one.
  119. raise NotImplementedError(
  120. 'Counter.fromkeys() is undefined. Use Counter(iterable) instead.')
  121. def update(self, iterable=None, **kwds):
  122. '''Like dict.update() but add counts instead of replacing them.
  123. Source can be an iterable, a dictionary, or another Counter instance.
  124. >>> c = Counter('which')
  125. >>> c.update('witch') # add elements from another iterable
  126. >>> d = Counter('watch')
  127. >>> c.update(d) # add elements from another counter
  128. >>> c['h'] # four 'h' in which, witch, and watch
  129. 4
  130. '''
  131. # The regular dict.update() operation makes no sense here because the
  132. # replace behavior results in the some of original untouched counts
  133. # being mixed-in with all of the other counts for a mismash that
  134. # doesn't have a straight-forward interpretation in most counting
  135. # contexts. Instead, we implement straight-addition. Both the inputs
  136. # and outputs are allowed to contain zero and negative counts.
  137. if iterable is not None:
  138. if hasattr(iterable, 'iteritems'):
  139. if self:
  140. self_get = self.get
  141. for elem, count in iterable.iteritems():
  142. self[elem] = self_get(elem, 0) + count
  143. else:
  144. dict.update(self, iterable) # fast path when counter is empty
  145. else:
  146. self_get = self.get
  147. for elem in iterable:
  148. self[elem] = self_get(elem, 0) + 1
  149. if kwds:
  150. self.update(kwds)
  151. def subtract(self, iterable=None, **kwds):
  152. '''Like dict.update() but subtracts counts instead of replacing them.
  153. Counts can be reduced below zero. Both the inputs and outputs are
  154. allowed to contain zero and negative counts.
  155. Source can be an iterable, a dictionary, or another Counter instance.
  156. >>> c = Counter('which')
  157. >>> c.subtract('witch') # subtract elements from another iterable
  158. >>> c.subtract(Counter('watch')) # subtract elements from another counter
  159. >>> c['h'] # 2 in which, minus 1 in witch, minus 1 in watch
  160. 0
  161. >>> c['w'] # 1 in which, minus 1 in witch, minus 1 in watch
  162. -1
  163. '''
  164. if hasattr(iterable, 'iteritems'):
  165. for elem, count in iterable.iteritems():
  166. self[elem] -= count
  167. else:
  168. for elem in iterable:
  169. self[elem] -= 1
  170. def copy(self):
  171. 'Return a shallow copy.'
  172. return self.__class__(self)
  173. def __reduce__(self):
  174. return self.__class__, (dict(self), )
  175. def __delitem__(self, elem):
  176. 'Like dict.__delitem__() but does not raise KeyError for missing values.'
  177. if elem in self:
  178. dict.__delitem__(self, elem)
  179. def __repr__(self):
  180. if not self:
  181. return '%s()' % self.__class__.__name__
  182. items = ', '.join(map('%r: %r'.__mod__, self.most_common()))
  183. return '%s({%s})' % (self.__class__.__name__, items)
  184. # Multiset-style mathematical operations discussed in:
  185. # Knuth TAOCP Volume II section 4.6.3 exercise 19
  186. # and at http://en.wikipedia.org/wiki/Multiset
  187. #
  188. # Outputs guaranteed to only include positive counts.
  189. #
  190. # To strip negative and zero counts, add-in an empty counter:
  191. # c += Counter()
  192. def __add__(self, other):
  193. '''Add counts from two counters.
  194. >>> Counter('abbb') + Counter('bcc')
  195. Counter({'b': 4, 'c': 2, 'a': 1})
  196. '''
  197. if not isinstance(other, Counter):
  198. return NotImplemented
  199. result = Counter()
  200. for elem in set(self) | set(other):
  201. newcount = self[elem] + other[elem]
  202. if newcount > 0:
  203. result[elem] = newcount
  204. return result
  205. def __sub__(self, other):
  206. ''' Subtract count, but keep only results with positive counts.
  207. >>> Counter('abbbc') - Counter('bccd')
  208. Counter({'b': 2, 'a': 1})
  209. '''
  210. if not isinstance(other, Counter):
  211. return NotImplemented
  212. result = Counter()
  213. for elem in set(self) | set(other):
  214. newcount = self[elem] - other[elem]
  215. if newcount > 0:
  216. result[elem] = newcount
  217. return result
  218. def __or__(self, other):
  219. '''Union is the maximum of value in either of the input counters.
  220. >>> Counter('abbb') | Counter('bcc')
  221. Counter({'b': 3, 'c': 2, 'a': 1})
  222. '''
  223. if not isinstance(other, Counter):
  224. return NotImplemented
  225. _max = max
  226. result = Counter()
  227. for elem in set(self) | set(other):
  228. newcount = _max(self[elem], other[elem])
  229. if newcount > 0:
  230. result[elem] = newcount
  231. return result
  232. def __and__(self, other):
  233. ''' Intersection is the minimum of corresponding counts.
  234. >>> Counter('abbb') & Counter('bcc')
  235. Counter({'b': 1})
  236. '''
  237. if not isinstance(other, Counter):
  238. return NotImplemented
  239. _min = min
  240. result = Counter()
  241. if len(self) < len(other):
  242. self, other = other, self
  243. for elem in ifilter(self.__contains__, other):
  244. newcount = _min(self[elem], other[elem])
  245. if newcount > 0:
  246. result[elem] = newcount
  247. return result
  248. if __name__ == '__main__':
  249. import doctest
  250. print(doctest.testmod())