123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637 |
- """
- Basic statistics module.
- This module provides functions for calculating statistics of data, including
- averages, variance, and standard deviation.
- Calculating averages
- --------------------
- ================== =============================================
- Function Description
- ================== =============================================
- mean Arithmetic mean (average) of data.
- median Median (middle value) of data.
- median_low Low median of data.
- median_high High median of data.
- median_grouped Median, or 50th percentile, of grouped data.
- mode Mode (most common value) of data.
- ================== =============================================
- Calculate the arithmetic mean ("the average") of data:
- >>> mean([-1.0, 2.5, 3.25, 5.75])
- 2.625
- Calculate the standard median of discrete data:
- >>> median([2, 3, 4, 5])
- 3.5
- Calculate the median, or 50th percentile, of data grouped into class intervals
- centred on the data values provided. E.g. if your data points are rounded to
- the nearest whole number:
- >>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
- 2.8333333333...
- This should be interpreted in this way: you have two data points in the class
- interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
- the class interval 3.5-4.5. The median of these data points is 2.8333...
- Calculating variability or spread
- ---------------------------------
- ================== =============================================
- Function Description
- ================== =============================================
- pvariance Population variance of data.
- variance Sample variance of data.
- pstdev Population standard deviation of data.
- stdev Sample standard deviation of data.
- ================== =============================================
- Calculate the standard deviation of sample data:
- >>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
- 4.38961843444...
- If you have previously calculated the mean, you can pass it as the optional
- second argument to the four "spread" functions to avoid recalculating it:
- >>> data = [1, 2, 2, 4, 4, 4, 5, 6]
- >>> mu = mean(data)
- >>> pvariance(data, mu)
- 2.5
- Exceptions
- ----------
- A single exception is defined: StatisticsError is a subclass of ValueError.
- """
# Names exported by ``from <module> import *``: the exception class, the
# four spread functions, the four medians, and mean/mode.  Each name listed
# here is defined further down in this file.
__all__ = [ 'StatisticsError',
            'pstdev', 'pvariance', 'stdev', 'variance',
            'median', 'median_low', 'median_high', 'median_grouped',
            'mean', 'mode',
          ]
- import collections
- import math
- from fractions import Fraction
- from decimal import Decimal
- from itertools import groupby
class StatisticsError(ValueError):
    """Error raised by the functions in this module for unusable data.

    Examples in this file include empty input, too few data points for a
    variance, and data without a unique mode.  Being a ValueError subclass,
    it is also caught by ``except ValueError``.
    """
def _sum(data, start=0):
    """_sum(data [, start]) -> (type, sum, count)

    Add up numeric ``data`` exactly and return a 3-tuple holding the type
    the result should eventually be converted to, the total as a Fraction,
    and the number of items consumed from ``data``.

    ``start`` (default 0) is included in the total but not in the count.
    With empty ``data`` the total is therefore just ``start`` and the count
    is 0.  If any term is a NAN or INF, the total is that non-finite value
    rather than a Fraction.

    Examples
    --------

    >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
    (<class 'float'>, Fraction(11, 1), 5)

    Some sources of round-off error will be avoided:

    >>> _sum([1e50, 1, -1e50] * 1000)  # Built-in sum returns zero.
    (<class 'float'>, Fraction(1000, 1), 3000)

    Fractions and Decimals are also supported:

    >>> from fractions import Fraction as F
    >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
    (<class 'fractions.Fraction'>, Fraction(63, 20), 4)

    >>> from decimal import Decimal as D
    >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
    >>> _sum(data)
    (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)

    How differing item types combine is decided by ``_coerce``, which
    raises TypeError for combinations it does not accept.
    """
    count = 0
    numerator, denominator = _exact_ratio(start)
    # One running integer numerator per distinct denominator.  The key None
    # collects the non-finite terms, which have no denominator.
    by_denominator = {denominator: numerator}
    result_type = _coerce(int, type(start))
    for kind, run in groupby(data, type):
        # Consecutive items of one type share a single coercion step.
        result_type = _coerce(result_type, kind)
        for numerator, denominator in map(_exact_ratio, run):
            count += 1
            by_denominator[denominator] = (
                by_denominator.get(denominator, 0) + numerator
            )
    if None not in by_denominator:
        # Everything was finite: combine the per-denominator partial sums.
        total = sum(
            Fraction(num, den) for den, num in sorted(by_denominator.items())
        )
    else:
        # A NAN or INF was seen; it overrides all the finite partial sums.
        total = by_denominator[None]
        assert not _isfinite(total)
    return (result_type, total, count)
- def _isfinite(x):
- try:
- return x.is_finite()
- except AttributeError:
- return math.isfinite(x)
- def _coerce(T, S):
- """Coerce types T and S to a common type, or raise TypeError.
- Coercion rules are currently an implementation detail. See the CoerceTest
- test class in test_statistics for details.
- """
-
- assert T is not bool, "initial type T is bool"
-
-
-
- if T is S: return T
-
- if S is int or S is bool: return T
- if T is int: return S
-
- if issubclass(S, T): return S
- if issubclass(T, S): return T
-
- if issubclass(T, int): return S
- if issubclass(S, int): return T
-
- if issubclass(T, Fraction) and issubclass(S, float):
- return S
- if issubclass(T, float) and issubclass(S, Fraction):
- return T
-
- msg = "don't know how to coerce %s and %s"
- raise TypeError(msg % (T.__name__, S.__name__))
- def _exact_ratio(x):
- """Return Real number x to exact (numerator, denominator) pair.
- >>> _exact_ratio(0.25)
- (1, 4)
- x is expected to be an int, Fraction, Decimal or float.
- """
- try:
-
-
-
- if type(x) is float:
- return x.as_integer_ratio()
- try:
-
- return (x.numerator, x.denominator)
- except AttributeError:
- try:
-
- return x.as_integer_ratio()
- except AttributeError:
- try:
-
- return _decimal_to_ratio(x)
- except AttributeError:
-
- pass
- except (OverflowError, ValueError):
-
- assert not math.isfinite(x)
- return (x, None)
- msg = "can't convert type '{}' to numerator/denominator"
- raise TypeError(msg.format(type(x).__name__))
- def _decimal_to_ratio(d):
- """Convert Decimal d to exact integer ratio (numerator, denominator).
- >>> from decimal import Decimal
- >>> _decimal_to_ratio(Decimal("2.6"))
- (26, 10)
- """
- sign, digits, exp = d.as_tuple()
- if exp in ('F', 'n', 'N'):
- assert not d.is_finite()
- return (d, None)
- num = 0
- for digit in digits:
- num = num*10 + digit
- if exp < 0:
- den = 10**-exp
- else:
- num *= 10**exp
- den = 1
- if sign:
- num = -num
- return (num, den)
- def _convert(value, T):
- """Convert value to given numeric type T."""
- if type(value) is T:
-
-
- return value
- if issubclass(T, int) and value.denominator != 1:
- T = float
- try:
-
- return T(value)
- except TypeError:
- if issubclass(T, Decimal):
- return T(value.numerator)/T(value.denominator)
- else:
- raise
- def _counts(data):
-
- table = collections.Counter(iter(data)).most_common()
- if not table:
- return table
-
- maxfreq = table[0][1]
- for i in range(1, len(table)):
- if table[i][1] != maxfreq:
- table = table[:i]
- break
- return table
def mean(data):
    """Return the sample arithmetic mean of data.

    >>> mean([1, 2, 3, 4, 4])
    2.8

    >>> from fractions import Fraction as F
    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
    Fraction(13, 21)

    >>> from decimal import Decimal as D
    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
    Decimal('0.5625')

    Raises StatisticsError when ``data`` is empty.
    """
    # An iterator has no len(); materialise it first.
    values = list(data) if iter(data) is data else data
    size = len(values)
    if size < 1:
        raise StatisticsError('mean requires at least one data point')
    kind, total, seen = _sum(values)
    assert seen == size
    return _convert(total / size, kind)
def median(data):
    """Return the median (middle value) of numeric data.

    With an odd number of data points the middle one is returned.  With an
    even number, the result is the average of the two middle values:

    >>> median([1, 3, 5])
    3
    >>> median([1, 3, 5, 7])
    4.0

    Raises StatisticsError when ``data`` is empty.
    """
    ordered = sorted(data)
    size = len(ordered)
    if not size:
        raise StatisticsError("no median for empty data")
    half, is_odd = divmod(size, 2)
    if is_odd:
        return ordered[half]
    return (ordered[half - 1] + ordered[half]) / 2
def median_low(data):
    """Return the low median of numeric data.

    The result is always one of the data points: the middle value for an
    odd number of points, the smaller of the two middle values otherwise.

    >>> median_low([1, 3, 5])
    3
    >>> median_low([1, 3, 5, 7])
    3

    Raises StatisticsError when ``data`` is empty.
    """
    ordered = sorted(data)
    size = len(ordered)
    if size == 0:
        raise StatisticsError("no median for empty data")
    # (size - 1) // 2 is the middle index for odd sizes and the lower of
    # the two middle indexes for even sizes.
    return ordered[(size - 1) // 2]
def median_high(data):
    """Return the high median of data.

    The result is always one of the data points: the middle value for an
    odd number of points, the larger of the two middle values otherwise.

    >>> median_high([1, 3, 5])
    3
    >>> median_high([1, 3, 5, 7])
    5

    Raises StatisticsError when ``data`` is empty.
    """
    ordered = sorted(data)
    if not ordered:
        raise StatisticsError("no median for empty data")
    # Integer division lands on the middle (odd) or upper-middle (even) item.
    return ordered[len(ordered) // 2]
def median_grouped(data, interval=1):
    """Return the 50th percentile (median) of grouped continuous data.

    >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
    3.7
    >>> median_grouped([52, 52, 53, 54])
    52.5

    Each data value is treated as the midpoint of a class that is
    ``interval`` wide, so with the default interval of 1 the values
    1, 2, 3, ... stand for the classes 0.5-1.5, 1.5-2.5, 2.5-3.5, and so
    on.  In the first example the middle value falls in class 3.5-4.5 and
    the result is interpolated within that class.

    A different class width changes the interpolated result:

    >>> median_grouped([1, 3, 3, 5, 7], interval=1)
    3.25
    >>> median_grouped([1, 3, 3, 5, 7], interval=2)
    3.5

    No check is made that the data points are at least ``interval`` apart.

    Raises StatisticsError when ``data`` is empty.  Raises TypeError when
    the middle value or ``interval`` is a str or bytes object.  A single
    data point is returned as is.
    """
    ordered = sorted(data)
    size = len(ordered)
    if size == 0:
        raise StatisticsError("no median for empty data")
    if size == 1:
        return ordered[0]
    # The class containing the median is the one holding the middle
    # (upper-middle for even sizes) data point.
    middle = ordered[size // 2]
    for candidate in (middle, interval):
        if isinstance(candidate, (str, bytes)):
            raise TypeError('expected number but got %r' % candidate)
    try:
        lower_limit = middle - interval / 2
    except TypeError:
        # The two types do not mix arithmetically; fall back to floats.
        lower_limit = float(middle) - float(interval) / 2
    # Number of data points below the median class.
    below = ordered.index(middle)
    # Number of data points inside the median class.
    frequency = ordered.count(middle)
    return lower_limit + interval * (size / 2 - below) / frequency
def mode(data):
    """Return the most common data point from discrete or nominal data.

    ``mode`` assumes discrete data and returns a single value, which is
    the usual school-book treatment of the mode:

    >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
    3

    Nominal (non-numeric) data works too:

    >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
    'red'

    Raises StatisticsError when ``data`` is empty or when several values
    tie for most common.
    """
    # _counts() returns only the (value, count) pairs tied for first place.
    leaders = _counts(data)
    if not leaders:
        raise StatisticsError('no mode for empty data')
    if len(leaders) > 1:
        raise StatisticsError(
            'no unique mode; found %d equally common values' % len(leaders)
        )
    return leaders[0][0]
def _ss(data, c=None):
    """Return the sum of square deviations of sequence data.

    The result is a ``(type, total)`` pair, where ``type`` is the type
    reported by ``_sum`` for the squared deviations.

    With ``c`` left as None, the mean of ``data`` is computed in a first
    pass and the deviations from it in a second.  Otherwise deviations are
    measured from ``c`` as given; use that with care, since a ``c`` that is
    not the mean can lead to garbage results.

    ``data`` is traversed more than once and passed to ``len()``, so it
    must be a sequence rather than a one-shot iterator.
    """
    centre = mean(data) if c is None else c
    sq_type, sq_total, sq_count = _sum((value - centre) ** 2 for value in data)
    # The plain deviations sum to zero when centre is exactly the mean.
    # Whatever remains is used to correct the squared total.
    dev_type, dev_total, dev_count = _sum(value - centre for value in data)
    assert sq_type == dev_type and sq_count == dev_count
    corrected = sq_total - dev_total ** 2 / len(data)
    assert not corrected < 0, 'negative sum of square deviations: %f' % corrected
    return (sq_type, corrected)
def variance(data, xbar=None):
    """Return the sample variance of data.

    ``data`` should be an iterable of Real-valued numbers with at least
    two values.  ``xbar``, if given, should be the mean of the data; when
    it is omitted or None the mean is calculated automatically.

    Use this function when the data is a sample from a population.  For
    the variance of an entire population, see ``pvariance``.

    Examples:

    >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
    >>> variance(data)
    1.3720238095238095

    A mean calculated earlier can be passed as ``xbar`` to avoid
    recalculating it:

    >>> m = mean(data)
    >>> variance(data, m)
    1.3720238095238095

    No check is made that ``xbar`` really is the mean of ``data``;
    arbitrary values may lead to invalid or impossible results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('31.01875')

    >>> from fractions import Fraction as F
    >>> variance([F(1, 6), F(1, 2), F(5, 3)])
    Fraction(67, 108)

    Raises StatisticsError when ``data`` has fewer than two values.
    """
    # An iterator has no len() and can be read only once; materialise it.
    values = list(data) if iter(data) is data else data
    size = len(values)
    if size < 2:
        raise StatisticsError('variance requires at least two data points')
    kind, squares = _ss(values, xbar)
    # Sample variance divides by n - 1.
    return _convert(squares / (size - 1), kind)
def pvariance(data, mu=None):
    """Return the population variance of ``data``.

    ``data`` should be an iterable of Real-valued numbers with at least
    one value.  ``mu``, if given, should be the mean of the data; when it
    is omitted or None the mean is calculated automatically.

    Use this function for the variance of an entire population.  To
    estimate the variance from a sample, ``variance`` is usually the
    better choice.

    Examples:

    >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
    >>> pvariance(data)
    1.25

    A mean calculated earlier can be passed as the second argument to
    avoid recalculating it:

    >>> mu = mean(data)
    >>> pvariance(data, mu)
    1.25

    No check is made that ``mu`` really is the mean of ``data``;
    arbitrary values may lead to invalid or impossible results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('24.815')

    >>> from fractions import Fraction as F
    >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
    Fraction(13, 72)

    Raises StatisticsError when ``data`` is empty.
    """
    # An iterator has no len() and can be read only once; materialise it.
    values = list(data) if iter(data) is data else data
    size = len(values)
    if size < 1:
        raise StatisticsError('pvariance requires at least one data point')
    kind, squares = _ss(values, mu)
    # Population variance divides by n.
    return _convert(squares / size, kind)
def stdev(data, xbar=None):
    """Return the square root of the sample variance.

    Arguments and other details are as for ``variance``.

    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    1.0810874155219827
    """
    spread = variance(data, xbar)
    try:
        # Use the value's own sqrt() method when it has one.
        root = spread.sqrt()
    except AttributeError:
        root = math.sqrt(spread)
    return root
def pstdev(data, mu=None):
    """Return the square root of the population variance.

    Arguments and other details are as for ``pvariance``.

    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    0.986893273527251
    """
    spread = pvariance(data, mu)
    try:
        # Use the value's own sqrt() method when it has one.
        root = spread.sqrt()
    except AttributeError:
        root = math.sqrt(spread)
    return root
|