# -*- coding: utf-8 -*-
# Origin: stats.py as created by commit
#   da4b7d22647b48ca1c0420540ee7c3eada2281cc (patch export, cgit v1.2.2)
#   Author:  Björn B. Brandenburg
#   Date:    Fri, 26 Mar 2010 14:55:28 -0400
#   Subject: Add numpy/scipy based statistics and helpers.
"""Numpy/scipy based statistics and helpers."""

import scipy.stats as s
import numpy as np

from util import select


def iqr(vect):
    """Return the inter-quartile range of a vector.

    The result is a tuple ``(spread, q25, q75)`` where ``q25`` and ``q75``
    are the scores at the 25th and 75th percentile of ``vect`` and
    ``spread`` is ``q75 - q25``.
    """
    q25 = s.scoreatpercentile(vect, 25)
    q75 = s.scoreatpercentile(vect, 75)
    return (q75 - q25, q25, q75)


def cutoff_max(vect, percentile=99):
    """Return the score at ``percentile`` (default: 99) of ``vect``."""
    return s.scoreatpercentile(vect, percentile)


def iqr_is_not_outlier(table, col=1, extend=1.5):
    """Create a filter function that accepts rows that are NOT outliers.

    The acceptance window is computed once from column ``col`` of ``table``
    as ``[q25 - extend * iqr, q75 + extend * iqr]`` (both bounds inclusive).

    The returned function takes a single row and returns True if
    ``row[col]`` lies within that window, i.e., if the row should be kept.
    """
    # asanyarray passes ndarrays (and subclasses) through unchanged, but
    # lets plain nested sequences support the [:, col] column slice as well.
    column = np.asanyarray(table)[:, col]
    (spread, low, high) = iqr(column)
    min_val = low - extend * spread
    max_val = high + extend * spread
    return lambda row: min_val <= row[col] <= max_val


def iqr_remove_outliers(table, col=1, extend=1.5):
    """Return a copy that only includes rows that
    fall within the IQR-based window.

    The window is derived from column ``col`` of ``table`` (see
    iqr_is_not_outlier); the actual row filtering is delegated to
    util.select, which receives the predicate and the table.
    """
    valid = iqr_is_not_outlier(table, col, extend)
    return select(valid, table)