aboutsummaryrefslogtreecommitdiffstats
path: root/stats.py
diff options
context:
space:
mode:
Diffstat (limited to 'stats.py')
-rw-r--r--stats.py29
1 files changed, 29 insertions, 0 deletions
diff --git a/stats.py b/stats.py
new file mode 100644
index 0000000..f6c4401
--- /dev/null
+++ b/stats.py
@@ -0,0 +1,29 @@
1import scipy.stats as s
2import numpy as np
3
4from util import select
5
6
def iqr(vect):
    """Return the inter-quartile range of ``vect`` together with both quartiles.

    Args:
        vect: array-like of numbers; handed unchanged to
            ``scipy.stats.scoreatpercentile``.

    Returns:
        A 3-tuple ``(spread, q25, q75)`` where ``q25`` and ``q75`` are the
        scores at the 25th and 75th percentiles and ``spread`` is
        ``q75 - q25``.
    """
    # Ask for both percentiles in a single call rather than making two
    # separate passes over the same data.
    q25, q75 = s.scoreatpercentile(vect, [25, 75])
    return (q75 - q25, q25, q75)
12
def cutoff_max(vect, percentile=99):
    """Return the score at ``percentile`` of ``vect``.

    Args:
        vect: array-like of numbers.
        percentile: percentile to look up, on the 0-100 scale used by
            ``scipy.stats.scoreatpercentile`` (default 99).

    Returns:
        The result of ``scipy.stats.scoreatpercentile(vect, percentile)``.
    """
    return s.scoreatpercentile(vect, percentile)
15
def iqr_is_not_outlier(table, col=1, extend=1.5):
    """Create a row predicate that accepts rows that are NOT outliers.

    The accepted window is ``[q25 - extend * IQR, q75 + extend * IQR]``,
    computed once from ``table[:, col]`` when this function is called.

    Args:
        table: object supporting ``table[:, col]`` indexing (presumably a
            2-D NumPy array -- confirm against callers).
        col: index of the column to examine (default 1).
        extend: multiple of the inter-quartile range added below the lower
            quartile and above the upper quartile (default 1.5).

    Returns:
        A one-argument function that takes a row and returns True when
        ``row[col]`` lies inside the window (both bounds inclusive), and
        False when the row is an outlier.
    """
    spread, low, high = iqr(table[:, col])
    min_val = low - extend * spread
    max_val = high + extend * spread
    return lambda row: min_val <= row[col] <= max_val
22
def iqr_remove_outliers(table, col=1, extend=1.5):
    """Filter ``table`` down to rows inside the IQR-based window.

    Builds the acceptance predicate with ``iqr_is_not_outlier`` and passes
    it, together with ``table``, to ``util.select``.

    Args:
        table: object supporting ``table[:, col]`` indexing (presumably a
            2-D NumPy array -- confirm against callers).
        col: index of the column to examine (default 1).
        extend: multiple of the inter-quartile range that widens the window
            on each side (default 1.5).

    Returns:
        Whatever ``select(valid, table)`` returns; intended to be a copy of
        ``table`` holding only the in-window rows.
        NOTE(review): copy semantics depend on ``util.select``, which is
        not visible here -- confirm.
    """
    valid = iqr_is_not_outlier(table, col, extend)
    return select(valid, table)
29