diff options
author | Björn B. Brandenburg <bbb@cs.unc.edu> | 2010-03-26 14:55:28 -0400 |
---|---|---|
committer | Björn B. Brandenburg <bbb@cs.unc.edu> | 2010-03-26 14:55:28 -0400 |
commit | da4b7d22647b48ca1c0420540ee7c3eada2281cc (patch) | |
tree | 1814088c72ad5ba9b5a378c22de8880598f4d14c /stats.py | |
parent | 40333dd77fecd8a2dec2e982a89678e0849bf3be (diff) |
Add numpy/scipy based statistics and helpers.
Diffstat (limited to 'stats.py')
-rw-r--r-- | stats.py | 29 |
1 files changed, 29 insertions, 0 deletions
diff --git a/stats.py b/stats.py new file mode 100644 index 0000000..f6c4401 --- /dev/null +++ b/stats.py | |||
@@ -0,0 +1,29 @@ | |||
1 | import scipy.stats as s | ||
2 | import numpy as np | ||
3 | |||
4 | from util import select | ||
5 | |||
6 | |||
def iqr(vect):
    """Return the inter-quartile range of a vector.

    The result is a tuple ``(spread, lower, upper)``: ``lower`` and
    ``upper`` are the scores of ``vect`` at the 25th and 75th percentile
    (as computed by ``scipy.stats.scoreatpercentile``) and ``spread`` is
    ``upper - lower``.
    """
    lower, upper = [s.scoreatpercentile(vect, pct) for pct in (25, 75)]
    return (upper - lower, lower, upper)
12 | |||
def cutoff_max(vect, percentile=99):
    """Return the score of ``vect`` at the given percentile.

    ``percentile`` is handed unchanged to
    ``scipy.stats.scoreatpercentile`` and defaults to 99.
    """
    threshold = s.scoreatpercentile(vect, percentile)
    return threshold
15 | |||
def iqr_is_not_outlier(table, col=1, extend=1.5):
    """Create a row predicate that accepts rows inside the IQR-based window.

    The window is computed once, from column ``col`` of ``table``, as
    ``[q25 - extend * spread, q75 + extend * spread]`` where
    ``(spread, q25, q75)`` is the result of ``iqr`` on that column.

    The returned function takes a single row and returns True when
    ``row[col]`` lies within the window (bounds included), i.e. when the
    row is *not* an outlier, and False otherwise.

    ``table`` may be any 2-D array-like (for example a list of rows); it
    is viewed as a numpy array so that the column can be sliced out.
    """
    # np.asarray does not copy an existing ndarray, but lets plain
    # sequences of rows support the [:, col] column slice as well.
    column = np.asarray(table)[:, col]
    (spread, low, high) = iqr(column)
    min_val = low - extend * spread
    max_val = high + extend * spread
    return lambda row: min_val <= row[col] <= max_val
22 | |||
def iqr_remove_outliers(table, col=1, extend=1.5):
    """Keep only the rows of ``table`` that fall within the IQR-based window.

    The row predicate is built by
    ``iqr_is_not_outlier(table, col, extend)`` and passed, together with
    ``table``, to ``select``; whatever ``select`` produces is returned.
    NOTE(review): the original docstring describes the result as a copy;
    that depends on ``util.select`` -- confirm there.
    """
    keep_row = iqr_is_not_outlier(table, col, extend)
    filtered = select(keep_row, table)
    return filtered
29 | |||