diff options
-rw-r--r-- | stats.py | 29 | ||||
-rw-r--r-- | util.py | 11 |
2 files changed, 40 insertions, 0 deletions
diff --git a/stats.py b/stats.py new file mode 100644 index 0000000..f6c4401 --- /dev/null +++ b/stats.py | |||
@@ -0,0 +1,29 @@ | |||
1 | import scipy.stats as s | ||
2 | import numpy as np | ||
3 | |||
4 | from util import select | ||
5 | |||
6 | |||
def iqr(vect):
    """Compute the inter-quartile range of ``vect``.

    Returns a 3-tuple ``(spread, q25, q75)`` where ``q25`` and ``q75`` are
    the scores at the 25th and 75th percentiles and ``spread`` is their
    difference (``q75 - q25``).
    """
    lower, upper = (s.scoreatpercentile(vect, pct) for pct in (25, 75))
    return (upper - lower, lower, upper)
12 | |||
def cutoff_max(vect, percentile=99):
    """Return the score of ``vect`` at ``percentile`` (99 by default)."""
    threshold = s.scoreatpercentile(vect, per=percentile)
    return threshold
15 | |||
def iqr_is_not_outlier(table, col=1, extend=1.5):
    """Build a row predicate that accepts values inside the IQR window.

    The window is derived from column ``col`` of ``table`` (indexed as
    ``table[:, col]``): it spans from ``q25 - extend * spread`` up to
    ``q75 + extend * spread``, both ends inclusive, where ``spread`` is the
    inter-quartile range reported by ``iqr``.

    The returned callable takes a single row and returns True when
    ``row[col]`` lies inside the window, i.e. when the row is NOT an
    outlier.
    """
    column_values = table[:, col]
    spread, q_low, q_high = iqr(column_values)
    margin = extend * spread
    lower_bound = q_low - margin
    upper_bound = q_high + margin

    def within_window(row):
        return lower_bound <= row[col] <= upper_bound

    return within_window
22 | |||
def iqr_remove_outliers(table, col=1, extend=1.5):
    """Filter ``table`` down to the rows accepted by the IQR-based window.

    The window is computed from column ``col`` by ``iqr_is_not_outlier``
    using the given ``extend`` factor; the rows for which that predicate
    holds are returned via ``select``.
    """
    return select(iqr_is_not_outlier(table, col, extend), table)
29 | |||
@@ -0,0 +1,11 @@ | |||
1 | import numpy as np | ||
2 | |||
def load_csv_file(fname, *args, **kargs):
    """Load a comma-separated text file into a NumPy array.

    The file is parsed with ``np.genfromtxt`` using ``,`` as the delimiter
    and ``#`` as the comment marker, unless the caller overrides either one
    by keyword.

    Args:
        fname: Path of the file to read; passed to ``open``.
        *args: Extra positional arguments forwarded to ``np.genfromtxt``
            after the file object.
        **kargs: Extra keyword arguments forwarded to ``np.genfromtxt``.

    Returns:
        Whatever ``np.genfromtxt`` produces for the file contents.
    """
    # Apply the CSV defaults without clobbering explicit caller overrides.
    kargs.setdefault("delimiter", ",")
    kargs.setdefault("comments", "#")
    # Use a context manager so the handle is closed even if parsing raises.
    with open(fname) as handle:
        return np.genfromtxt(handle, *args, **kargs)
6 | |||
def select(keep, rows):
    """Return the entries of ``rows`` for which ``keep(row)`` is true.

    ``keep`` is called once per row, in order, and the outcomes are
    collected into a boolean mask that is then used to index ``rows``.
    """
    mask = np.fromiter((keep(row) for row in rows), dtype=bool)
    return rows[mask]