diff options
| author | Björn B. Brandenburg <bbb@cs.unc.edu> | 2010-03-26 14:55:28 -0400 |
|---|---|---|
| committer | Björn B. Brandenburg <bbb@cs.unc.edu> | 2010-03-26 14:55:28 -0400 |
| commit | da4b7d22647b48ca1c0420540ee7c3eada2281cc (patch) | |
| tree | 1814088c72ad5ba9b5a378c22de8880598f4d14c | |
| parent | 40333dd77fecd8a2dec2e982a89678e0849bf3be (diff) | |
Add numpy/scipy based statistics and helpers.
| -rw-r--r-- | stats.py | 29 | ||||
| -rw-r--r-- | util.py | 11 |
2 files changed, 40 insertions, 0 deletions
diff --git a/stats.py b/stats.py new file mode 100644 index 0000000..f6c4401 --- /dev/null +++ b/stats.py | |||
| @@ -0,0 +1,29 @@ | |||
| 1 | import scipy.stats as s | ||
| 2 | import numpy as np | ||
| 3 | |||
| 4 | from util import select | ||
| 5 | |||
| 6 | |||
def iqr(vect):
    """Compute the inter-quartile range of a vector.

    Returns a 3-tuple ``(spread, q25, q75)`` where ``q25`` and ``q75`` are
    the scores at the 25th and 75th percentile of `vect` and ``spread`` is
    their difference (``q75 - q25``).
    """
    lower_q, upper_q = [s.scoreatpercentile(vect, pct) for pct in (25, 75)]
    spread = upper_q - lower_q
    return (spread, lower_q, upper_q)
| 12 | |||
def cutoff_max(vect, percentile=99):
    """Return the score of `vect` at `percentile` (99th by default).

    The result can serve as an upper cutoff for the values in `vect`.
    """
    threshold = s.scoreatpercentile(vect, percentile)
    return threshold
| 15 | |||
def iqr_is_not_outlier(table, col=1, extend=1.5):
    """Create a filter function that accepts rows that are NOT outliers.

    The quartiles are computed over ``table[:, col]``.  The returned
    predicate takes a single row and is true iff ``row[col]`` lies within
    ``[q25 - extend * iqr, q75 + extend * iqr]`` (bounds inclusive).
    """
    spread, q_low, q_high = iqr(table[:, col])
    lower_bound = q_low - extend * spread
    upper_bound = q_high + extend * spread

    def within_bounds(row):
        return lower_bound <= row[col] <= upper_bound

    return within_bounds
| 22 | |||
def iqr_remove_outliers(table, col=1, extend=1.5):
    """Return only the rows of `table` inside the IQR-based window.

    The window is built by `iqr_is_not_outlier` from column `col` with the
    given `extend` factor; rows whose value in that column falls outside
    the window are dropped by `select`.
    """
    return select(iqr_is_not_outlier(table, col, extend), table)
| 29 | |||
| @@ -0,0 +1,11 @@ | |||
| 1 | import numpy as np | ||
| 2 | |||
def load_csv_file(fname, *args, **kargs):
    """Load a comma-separated file into a NumPy array.

    `fname` is opened for reading and parsed with ``np.genfromtxt`` using
    ``,`` as the delimiter and ``#`` as the comment marker.  Any additional
    positional and keyword arguments are forwarded to ``np.genfromtxt``.

    Returns whatever ``np.genfromtxt`` produces for the file.
    """
    # genfromtxt consumes the whole file before returning, so the handle
    # can be closed right away instead of being leaked to the GC.
    with open(fname) as csv_file:
        return np.genfromtxt(csv_file, delimiter=",", comments="#",
                             *args, **kargs)
| 6 | |||
def select(keep, rows):
    """Return the rows for which the predicate `keep` is true.

    `keep` is called once per row, in order.  `rows` may be a NumPy array
    or any sequence that NumPy can convert to one (e.g. a list of lists);
    array subclasses are passed through unchanged.

    Returns a new array holding only the accepted rows.
    """
    # Boolean-mask indexing needs an ndarray; asanyarray is a no-op for
    # arrays (and their subclasses) and converts plain sequences.
    rows = np.asanyarray(rows)
    ok_rows = np.fromiter((bool(keep(row)) for row in rows),
                          dtype=bool, count=len(rows))
    return rows[ok_rows]
