aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Björn B. Brandenburg <bbb@cs.unc.edu> 2010-03-26 14:55:28 -0400
committer Björn B. Brandenburg <bbb@cs.unc.edu> 2010-03-26 14:55:28 -0400
commit da4b7d22647b48ca1c0420540ee7c3eada2281cc (patch)
tree 1814088c72ad5ba9b5a378c22de8880598f4d14c
parent 40333dd77fecd8a2dec2e982a89678e0849bf3be (diff)
Add numpy/scipy based statistics and helpers.
-rw-r--r-- stats.py 29
-rw-r--r-- util.py 11
2 files changed, 40 insertions, 0 deletions
diff --git a/stats.py b/stats.py
new file mode 100644
index 0000000..f6c4401
--- /dev/null
+++ b/stats.py
@@ -0,0 +1,29 @@
1import scipy.stats as s
2import numpy as np
3
4from util import select
5
6
def iqr(vect):
    """Compute the inter-quartile range of a vector.

    Returns a 3-tuple ``(spread, q25, q75)`` where ``q25`` and ``q75`` are
    the scores at the 25th and 75th percentile of ``vect`` and ``spread``
    is their difference ``q75 - q25``.
    """
    lower, upper = (s.scoreatpercentile(vect, pct) for pct in (25, 75))
    return (upper - lower, lower, upper)
12
def cutoff_max(vect, percentile=99):
    """Return the score of ``vect`` at ``percentile`` (the 99th by default)."""
    threshold = s.scoreatpercentile(vect, percentile)
    return threshold
15
def iqr_is_not_outlier(table, col=1, extend=1.5):
    """Build a row predicate that accepts rows which are not outliers.

    The quartiles are computed once over ``table[:, col]``. The returned
    function takes a single row and is true when ``row[col]`` lies within
    ``[q25 - extend * spread, q75 + extend * spread]`` (bounds inclusive),
    where ``spread`` is the inter-quartile range of that column.
    """
    spread, q25, q75 = iqr(table[:, col])
    lower_bound = q25 - extend * spread
    upper_bound = q75 + extend * spread

    def within_window(row):
        # Read the cell once, then test both bounds (inclusive).
        value = row[col]
        return lower_bound <= value and value <= upper_bound

    return within_window
22
def iqr_remove_outliers(table, col=1, extend=1.5):
    """Filter ``table`` down to the rows inside the IQR-based window.

    A row is kept when its value in column ``col`` passes the predicate
    built by ``iqr_is_not_outlier(table, col, extend)``; the surviving
    rows are returned via ``select``.
    """
    return select(iqr_is_not_outlier(table, col, extend), table)
29
diff --git a/util.py b/util.py
new file mode 100644
index 0000000..fdedaa4
--- /dev/null
+++ b/util.py
@@ -0,0 +1,11 @@
1import numpy as np
2
def load_csv_file(fname, *args, **kargs):
    """Load a comma-separated file into a numpy array.

    The file is parsed with ``np.genfromtxt`` using ``","`` as the delimiter
    and ``"#"`` as the comment marker. Any extra positional and keyword
    arguments are forwarded to ``np.genfromtxt`` unchanged.

    Returns whatever ``np.genfromtxt`` returns for the given arguments.
    """
    # Use a context manager so the file handle is closed deterministically
    # (the parse is complete by the time genfromtxt returns).
    with open(fname) as csv_file:
        return np.genfromtxt(csv_file, *args, delimiter=",", comments="#",
                             **kargs)
6
def select(keep, rows):
    """Return the entries of ``rows`` for which ``keep(row)`` is true.

    ``keep`` is called once per entry, in iteration order, and the results
    are collected into a boolean mask that is then used to index ``rows``.
    """
    mask = np.fromiter((keep(entry) for entry in rows), dtype=bool)
    return rows[mask]