2 files changed, 40 insertions, 0 deletions
diff --git a/stats.py b/stats.py
new file mode 100644
index 0000000..f6c4401
--- /dev/null
+++ b/stats.py
@@ -0,0 +1,29 @@
+import scipy.stats as s
+import numpy as np
+from util import select
+def iqr(vect):
+    """Compute the inter-quartile range of ``vect`` and its two quartiles.
+
+    Returns a 3-tuple ``(spread, q25, q75)`` where ``q25`` and ``q75`` are
+    the scores at the 25th and 75th percentiles (as computed by
+    ``scipy.stats.scoreatpercentile``) and ``spread`` is ``q75 - q25``.
+    """
+    lower, upper = (s.scoreatpercentile(vect, pct) for pct in (25, 75))
+    return (upper - lower, lower, upper)
+def cutoff_max(vect, percentile=99):
+    """Return the score of ``vect`` at the given percentile (default: 99).
+
+    Thin wrapper around ``scipy.stats.scoreatpercentile``.
+    """
+    threshold = s.scoreatpercentile(vect, percentile)
+    return threshold
+def iqr_is_not_outlier(table, col=1, extend=1.5):
+    """Build a row predicate that accepts rows inside the IQR window.
+
+    The window is derived from column ``col`` of ``table`` (indexed as
+    ``table[:, col]``): it spans from ``q25 - extend * spread`` up to
+    ``q75 + extend * spread``, bounds included.  The returned callable
+    takes a single row and yields a true value when ``row[col]`` lies
+    inside that window, i.e. when the row is NOT an outlier.
+    """
+    spread, q25, q75 = iqr(table[:, col])
+    lower_bound = q25 - extend * spread
+    upper_bound = q75 + extend * spread
+    def within_window(row):
+        return lower_bound <= row[col] <= upper_bound
+    return within_window
+def iqr_remove_outliers(table, col=1, extend=1.5):
+    """Drop the rows of ``table`` whose ``col`` value is an IQR outlier.
+
+    Builds the predicate with ``iqr_is_not_outlier(table, col, extend)``
+    and hands it to ``select``, which returns only the rows the
+    predicate accepts.
+    """
+    return select(iqr_is_not_outlier(table, col, extend), table)
diff --git a/util.py b/util.py
new file mode 100644
index 0000000..fdedaa4
--- /dev/null
+++ b/util.py
@@ -0,0 +1,11 @@
+import numpy as np
+def load_csv_file(fname, *args, **kargs):
+    """Load a delimited text file with ``np.genfromtxt``.
+
+    ``fname`` is opened in text mode and closed again before returning.
+    By default fields are split on ``","`` and ``"#"`` starts a comment;
+    callers may override either by passing ``delimiter=`` / ``comments=``.
+    Any further positional and keyword arguments are forwarded to
+    ``np.genfromtxt`` unchanged.
+
+    Returns whatever ``np.genfromtxt`` returns for the file's contents.
+    """
+    # Defaults rather than hard-coded keywords, so an explicit
+    # delimiter=/comments= from the caller no longer raises a
+    # duplicate-keyword TypeError.
+    kargs.setdefault("delimiter", ",")
+    kargs.setdefault("comments", "#")
+    # Context manager guarantees the handle is closed even if parsing fails.
+    with open(fname) as handle:
+        return np.genfromtxt(handle, *args, **kargs)
+def select(keep, rows):
+    """Return the entries of ``rows`` for which ``keep(row)`` is true.
+
+    ``keep`` is called once per row, in order, to build a boolean mask of
+    length ``len(rows)``; the result is ``rows`` indexed by that mask.
+    """
+    mask = np.fromiter(map(keep, rows), dtype=bool, count=len(rows))
+    return rows[mask]

diff --git a/stats.py b/stats.py new file mode 100644 index 0000000..f6c4401 --- /dev/null +++ b/stats.py
@@ -0,0 +1,29 @@
	1	import scipy.stats as s
	2	import numpy as np
	3
	4	from util import select
	5
	6
	7	def iqr(vect):
	8	"return inter-quartile range of a vector"
	9	q25 = s.scoreatpercentile(vect, 25)
	10	q75 = s.scoreatpercentile(vect, 75)
	11	return (q75 - q25, q25, q75)
	12
	13	def cutoff_max(vect, percentile=99):
	14	return s.scoreatpercentile(vect, percentile)
	15
	16	def iqr_is_not_outlier(table, col=1, extend=1.5):
	17	"create a filter function that flags outliers"
	18	(spread, low, high) = iqr(table[:,col])
	19	min_val = low - extend * spread
	20	max_val = high + extend * spread
	21	return lambda row: min_val <= row[col] <= max_val
	22
	23	def iqr_remove_outliers(table, col=1, extend=1.5):
	24	"""Return a copy that only includes rows that
	25	fall within the IQR-based window.
	26	"""
	27	valid = iqr_is_not_outlier(table, col, extend)
	28	return select(valid, table)
	29


diff --git a/util.py b/util.py new file mode 100644 index 0000000..fdedaa4 --- /dev/null +++ b/util.py
@@ -0,0 +1,11 @@
	1	import numpy as np
	2
	3	def load_csv_file(fname, *args, **kargs):
	4	return np.genfromtxt(open(fname), delimiter=",", comments="#",
	5	*args, **kargs)
	6
	7	def select(keep, rows):
	8	ok_rows = np.zeros(len(rows), dtype=bool)
	9	for i, row in enumerate(rows):
	10	ok_rows[i] = keep(row)
	11	return rows[ok_rows]