Support for compacting binary data

author: Bjoern B. Brandenburg <bbb@cs.unc.edu> 2011-02-18 03:04:46 -0500
committer: Bjoern B. Brandenburg <bbb@cs.unc.edu> 2011-02-18 03:04:46 -0500
commit: 89e5192f1ee83ebb3a7bd87aefc5d23ce4ab2c2b (patch)
tree: 4cdc41b4750719c13eb8f6534fce879c577dbd88
parent: e37ed32d9b861581942ec5cfb8948f0602c0a481 (diff)
3 files changed, 44 insertions, 0 deletions
diff --git a/binary_data.py b/binary_data.py
new file mode 100644
index 0000000..e5b47aa
--- /dev/null
+++ b/binary_data.py
@@ -0,0 +1,29 @@
+import numpy
+from util import load_binary_file
+from stats import iqr_remove_outliers, iqr_cutoff
+def compact_file(fname, scale=None, extend=1.5):
+    """Summarize a binary sample file after trimming IQR outliers.
+
+    The samples are loaded with load_binary_file(), optionally multiplied
+    by `scale`, sorted, and reduced to the values lying within the bounds
+    returned by iqr_cutoff(data, extend); both bounds are inclusive.
+
+    Returns the list
+        [kept, filtered, max, avg, min, med, std, var]
+    where `kept` is the number of samples inside the bounds, `filtered`
+    is the number of samples that were dropped, and the remaining entries
+    are statistics of the kept samples.
+    """
+    # load_binary_file() maps the file in 'c' (copy-on-write) mode unless
+    # asked otherwise, so the in-place scaling and sorting below operate
+    # on the in-memory copy.
+    data = load_binary_file(fname)
+    if scale is not None:
+        data *= scale
+    # Sorting lets the cutoffs be located by binary search and places the
+    # extremes of the kept range at its two ends.
+    data.sort()
+    iqr_min, iqr_max = iqr_cutoff(data, extend)
+    # side='right' for the upper bound keeps samples equal to the cutoff.
+    # With side='left' for both bounds, coinciding cutoffs (zero spread,
+    # e.g. all samples identical) gave an empty slice and an IndexError
+    # on samples[-1].
+    min_idx = numpy.searchsorted(data, iqr_min, side='left')
+    max_idx = numpy.searchsorted(data, iqr_max, side='right')
+    samples = data[min_idx:max_idx]
+    filtered = len(data) - len(samples)
+    # Local names chosen so the min()/max() builtins are not shadowed.
+    max_val = samples[-1]
+    min_val = samples[0]
+    med = numpy.median(samples)
+    avg = numpy.mean(samples)
+    std = numpy.std(samples)
+    var = numpy.var(samples)
+    return [len(samples), filtered, max_val, avg, min_val, med, std, var]
diff --git a/stats.py b/stats.py
index f6c4401..cede673 100644
--- a/stats.py
+++ b/stats.py
@@ -13,6 +13,12 @@ def iqr(vect):
 def cutoff_max(vect, percentile=99):
    return s.scoreatpercentile(vect, percentile)
+def iqr_cutoff(vect, extend):
+    """Return the (lower, upper) outlier cutoffs for vect.
+
+    The bounds are the low and high values reported by iqr(vect), each
+    pushed outward by `extend` times the reported spread.
+    """
+    spread, low, high = iqr(vect)
+    margin = extend * spread
+    return low - margin, high + margin
 def iqr_is_not_outlier(table, col=1, extend=1.5):
    "create a filter function that flags outliers"
    (spread, low, high) = iqr(table[:,col])
diff --git a/util.py b/util.py
index 7d81d4f..b44cc6c 100644
--- a/util.py
+++ b/util.py
@@ -8,6 +8,15 @@ def load_csv_file(fname, *args, **kargs):
    f.close() # don't leak file handles
    return data
+def load_csv_file_fast(fname):
+    """Load the comma-separated file fname with np.loadtxt and return it."""
+    return np.loadtxt(fname, delimiter=",")
+def load_binary_file(fname, dtype='float32', modify=False):
+    """Memory-map fname as an array of `dtype` and return the mapping.
+
+    When `modify` is true the file is mapped in 'r+' mode; otherwise it
+    is mapped in 'c' mode.
+    """
+    if modify:
+        mmap_mode = 'r+'
+    else:
+        mmap_mode = 'c'
+    return np.memmap(fname, dtype=dtype, mode=mmap_mode)
 def write_csv_file(fname, rows, header=None, width=None,
                   break_col=None):
    if fname is None:
author	Bjoern B. Brandenburg <bbb@cs.unc.edu>	2011-02-18 03:04:46 -0500
committer	Bjoern B. Brandenburg <bbb@cs.unc.edu>	2011-02-18 03:04:46 -0500
commit	89e5192f1ee83ebb3a7bd87aefc5d23ce4ab2c2b (patch)
tree	4cdc41b4750719c13eb8f6534fce879c577dbd88
parent	e37ed32d9b861581942ec5cfb8948f0602c0a481 (diff)

diff --git a/binary_data.py b/binary_data.py new file mode 100644 index 0000000..e5b47aa --- /dev/null +++ b/binary_data.py
@@ -0,0 +1,29 @@
		1	import numpy
		2
		3	from util import load_binary_file
		4	from stats import iqr_remove_outliers, iqr_cutoff
		5
		6	def compact_file(fname, scale=None, extend=1.5):
		7	data = load_binary_file(fname)
		8
		9	if not scale is None:
		10	data *= scale
		11
		12	data.sort()
		13
		14	iqr_min, iqr_max = iqr_cutoff(data, extend)
		15
		16	min_idx, max_idx = numpy.searchsorted(data, [iqr_min, iqr_max])
		17
		18	samples = data[min_idx:max_idx]
		19
		20	filtered = len(data) - len(samples)
		21	max = samples[-1]
		22	min = samples[0]
		23	med = numpy.median(samples)
		24	avg = numpy.mean(samples)
		25
		26	std = numpy.std(samples)
		27	var = numpy.var(samples)
		28
		29	return [len(samples), filtered, max, avg, min, med, std, var]


diff --git a/stats.py b/stats.py index f6c4401..cede673 100644 --- a/stats.py +++ b/stats.py
@@ -13,6 +13,12 @@ def iqr(vect):
13	def cutoff_max(vect, percentile=99):	13	def cutoff_max(vect, percentile=99):
14	return s.scoreatpercentile(vect, percentile)	14	return s.scoreatpercentile(vect, percentile)
15		15
		16	def iqr_cutoff(vect, extend):
		17	(spread, low, high) = iqr(vect)
		18	min_val = low - extend * spread
		19	max_val = high + extend * spread
		20	return min_val, max_val
		21
16	def iqr_is_not_outlier(table, col=1, extend=1.5):	22	def iqr_is_not_outlier(table, col=1, extend=1.5):
17	"create a filter function that flags outliers"	23	"create a filter function that flags outliers"
18	(spread, low, high) = iqr(table[:,col])	24	(spread, low, high) = iqr(table[:,col])


diff --git a/util.py b/util.py index 7d81d4f..b44cc6c 100644 --- a/util.py +++ b/util.py
@@ -8,6 +8,15 @@ def load_csv_file(fname, *args, **kargs):
8	f.close() # don't leak file handles	8	f.close() # don't leak file handles
9	return data	9	return data
10		10
		11	def load_csv_file_fast(fname):
		12	data = np.loadtxt(fname, delimiter=",")
		13	return data
		14
		15	def load_binary_file(fname, dtype='float32', modify=False):
		16	data = np.memmap(fname, dtype=dtype,
		17	mode='r+' if modify else 'c')
		18	return data
		19
11	def write_csv_file(fname, rows, header=None, width=None,	20	def write_csv_file(fname, rows, header=None, width=None,
12	break_col=None):	21	break_col=None):
13	if fname is None:	22	if fname is None: