aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bjoern B. Brandenburg <bbb@cs.unc.edu> 2011-02-18 03:04:46 -0500
committer Bjoern B. Brandenburg <bbb@cs.unc.edu> 2011-02-18 03:04:46 -0500
commit 89e5192f1ee83ebb3a7bd87aefc5d23ce4ab2c2b (patch)
tree 4cdc41b4750719c13eb8f6534fce879c577dbd88
parent e37ed32d9b861581942ec5cfb8948f0602c0a481 (diff)
Support for compacting binary data
-rw-r--r-- binary_data.py 29
-rw-r--r-- stats.py 6
-rw-r--r-- util.py 9
3 files changed, 44 insertions, 0 deletions
diff --git a/binary_data.py b/binary_data.py
new file mode 100644
index 0000000..e5b47aa
--- /dev/null
+++ b/binary_data.py
@@ -0,0 +1,29 @@
1import numpy
2
3from util import load_binary_file
4from stats import iqr_remove_outliers, iqr_cutoff
5
def compact_file(fname, scale=None, extend=1.5):
    """Summarize the samples of a binary file after dropping IQR outliers.

    The file is loaded with load_binary_file(), optionally scaled, sorted,
    and then restricted to the samples that lie within the closed range
    [iqr_min, iqr_max] reported by iqr_cutoff(data, extend).

    Arguments:
    fname  -- name of the binary sample file to load
    scale  -- factor each sample is multiplied by; None means no scaling
    extend -- passed through to iqr_cutoff()

    Returns the list
        [count, filtered, max, avg, min, med, std, var]
    where count is the number of samples kept, filtered is the number of
    samples discarded, and the remaining entries are statistics of the
    kept samples.

    Raises IndexError if no sample lies within the cutoff range.
    """
    data = load_binary_file(fname)

    # NOTE: scaling and sorting both modify the loaded array in place.
    if scale is not None:
        data *= scale

    data.sort()

    iqr_min, iqr_max = iqr_cutoff(data, extend)

    # The data is sorted, so the kept samples form one contiguous slice.
    # The upper bound must be searched with side='right': with the default
    # side='left', samples exactly equal to iqr_max are excluded, and data
    # with zero spread (iqr_min == iqr_max) yields an empty slice.
    min_idx = numpy.searchsorted(data, iqr_min, side='left')
    max_idx = numpy.searchsorted(data, iqr_max, side='right')

    samples = data[min_idx:max_idx]

    filtered = len(data) - len(samples)

    # Sorted ascending: the extremes are the first and last kept samples.
    # (Named *_val to avoid shadowing the min/max builtins.)
    max_val = samples[-1]
    min_val = samples[0]
    med = numpy.median(samples)
    avg = numpy.mean(samples)

    std = numpy.std(samples)
    var = numpy.var(samples)

    return [len(samples), filtered, max_val, avg, min_val, med, std, var]
diff --git a/stats.py b/stats.py
index f6c4401..cede673 100644
--- a/stats.py
+++ b/stats.py
@@ -13,6 +13,12 @@ def iqr(vect):
def cutoff_max(vect, percentile=99):
    """Return s.scoreatpercentile() of vect at the given percentile.

    percentile defaults to 99.
    NOTE(review): `s` is presumably scipy.stats -- confirm against the
    module's imports.
    """
    threshold = s.scoreatpercentile(vect, percentile)
    return threshold
15 15
def iqr_cutoff(vect, extend):
    """Return the (min_val, max_val) outlier cutoffs for vect.

    iqr(vect) yields (spread, low, high); the cutoffs are placed
    extend * spread below low and extend * spread above high.
    """
    spread, low, high = iqr(vect)
    margin = extend * spread
    return low - margin, high + margin
21
16def iqr_is_not_outlier(table, col=1, extend=1.5): 22def iqr_is_not_outlier(table, col=1, extend=1.5):
17 "create a filter function that flags outliers" 23 "create a filter function that flags outliers"
18 (spread, low, high) = iqr(table[:,col]) 24 (spread, low, high) = iqr(table[:,col])
diff --git a/util.py b/util.py
index 7d81d4f..b44cc6c 100644
--- a/util.py
+++ b/util.py
@@ -8,6 +8,15 @@ def load_csv_file(fname, *args, **kargs):
8 f.close() # don't leak file handles 8 f.close() # don't leak file handles
9 return data 9 return data
10 10
def load_csv_file_fast(fname):
    """Load a comma-separated file via np.loadtxt() and return the array."""
    return np.loadtxt(fname, delimiter=",")
14
def load_binary_file(fname, dtype='float32', modify=False):
    """Memory-map a binary file as a flat array of the given dtype.

    The file is mapped with np.memmap() in mode 'r+' when modify is true
    and in mode 'c' otherwise.
    """
    if modify:
        access = 'r+'
    else:
        access = 'c'
    return np.memmap(fname, dtype=dtype, mode=access)
19
11def write_csv_file(fname, rows, header=None, width=None, 20def write_csv_file(fname, rows, header=None, width=None,
12 break_col=None): 21 break_col=None):
13 if fname is None: 22 if fname is None: