# Python code carried by a git patch whose line breaks had been collapsed.
#
#   commit:  89e5192f1ee83ebb3a7bd87aefc5d23ce4ab2c2b
#   author:  Bjoern B. Brandenburg
#   date:    Fri, 18 Feb 2011 03:04:46 -0500
#   subject: Support for compacting binary data
#
# The patch creates binary_data.py (compact_file), adds iqr_cutoff to
# stats.py, and adds load_csv_file_fast / load_binary_file to util.py.  The
# added definitions are laid out below, grouped by the file they belong to.
# In the original layout binary_data.py obtains load_binary_file via
# `from util import ...` and iqr_cutoff via `from stats import ...`.
# Unchanged diff context (cutoff_max, iqr_is_not_outlier, and the edges of
# load_csv_file / write_csv_file) is only partially visible in the patch and
# is not reproduced here.
#
# Review changes relative to the patch text:
#   * compact_file: the upper cutoff is now inclusive (see the comment in the
#     function), locals no longer shadow the builtins max/min,
#     `not scale is None` became `scale is not None`, and the unused
#     iqr_remove_outliers import was dropped.

import numpy as np


# --- util.py ---------------------------------------------------------------

def load_csv_file_fast(fname):
    """Load the comma-separated numeric file *fname* with numpy.loadtxt."""
    return np.loadtxt(fname, delimiter=",")


def load_binary_file(fname, dtype='float32', modify=False):
    """Memory-map *fname* as an array of *dtype* values.

    With ``modify=False`` (the default) the file is mapped copy-on-write
    (``mode='c'``): the returned array may be changed in memory, but the file
    on disk is left untouched.  With ``modify=True`` it is mapped ``'r+'``,
    so changes to the array are written back to the file.
    """
    return np.memmap(fname, dtype=dtype, mode='r+' if modify else 'c')


# --- stats.py --------------------------------------------------------------

def iqr_cutoff(vect, extend):
    """Return the ``(min_val, max_val)`` outlier cutoffs for *vect*.

    The cutoffs lie ``extend * spread`` below ``low`` and above ``high``,
    where ``(spread, low, high)`` is the triple returned by ``iqr(vect)``.
    """
    # iqr() is defined earlier in stats.py, outside this patch.  Presumably
    # spread is the interquartile range and low/high are the quartiles --
    # confirm against its definition.
    (spread, low, high) = iqr(vect)
    margin = extend * spread
    return low - margin, high + margin


# --- binary_data.py --------------------------------------------------------

def compact_file(fname, scale=None, extend=1.5):
    """Summarize the samples stored in binary file *fname*, minus outliers.

    The file is loaded with load_binary_file() using its defaults (float32,
    copy-on-write, so the file itself is not modified), optionally multiplied
    by *scale*, sorted, and trimmed to the values that lie within the
    iqr_cutoff(data, extend) bounds, both bounds inclusive.

    Returns the list ``[count, filtered, max, avg, min, med, std, var]``:
    the number of samples kept, the number discarded, and the maximum, mean,
    minimum, median, standard deviation and variance of the kept samples.

    Raises IndexError if no sample lies within the cutoffs.
    """
    data = load_binary_file(fname)

    if scale is not None:
        data *= scale

    data.sort()

    iqr_min, iqr_max = iqr_cutoff(data, extend)

    # data is sorted, so the kept samples form one contiguous slice.
    # side='left' finds the first value >= iqr_min; side='right' finds the
    # position just past the last value <= iqr_max.  Using the default
    # side='left' for the upper bound as well would drop values equal to
    # iqr_max and, whenever iqr_min == iqr_max (e.g. all values identical, if
    # iqr() then reports zero spread), leave no samples at all.
    min_idx = np.searchsorted(data, iqr_min, side='left')
    max_idx = np.searchsorted(data, iqr_max, side='right')
    samples = data[min_idx:max_idx]

    filtered = len(data) - len(samples)

    # samples is still sorted, so the extremes sit at the two ends.
    largest = samples[-1]
    smallest = samples[0]
    med = np.median(samples)
    avg = np.mean(samples)

    std = np.std(samples)
    var = np.var(samples)

    return [len(samples), filtered, largest, avg, smallest, med, std, var]