diff options
author | Bjoern B. Brandenburg <bbb@cs.unc.edu> | 2011-02-18 03:04:46 -0500 |
---|---|---|
committer | Bjoern B. Brandenburg <bbb@cs.unc.edu> | 2011-02-18 03:04:46 -0500 |
commit | 89e5192f1ee83ebb3a7bd87aefc5d23ce4ab2c2b (patch) | |
tree | 4cdc41b4750719c13eb8f6534fce879c577dbd88 | |
parent | e37ed32d9b861581942ec5cfb8948f0602c0a481 (diff) |
Support for compacting binary data
-rw-r--r-- | binary_data.py | 29 | ||||
-rw-r--r-- | stats.py | 6 | ||||
-rw-r--r-- | util.py | 9 |
3 files changed, 44 insertions, 0 deletions
diff --git a/binary_data.py b/binary_data.py new file mode 100644 index 0000000..e5b47aa --- /dev/null +++ b/binary_data.py | |||
@@ -0,0 +1,29 @@ | |||
1 | import numpy | ||
2 | |||
3 | from util import load_binary_file | ||
4 | from stats import iqr_remove_outliers, iqr_cutoff | ||
5 | |||
def compact_file(fname, scale=None, extend=1.5):
    """Summarize the samples of a binary data file after IQR outlier removal.

    The file is loaded with load_binary_file(), optionally multiplied by
    `scale`, sorted in ascending order, and trimmed to the range returned
    by iqr_cutoff(data, extend).  All statistics are computed over the
    retained samples only.

    Parameters:
        fname  -- path of the binary file, passed to load_binary_file()
        scale  -- optional factor applied to every sample; None means
                  the samples are used unscaled
        extend -- passed through to iqr_cutoff() to widen the accepted range

    Returns a list in this order:
        [number of retained samples, number of filtered samples,
         maximum, mean, minimum, median, standard deviation, variance]
    """
    data = load_binary_file(fname)

    if scale is not None:
        data *= scale

    # searchsorted() below requires the samples in ascending order.
    data.sort()

    iqr_min, iqr_max = iqr_cutoff(data, extend)

    # Treat both cutoffs as inclusive.  With the default side='left' for the
    # upper bound, samples exactly equal to iqr_max were dropped, and when
    # both cutoffs coincide (zero spread) the slice came out empty, which
    # made samples[-1] below raise IndexError.
    min_idx = numpy.searchsorted(data, iqr_min, side='left')
    max_idx = numpy.searchsorted(data, iqr_max, side='right')

    samples = data[min_idx:max_idx]

    filtered = len(data) - len(samples)

    # data is sorted, so the extremes are the end points of the slice.
    # Local names avoid shadowing the max()/min() builtins.
    max_val = samples[-1]
    min_val = samples[0]
    med = numpy.median(samples)
    avg = numpy.mean(samples)

    std = numpy.std(samples)
    var = numpy.var(samples)

    return [len(samples), filtered, max_val, avg, min_val, med, std, var]
@@ -13,6 +13,12 @@ def iqr(vect): | |||
def cutoff_max(vect, percentile=99):
    """Return s.scoreatpercentile(vect, percentile); 99 is the default.

    NOTE(review): `s` is a module alias bound outside this view,
    presumably scipy.stats -- confirm against the imports of stats.py.
    """
    threshold = s.scoreatpercentile(vect, percentile)
    return threshold
15 | 15 | ||
def iqr_cutoff(vect, extend):
    """Return the (lower, upper) cutoff pair derived from iqr(vect).

    The lower cutoff lies `extend` times the spread below the low value
    reported by iqr(); the upper cutoff lies the same distance above the
    high value.
    """
    spread, low, high = iqr(vect)
    margin = extend * spread
    return low - margin, high + margin
21 | |||
16 | def iqr_is_not_outlier(table, col=1, extend=1.5): | 22 | def iqr_is_not_outlier(table, col=1, extend=1.5): |
17 | "create a filter function that flags outliers" | 23 | "create a filter function that flags outliers" |
18 | (spread, low, high) = iqr(table[:,col]) | 24 | (spread, low, high) = iqr(table[:,col]) |
@@ -8,6 +8,15 @@ def load_csv_file(fname, *args, **kargs): | |||
8 | f.close() # don't leak file handles | 8 | f.close() # don't leak file handles |
9 | return data | 9 | return data |
10 | 10 | ||
def load_csv_file_fast(fname):
    """Load comma-separated numeric data from fname via np.loadtxt()."""
    return np.loadtxt(fname, delimiter=",")
14 | |||
def load_binary_file(fname, dtype='float32', modify=False):
    """Memory-map the raw binary file fname as an array of `dtype`.

    With modify=True the file is mapped in 'r+' mode; otherwise it is
    mapped in 'c' (copy-on-write) mode.
    """
    if modify:
        access = 'r+'
    else:
        access = 'c'
    return np.memmap(fname, dtype=dtype, mode=access)
19 | |||
11 | def write_csv_file(fname, rows, header=None, width=None, | 20 | def write_csv_file(fname, rows, header=None, width=None, |
12 | break_col=None): | 21 | break_col=None): |
13 | if fname is None: | 22 | if fname is None: |