diff options
| -rw-r--r-- | binary_data.py | 29 | ||||
| -rw-r--r-- | stats.py | 6 | ||||
| -rw-r--r-- | util.py | 9 |
3 files changed, 44 insertions, 0 deletions
diff --git a/binary_data.py b/binary_data.py new file mode 100644 index 0000000..e5b47aa --- /dev/null +++ b/binary_data.py | |||
| @@ -0,0 +1,29 @@ | |||
| 1 | import numpy | ||
| 2 | |||
| 3 | from util import load_binary_file | ||
| 4 | from stats import iqr_remove_outliers, iqr_cutoff | ||
| 5 | |||
def compact_file(fname, scale=None, extend=1.5):
    """Summarise the samples of a binary file, ignoring IQR outliers.

    The file is loaded with load_binary_file(), optionally multiplied by
    *scale*, sorted, and trimmed to the values lying inside the cutoffs
    returned by iqr_cutoff(data, extend).  Values exactly equal to either
    cutoff are kept.

    Returns a list:
        [kept count, filtered count, max, mean, min, median, std, variance]

    Raises IndexError when no sample lies inside the cutoffs.
    """
    data = load_binary_file(fname)

    if scale is not None:
        data *= scale

    data.sort()

    iqr_min, iqr_max = iqr_cutoff(data, extend)

    # The lower bound uses side='left' and the upper bound side='right' so
    # that values equal to a cutoff stay in the sample.  With the default
    # side='left' for both, values equal to the upper cutoff were dropped,
    # and constant data (spread == 0) produced an empty slice.
    min_idx = numpy.searchsorted(data, iqr_min, side='left')
    max_idx = numpy.searchsorted(data, iqr_max, side='right')

    samples = data[min_idx:max_idx]

    filtered = len(data) - len(samples)
    # data is sorted, so the extremes sit at the ends of the slice.
    largest = samples[-1]
    smallest = samples[0]
    med = numpy.median(samples)
    avg = numpy.mean(samples)

    std = numpy.std(samples)
    var = numpy.var(samples)

    return [len(samples), filtered, largest, avg, smallest, med, std, var]
| @@ -13,6 +13,12 @@ def iqr(vect): | |||
def cutoff_max(vect, percentile=99):
    """Return the score of *vect* at *percentile* (the 99th by default)."""
    threshold = s.scoreatpercentile(vect, percentile)
    return threshold
| 15 | 15 | ||
def iqr_cutoff(vect, extend):
    """Return the (lower, upper) cutoff values for *vect*.

    iqr(vect) supplies (spread, low, high); the cutoffs are low and high
    pushed outwards by extend * spread on each side.
    """
    spread, low, high = iqr(vect)
    margin = extend * spread
    return low - margin, high + margin
| 21 | |||
| 16 | def iqr_is_not_outlier(table, col=1, extend=1.5): | 22 | def iqr_is_not_outlier(table, col=1, extend=1.5): |
| 17 | "create a filter function that flags outliers" | 23 | "create a filter function that flags outliers" |
| 18 | (spread, low, high) = iqr(table[:,col]) | 24 | (spread, low, high) = iqr(table[:,col]) |
| @@ -8,6 +8,15 @@ def load_csv_file(fname, *args, **kargs): | |||
| 8 | f.close() # don't leak file handles | 8 | f.close() # don't leak file handles |
| 9 | return data | 9 | return data |
| 10 | 10 | ||
def load_csv_file_fast(fname):
    """Load a comma-separated numeric file via numpy.loadtxt and return the array."""
    return np.loadtxt(fname, delimiter=",")
| 14 | |||
def load_binary_file(fname, dtype='float32', modify=False):
    """Memory-map *fname* as an array of *dtype* values.

    With modify=True the file is mapped read/write ('r+'), so changes to
    the returned array are written back to the file.  Otherwise it is
    mapped copy-on-write ('c'): assignments affect only memory and the
    file on disk stays unchanged.
    """
    if modify:
        access = 'r+'
    else:
        access = 'c'
    return np.memmap(fname, dtype=dtype, mode=access)
| 19 | |||
| 11 | def write_csv_file(fname, rows, header=None, width=None, | 20 | def write_csv_file(fname, rows, header=None, width=None, |
| 12 | break_col=None): | 21 | break_col=None): |
| 13 | if fname is None: | 22 | if fname is None: |
