diff options
| author | Bjoern B. Brandenburg <bbb@cs.unc.edu> | 2011-02-21 15:14:15 -0500 |
|---|---|---|
| committer | Bjoern B. Brandenburg <bbb@cs.unc.edu> | 2011-02-21 15:14:15 -0500 |
| commit | 95ce3338ad1208ee44f334b87fea2c32a7b888d4 (patch) | |
| tree | 750eed2557a482e506fc620e34133d136ee7c45d | |
| parent | 60f818697e2cee718c75e78999945fe2b628c72b (diff) | |
add scripts for shuffling and truncating sample files
| -rwxr-xr-x | count-all | 17 | ||||
| -rwxr-xr-x | shuffle-truncate-all | 18 | ||||
| -rwxr-xr-x | shuffle_truncate.py | 102 |
3 files changed, 137 insertions, 0 deletions
diff --git a/count-all b/count-all new file mode 100755 index 0000000..7a353bc --- /dev/null +++ b/count-all | |||
| @@ -0,0 +1,17 @@ | |||
#!/bin/bash
#
# count-all [DIR]
#
# For each overhead event type, find every "*overhead=<EVENT>.bin" sample
# file below DIR (default: the current directory) and write the output of
# "shuffle_truncate.py --count" for those files to
# counts_overhead=<EVENT>.txt in the current working directory.

EVENTS="SCHED SCHED2 TICK CXS RELEASE RELEASE-LATENCY SEND-RESCHED"

# Fall back to the current directory when no directory is given.
DIR="${1:-.}"

for E in $EVENTS
do
    # Collect the matches NUL-delimited into an array so that paths
    # containing whitespace or glob characters are passed on as single,
    # unmodified arguments.
    FILES=()
    while IFS= read -r -d '' F
    do
        FILES+=("$F")
    done < <(find "$DIR" -iname "*overhead=${E}.bin" -print0)

    shuffle_truncate.py --count "${FILES[@]}" > "counts_overhead=${E}.txt"
done
| 17 | |||
diff --git a/shuffle-truncate-all b/shuffle-truncate-all new file mode 100755 index 0000000..b83c360 --- /dev/null +++ b/shuffle-truncate-all | |||
| @@ -0,0 +1,18 @@ | |||
#!/bin/bash
#
# shuffle-truncate-all COUNTS [DIR]
#
# COUNTS is a text file whose first column is an event name and whose
# second column is the sample cut-off for that event. For every event
# listed, all "*overhead=<EVENT>.bin" files below DIR (default: the
# current directory) are passed to "shuffle_truncate.py -c <cut-off>".

COUNTS="$1"
DIR="${2:-.}"

# Without a readable counts file awk would silently wait on stdin.
if [ -z "$COUNTS" ] || [ ! -r "$COUNTS" ]
then
    echo "Usage: $0 COUNTS-FILE [DIR]" >&2
    exit 1
fi

EVENTS=$(awk '{print $1}' "$COUNTS")

for E in $EVENTS
do
    # Match the event name exactly against the first column. A plain grep
    # is a substring match, so e.g. SCHED would also pick up the SCHED2
    # and SEND-RESCHED rows and yield several cut-off values at once.
    CUTOFF=$(awk -v event="$E" '$1 == event {print $2; exit}' "$COUNTS")

    # Collect the matches NUL-delimited into an array so that paths
    # containing whitespace or glob characters stay intact.
    FILES=()
    while IFS= read -r -d '' F
    do
        FILES+=("$F")
    done < <(find "$DIR" -iname "*overhead=${E}.bin" -print0)

    shuffle_truncate.py -c "$CUTOFF" "${FILES[@]}"
done
diff --git a/shuffle_truncate.py b/shuffle_truncate.py new file mode 100755 index 0000000..6a48ca2 --- /dev/null +++ b/shuffle_truncate.py | |||
| @@ -0,0 +1,102 @@ | |||
| 1 | #!/usr/bin/env python | ||
| 2 | |||
| 3 | import numpy | ||
| 4 | import os | ||
| 5 | import sys | ||
| 6 | import optparse | ||
| 7 | |||
| 8 | from util import load_binary_file | ||
| 9 | |||
# Shorthand for declaring command-line options.
o = optparse.make_option

# Command-line options understood by this script.
opts = [
    o('-c', '--cut-off', dest='cutoff', action='store', type='int',
      help='max number of samples to use'),
    o('--count', dest='count', action='store_true',
      help='just report the number of samples in each file'),
]

# Values used for options that are not given on the command line.
defaults = dict(cutoff=None, count=False)

# Placeholder; rebound to the parsed options when run as a script.
options = None
| 27 | |||
def load_files(fnames):
    """Load every file in `fnames` with load_binary_file.

    Returns a list with one loaded object per name, in input order.
    """
    loaded = []
    for path in fnames:
        loaded.append(load_binary_file(path))
    return loaded
| 30 | |||
def shuffle_truncate(arrays, fnames, target_length=None):
    """Shuffle over-long arrays and cut every array to a common length.

    arrays        -- sample arrays; those longer than the selected length
                     are shuffled IN PLACE with numpy.random.shuffle
    fnames        -- names matching `arrays`, used only for progress output
    target_length -- number of samples to keep per array; if omitted (or
                     otherwise falsy, e.g. 0), the length of the shortest
                     array is used

    Returns a list holding the first `shortest` elements of each array.
    Arrays shorter than the selected length are returned at full length.
    An empty `arrays` list without a target length yields an empty list.
    """
    # Determine how many samples we can use.
    if target_length:
        shortest = target_length
    elif arrays:
        shortest = min(len(a) for a in arrays)
    else:
        # Nothing to select from; min() of an empty sequence would raise.
        return []
    print("Selecting %d samples from each data file." % shortest)

    # Make sure we'll select samples from all parts of the data file:
    # anything that is about to be truncated must be uniformly shuffled.
    for a, n in zip(arrays, fnames):
        if len(a) > shortest:
            print("Shuffling %s ..." % n)
            numpy.random.shuffle(a)

    # Now select the same number of samples from each file.
    return [a[:shortest] for a in arrays]
| 51 | |||
def store_files(arrays, fnames):
    """Write each array to the file of the same position in `fnames`.

    arrays -- objects providing a tofile(fileobj) method
    fnames -- destination paths, paired with `arrays` by position

    Existing files are overwritten (opened in 'wb' mode).
    """
    for a, fn in zip(arrays, fnames):
        print('Storing %s.' % fn)
        # The context manager closes the file even if tofile() raises.
        with open(fn, 'wb') as fd:
            a.tofile(fd)
| 58 | |||
def target_file(fname, want_ext):
    """Return the output file name for `fname`.

    The directory part of `fname` is dropped. If `want_ext` is not None,
    the last extension of the base name is replaced by `want_ext`;
    otherwise the base name is returned unchanged.
    """
    base = os.path.basename(fname)
    if want_ext is None:
        return base
    stem = os.path.splitext(base)[0]
    return "%s.%s" % (stem, want_ext)
| 66 | |||
def shuffle_truncate_store(files, cutoff=None, ext='sbn'):
    """Load `files`, shuffle/truncate them together, and store the result.

    files  -- paths of the input data files
    cutoff -- passed to shuffle_truncate as target_length
    ext    -- extension given to the output file names via target_file

    Output names are derived with target_file, which drops the directory
    part of each input path.
    """
    loaded = load_files(files)
    selected = shuffle_truncate(loaded, files, target_length=cutoff)
    out_names = []
    for path in files:
        out_names.append(target_file(path, ext))
    store_files(selected, out_names)
| 72 | |||
def shuffle_truncate_store_individually(files, cutoff):
    """Run shuffle_truncate_store on each file separately.

    A "[index/total] basename" progress line is printed before each file
    is processed, and every file is truncated to `cutoff` on its own.
    """
    total = len(files)
    # Zero-pad the running index to the width of the total file count.
    template = "[%%0%dd/%%d] %%s" % len(str(total))
    for pos, path in enumerate(files, 1):
        print(template % (pos, total, os.path.basename(path)))
        sys.stdout.flush()
        shuffle_truncate_store([path], cutoff=cutoff)
| 80 | |||
def report_sample_counts(files):
    """Print one "[index/total] <count> <path>" line per file.

    The count is the length of whatever load_binary_file returns for the
    file. Output is flushed after every line.
    """
    total = len(files)
    # Zero-pad the running index to the width of the total file count.
    template = "[%%0%dd/%%d] %%8d %%s" % len(str(total))
    for pos, path in enumerate(files, 1):
        # Only the length is needed; the loaded data is not kept around.
        count = len(load_binary_file(path))
        print(template % (pos, total, count, path))
        sys.stdout.flush()
| 88 | |||
if __name__ == '__main__':
    # Parse the command line; positional arguments are the data files.
    parser = optparse.OptionParser(option_list=opts)
    parser.set_defaults(**defaults)
    (options, files) = parser.parse_args()

    if not files:
        print("Usage: shuffle_truncate.py data1.bin data2.bin data3.bin ...")
    elif options.count:
        # --count: only report how many samples each file holds.
        report_sample_counts(files)
    elif options.cutoff:
        # Explicit cut-off: each file is processed on its own.
        shuffle_truncate_store_individually(files, options.cutoff)
    else:
        # No cut-off: truncate all files to the shortest one.
        shuffle_truncate_store(files)
