add scripts for shuffling and truncating sample files

author: Bjoern B. Brandenburg <bbb@cs.unc.edu> 2011-02-21 15:14:15 -0500
committer: Bjoern B. Brandenburg <bbb@cs.unc.edu> 2011-02-21 15:14:15 -0500
commit: 95ce3338ad1208ee44f334b87fea2c32a7b888d4 (patch)
tree: 750eed2557a482e506fc620e34133d136ee7c45d
parent: 60f818697e2cee718c75e78999945fe2b628c72b (diff)
3 files changed, 137 insertions, 0 deletions
diff --git a/count-all b/count-all
new file mode 100755
index 0000000..7a353bc
--- /dev/null
+++ b/count-all
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Count the samples in every overhead trace file below a directory.
+#
+# Usage: count-all [DIR]
+#
+# For each event type listed in EVENTS, all files matching
+# "*overhead=<EVENT>.bin" (case-insensitive) below DIR are handed to
+# "shuffle_truncate.py --count"; its output is written to
+# counts_overhead=<EVENT>.txt in the current working directory.
+EVENTS="SCHED SCHED2 TICK CXS RELEASE RELEASE-LATENCY SEND-RESCHED"
+# Search the current directory when no (or an empty) DIR argument is given.
+DIR="${1:-.}"
+for E in $EVENTS
+do
+    # FILES is expanded unquoted on purpose: each path found becomes
+    # a separate argument.
+    FILES=$(find "$DIR" -iname "*overhead=${E}.bin")
+    shuffle_truncate.py --count $FILES > "counts_overhead=${E}.txt"
+done
diff --git a/shuffle-truncate-all b/shuffle-truncate-all
new file mode 100755
index 0000000..b83c360
--- /dev/null
+++ b/shuffle-truncate-all
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Shuffle and truncate all overhead trace files, one event type at a time.
+#
+# Usage: shuffle-truncate-all COUNTS [DIR]
+#
+#   COUNTS  text file whose lines hold an event name in the first field
+#           and the sample cut-off for that event in the second field
+#   DIR     directory searched for "*overhead=<EVENT>.bin" files
+#           (default: current directory)
+COUNTS="$1"
+DIR="$2"
+# Without a counts file awk would silently wait for input on stdin.
+if [ -z "$COUNTS" ]
+then
+    echo "Usage: $0 COUNTS [DIR]" >&2
+    exit 1
+fi
+if [ -z "$DIR" ]
+then
+    DIR=.
+fi
+EVENTS=$(awk '{print $1}' "$COUNTS")
+for E in $EVENTS
+do
+    # Compare the whole first field. A substring match (as done by a
+    # plain grep) would make SCHED also select the SCHED2 and SEND-RESCHED
+    # lines and RELEASE also select RELEASE-LATENCY, yielding several
+    # cut-offs of which all but the first would be taken as file names.
+    CUTOFF=$(awk -v event="$E" '$1 == event {print $2; exit}' "$COUNTS")
+    # FILES is expanded unquoted on purpose: each path found becomes
+    # a separate argument.
+    FILES=$(find "$DIR" -iname "*overhead=${E}.bin")
+    shuffle_truncate.py -c $CUTOFF $FILES
+done
diff --git a/shuffle_truncate.py b/shuffle_truncate.py
new file mode 100755
index 0000000..6a48ca2
--- /dev/null
+++ b/shuffle_truncate.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+import numpy
+import os
+import sys
+import optparse
+from util import load_binary_file
+# Shorthand for declaring command-line options.
+o = optparse.make_option
+# Command-line options; handed to the OptionParser in the main block.
+opts = [
+    o('-c', '--cut-off', dest='cutoff', action='store', type='int',
+      help='max number of samples to use'),
+    o(None, '--count', dest='count', action='store_true',
+      help='just report the number of samples in each file'),
+]
+# Values the options take when the flag is absent from the command line.
+defaults = dict(cutoff=None, count=False)
+# Rebound to the parsed option values when run as a script.
+options = None
+def load_files(fnames):
+    """Load every file named in *fnames* with load_binary_file().
+
+    Returns a list holding one loaded object per file name, in the
+    order given.
+    """
+    loaded = []
+    for path in fnames:
+        loaded.append(load_binary_file(path))
+    return loaded
+def shuffle_truncate(arrays, fnames, target_length=None):
+    """Cut all arrays down to a common number of samples.
+
+    arrays        -- sample arrays; any array longer than the limit is
+                     shuffled in place with numpy.random.shuffle()
+    fnames        -- names matching *arrays*, used only in progress output
+    target_length -- number of samples to keep; when omitted (or false,
+                     e.g. 0) the length of the shortest array is used
+
+    Returns a list with the first `limit` entries of each array.
+    """
+    # Determine how many samples we can use.
+    if not target_length:
+        limit = min(len(arr) for arr in arrays)
+        print("Selecting %d samples from each data file." % limit)
+    else:
+        limit = target_length
+    # Only a prefix of each array is kept, so anything that is going to
+    # lose samples is shuffled first; that way the prefix draws from all
+    # parts of the data file.
+    for arr, name in zip(arrays, fnames):
+        if len(arr) <= limit:
+            # Nothing will be cut off; leave the order untouched.
+            continue
+        print("Shuffling %s ..." % name)
+        numpy.random.shuffle(arr)
+    # Now select the same number of samples from each file.
+    return [arr[:limit] for arr in arrays]
+def store_files(arrays, fnames):
+    """Write each array to the file at the same position in *fnames*.
+
+    arrays -- objects providing a tofile() method (e.g. numpy arrays)
+    fnames -- output paths, one per array; existing files are overwritten
+    """
+    for a, fn in zip(arrays, fnames):
+        print('Storing %s.' % fn)
+        # The with-block closes the file even if tofile() raises.
+        with open(fn, 'wb') as fd:
+            a.tofile(fd)
+def target_file(fname, want_ext):
+    """Derive the output file name for *fname*.
+
+    The directory part of *fname* is dropped.  If *want_ext* is not None,
+    the extension of the remaining name is replaced by *want_ext* (given
+    without a leading dot); otherwise the base name is returned as is.
+    """
+    base = os.path.basename(fname)
+    if want_ext is None:
+        return base
+    stem = os.path.splitext(base)[0]
+    return "%s.%s" % (stem, want_ext)
+def shuffle_truncate_store(files, cutoff=None, ext='sbn'):
+    """Load *files*, shuffle/truncate them together, and store the result.
+
+    files  -- paths of the input data files
+    cutoff -- passed to shuffle_truncate() as target_length
+    ext    -- extension of the output files, which are named after the
+              base names of the inputs (see target_file())
+    """
+    names = [target_file(path, ext) for path in files]
+    data = load_files(files)
+    store_files(shuffle_truncate(data, files, target_length=cutoff), names)
+def shuffle_truncate_store_individually(files, cutoff):
+    """Run shuffle_truncate_store() on each file separately.
+
+    Every file is processed on its own with the given *cutoff*; a
+    "[i/total] name" progress line is printed before each one.
+    """
+    total = len(files)
+    # Zero-pad the running index to the width of the total.
+    progress = "[%0" + str(len(str(total))) + "d/%d] %s"
+    for pos, path in enumerate(files, 1):
+        print(progress % (pos, total, os.path.basename(path)))
+        sys.stdout.flush()
+        shuffle_truncate_store([path], cutoff=cutoff)
+def report_sample_counts(files):
+    """Print one "[i/total] count path" line for each file in *files*.
+
+    The count is the len() of what load_binary_file() returns for the file.
+    """
+    total = len(files)
+    # Zero-pad the running index to the width of the total.
+    line = "[%0" + str(len(str(total))) + "d/%d] %8d %s"
+    for pos, path in enumerate(files, 1):
+        # The loaded data is not bound to a name, so it can be released
+        # before the next file is loaded.
+        count = len(load_binary_file(path))
+        print(line % (pos, total, count, path))
+        sys.stdout.flush()
+if __name__ == '__main__':
+    # Script entry point.  Depending on the options given, either report
+    # the sample counts (--count), truncate every file on its own to a
+    # fixed cut-off (-c), or truncate all files jointly to the length of
+    # the shortest one.
+    parser = optparse.OptionParser(option_list=opts)
+    parser.set_defaults(**defaults)
+    (options, files) = parser.parse_args()
+    if not files:
+        print("Usage: shuffle_truncate.py data1.bin data2.bin data3.bin ...")
+    elif options.count:
+        report_sample_counts(files)
+    elif options.cutoff:
+        shuffle_truncate_store_individually(files, options.cutoff)
+    else:
+        shuffle_truncate_store(files)
author	Bjoern B. Brandenburg <bbb@cs.unc.edu>	2011-02-21 15:14:15 -0500
committer	Bjoern B. Brandenburg <bbb@cs.unc.edu>	2011-02-21 15:14:15 -0500
commit	95ce3338ad1208ee44f334b87fea2c32a7b888d4 (patch)
tree	750eed2557a482e506fc620e34133d136ee7c45d
parent	60f818697e2cee718c75e78999945fe2b628c72b (diff)

diff --git a/count-all b/count-all new file mode 100755 index 0000000..7a353bc --- /dev/null +++ b/count-all
@@ -0,0 +1,17 @@
	1	#!/bin/bash
	2
	3	EVENTS="SCHED SCHED2 TICK CXS RELEASE RELEASE-LATENCY SEND-RESCHED"
	4
	5	DIR="$1"
	6
	7	if [ -z "$DIR" ]
	8	then
	9	DIR=.
	10	fi
	11
	12	for E in $EVENTS
	13	do
	14	FILES=`find "$DIR" -iname "*overhead=${E}.bin"`
	15	shuffle_truncate.py --count $FILES > counts_overhead=${E}.txt
	16	done
	17


diff --git a/shuffle-truncate-all b/shuffle-truncate-all new file mode 100755 index 0000000..b83c360 --- /dev/null +++ b/shuffle-truncate-all
@@ -0,0 +1,18 @@
	1	#!/bin/bash
	2
	3	COUNTS="$1"
	4	DIR="$2"
	5
	6	if [ -z "$DIR" ]
	7	then
	8	DIR=.
	9	fi
	10
	11	EVENTS=`awk '{print $1}' $COUNTS`
	12
	13	for E in $EVENTS
	14	do
	15	CUTOFF=`grep $E $COUNTS | awk '{print $2}'`
	16	FILES=`find "$DIR" -iname "*overhead=${E}.bin"`
	17	shuffle_truncate.py -c $CUTOFF $FILES
	18	done


diff --git a/shuffle_truncate.py b/shuffle_truncate.py new file mode 100755 index 0000000..6a48ca2 --- /dev/null +++ b/shuffle_truncate.py
@@ -0,0 +1,102 @@
	1	#!/usr/bin/env python
	2
	3	import numpy
	4	import os
	5	import sys
	6	import optparse
	7
	8	from util import load_binary_file
	9
	10	o = optparse.make_option
	11
	12	opts = [
	13	o('-c', '--cut-off', action='store', dest='cutoff', type='int',
	14	help='max number of samples to use'),
	15
	16	o(None, '--count', action='store_true', dest='count',
	17	help='just report the number of samples in each file'),
	18
	19	]
	20
	21	defaults = {
	22	'cutoff' : None,
	23	'count' : False,
	24	}
	25
	26	options = None
	27
	28	def load_files(fnames):
	29	return [load_binary_file(f) for f in fnames]
	30
	31	def shuffle_truncate(arrays, fnames, target_length=None):
	32	# Determine how many samples we can use.
	33	if target_length:
	34	shortest = target_length
	35	else:
	36	shortest = min([len(a) for a in arrays])
	37	print "Selecting %d samples from each data file." % shortest
	38
	39	# Make sure we'll select samples from all
	40	# parts of the data file.
	41	for a, n in zip(arrays, fnames):
	42	if len(a) > shortest:
	43	# Gotta be uniformly shuffled.
	44	print "Shuffling %s ..." % n
	45	numpy.random.shuffle(a)
	46
	47	# Now select the same number of samples from each file.
	48	truncated = [a[:shortest] for a in arrays]
	49
	50	return truncated
	51
	52	def store_files(arrays, fnames):
	53	for a, fn in zip(arrays, fnames):
	54	print 'Storing %s.' % fn
	55	fd = open(fn, 'wb')
	56	a.tofile(fd)
	57	fd.close()
	58
	59	def target_file(fname, want_ext):
	60	f = os.path.basename(fname)
	61	if not want_ext is None:
	62	name, ext = os.path.splitext(f)
	63	return "%s.%s" % (name, want_ext)
	64	else:
	65	return f
	66
	67	def shuffle_truncate_store(files, cutoff=None, ext='sbn'):
	68	data = load_files(files)
	69	trunc = shuffle_truncate(data, files, target_length=cutoff)
	70	names = [target_file(f, ext) for f in files]
	71	store_files(trunc, names)
	72
	73	def shuffle_truncate_store_individually(files, cutoff):
	74	fmt = "%%0%dd" % len(str(len(files)))
	75	for i, f in enumerate(files):
	76	print ("[" + fmt + "/%d] %s") % (i+1, len(files),
	77	os.path.basename(f))
	78	sys.stdout.flush()
	79	shuffle_truncate_store([f], cutoff=cutoff)
	80
	81	def report_sample_counts(files):
	82	fmt = "%%0%dd" % len(str(len(files)))
	83	for i, f in enumerate(files):
	84	d = load_binary_file(f)
	85	print ("[" + fmt + "/%d] %8d %s") % (i+1, len(files), len(d), f)
	86	sys.stdout.flush()
	87	del d
	88
	89	if __name__ == '__main__':
	90	parser = optparse.OptionParser(option_list=opts)
	91	parser.set_defaults(**defaults)
	92	(options, files) = parser.parse_args()
	93
	94	if not files:
	95	print "Usage: shuffle_truncate_py data1.bin data2.bin data3.bin ..."
	96	else:
	97	if options.count:
	98	report_sample_counts(files)
	99	elif options.cutoff:
	100	shuffle_truncate_store_individually(files, options.cutoff)
	101	else:
	102	shuffle_truncate_store(files)