support manual outlier filtering

author: Bjoern B. Brandenburg <bbb@cs.unc.edu> 2011-02-22 17:12:40 -0500
committer: Bjoern B. Brandenburg <bbb@cs.unc.edu> 2011-02-22 17:14:32 -0500
commit: 05a5ab2cd71e9c68a7002e1e8a89b887afc4240f (patch)
tree: 0e5f02468bfc960956d7d2cbbe93dfd03f530625
parent: 22e1c3b33984da853d1403843d9d2ce7a596c335 (diff)
2 files changed, 47 insertions, 4 deletions
diff --git a/binary_data.py b/binary_data.py
index 152afb7..39d6e05 100644
--- a/binary_data.py
+++ b/binary_data.py
@@ -5,7 +5,7 @@ from stats import iqr_remove_outliers, iqr_cutoff
 def get_data(fname, scale, extent, cutoff=None, maxval=1000.0,
-             stdev=False):
+             stdev=False, manual=None):
    data = load_binary_file(fname)
    if cutoff and len(data) > cutoff:
@@ -31,6 +31,9 @@ def get_data(fname, scale, extent, cutoff=None, maxval=1000.0,
    min_idx, max_idx = numpy.searchsorted(data, [lower, upper])
+    if manual:
+        max_idx -= manual
    return [data, max_idx, min_idx, upper, lower]
diff --git a/oplot.py b/oplot.py
index 63589b8..249d079 100755
--- a/oplot.py
+++ b/oplot.py
@@ -10,7 +10,7 @@ from binary_data import get_data
 from math import ceil
 import numpy
+import csv
 from os.path  import splitext, basename
 from optparse import make_option as o
@@ -35,6 +35,12 @@ options = [
    o('-c', '--cut-off', action='store', dest='cutoff', type='int',
      help='max number of samples to use'),
+    o('-t', '--take-off', action='store', dest='take_off', type='int',
+      help='manual number of outlier samples to discard'),
+    o('-o', '--outlier-list', action='store', dest='outlier_file',
+      help='list of outliers to remove'),
    o('-x', '--xmax', action='store', dest='xmax', type='int',
      help='determines x-axis range'),
@@ -57,11 +63,16 @@ defaults = {
    # data processing
    'cycles' : 2128, # per usec
-    'extent' : 3,
+    'extent' : 0,
    'cutoff' : None,
+    'take_off' : None,
    'normalize' : False,
    'use_std'  : False,
+    # manual outlier removal
+    'outlier_file' : None,
+    'outliers' : {},
    # formatting options
    'binsize' : 0.25,
@@ -86,6 +97,19 @@ HOST_CPUS = {
    'ludwig' : 24,
 }
+def load_outliers(fname):
+    outliers = {}
+    for row in csv.reader(open(fname, "r")):
+        sched = row[0]
+        n     = int(row[1])
+        cut   = int(row[2])
+        if not sched in outliers:
+            outliers[sched] = []
+        outliers[sched].append((n, cut))
+    return outliers
 def get_stats_label(samples):
    avg = numpy.mean(samples)
    med = numpy.median(samples)
@@ -172,11 +196,20 @@ class OverheadPlotter(defapp.App):
        else:
            scale = 1.0 / self.options.cycles
+        take_off = self.options.take_off
+        if conf['scheduler'] in self.options.outliers:
+            n = int(conf['n'])
+            for (i, t) in self.options.outliers[conf['scheduler']]:
+                if i == n:
+                    take_off = t
+                    break
        data, max_idx, min_idx, iqr_max, iqr_min = get_data(datafile,
                                                            scale,
                                                            extent=self.options.extent,
                                                            cutoff=self.options.cutoff,
-                                                            stdev=self.options.use_std)
+                                                            stdev=self.options.use_std,
+                                                            manual=take_off)
        samples = data[min_idx:max_idx]
        discarded = (len(data) - len(samples)) / float(len(data)) * 100
@@ -187,9 +220,13 @@ class OverheadPlotter(defapp.App):
        samples_label = "samples: total=%d filtered=%d (%.2f%%)" % \
            (len(data), len(data) -  len(samples), discarded)
        if self.options.extent:
            iqr_label = "IQR: extent=%d threshold=%.2fus" % \
                (self.options.extent, iqr_max)
+        elif take_off:
+            iqr_label = "%s outlier%s manually removed" % \
+                (take_off, '' if take_off == 1 else 's')
        elif discarded > 0:
            iqr_label = "manual threshold=1000us [IQR not applied]"
        else:
@@ -336,6 +373,9 @@ class OverheadPlotter(defapp.App):
        self.tmpfiles = []
    def default(self, _):
+        if self.options.outlier_file:
+            self.options.outliers = load_outliers(self.options.outlier_file)
        for i, datafile in enumerate(self.args):
            self.out("[%d/%d] Processing %s ..." % (i + 1, len(self.args), datafile))
            self.plot_file(datafile)
author	Bjoern B. Brandenburg <bbb@cs.unc.edu>	2011-02-22 17:12:40 -0500
committer	Bjoern B. Brandenburg <bbb@cs.unc.edu>	2011-02-22 17:14:32 -0500
commit	05a5ab2cd71e9c68a7002e1e8a89b887afc4240f (patch)
tree	0e5f02468bfc960956d7d2cbbe93dfd03f530625
parent	22e1c3b33984da853d1403843d9d2ce7a596c335 (diff)

diff --git a/binary_data.py b/binary_data.py index 152afb7..39d6e05 100644 --- a/binary_data.py +++ b/binary_data.py
@@ -5,7 +5,7 @@ from stats import iqr_remove_outliers, iqr_cutoff
5		5
6		6
7	def get_data(fname, scale, extent, cutoff=None, maxval=1000.0,	7	def get_data(fname, scale, extent, cutoff=None, maxval=1000.0,
8	stdev=False):	8	stdev=False, manual=None):
9	data = load_binary_file(fname)	9	data = load_binary_file(fname)
10		10
11	if cutoff and len(data) > cutoff:	11	if cutoff and len(data) > cutoff:
@@ -31,6 +31,9 @@ def get_data(fname, scale, extent, cutoff=None, maxval=1000.0,
31		31
32	min_idx, max_idx = numpy.searchsorted(data, [lower, upper])	32	min_idx, max_idx = numpy.searchsorted(data, [lower, upper])
33		33
		34	if manual:
		35	max_idx -= manual
		36
34	return [data, max_idx, min_idx, upper, lower]	37	return [data, max_idx, min_idx, upper, lower]
35		38
36		39


diff --git a/oplot.py b/oplot.py index 63589b8..249d079 100755 --- a/oplot.py +++ b/oplot.py
@@ -10,7 +10,7 @@ from binary_data import get_data
10	from math import ceil	10	from math import ceil
11		11
12	import numpy	12	import numpy
13		13	import csv
14	from os.path import splitext, basename	14	from os.path import splitext, basename
15	from optparse import make_option as o	15	from optparse import make_option as o
16		16
@@ -35,6 +35,12 @@ options = [
35	o('-c', '--cut-off', action='store', dest='cutoff', type='int',	35	o('-c', '--cut-off', action='store', dest='cutoff', type='int',
36	help='max number of samples to use'),	36	help='max number of samples to use'),
37		37
		38	o('-t', '--take-off', action='store', dest='take_off', type='int',
		39	help='manual number of outlier samples to discard'),
		40
		41	o('-o', '--outlier-list', action='store', dest='outlier_file',
		42	help='list of outliers to remove'),
		43
38	o('-x', '--xmax', action='store', dest='xmax', type='int',	44	o('-x', '--xmax', action='store', dest='xmax', type='int',
39	help='determines x-axis range'),	45	help='determines x-axis range'),
40		46
@@ -57,11 +63,16 @@ defaults = {
57		63
58	# data processing	64	# data processing
59	'cycles' : 2128, # per usec	65	'cycles' : 2128, # per usec
60	'extent' : 3,	66	'extent' : 0,
61	'cutoff' : None,	67	'cutoff' : None,
		68	'take_off' : None,
62	'normalize' : False,	69	'normalize' : False,
63	'use_std' : False,	70	'use_std' : False,
64		71
		72	# manual outlier removal
		73	'outlier_file' : None,
		74	'outliers' : {},
		75
65	# formatting options	76	# formatting options
66	'binsize' : 0.25,	77	'binsize' : 0.25,
67		78
@@ -86,6 +97,19 @@ HOST_CPUS = {
86	'ludwig' : 24,	97	'ludwig' : 24,
87	}	98	}
88		99
		100
		101	def load_outliers(fname):
		102	outliers = {}
		103	for row in csv.reader(open(fname, "r")):
		104	sched = row[0]
		105	n = int(row[1])
		106	cut = int(row[2])
		107	if not sched in outliers:
		108	outliers[sched] = []
		109	outliers[sched].append((n, cut))
		110	return outliers
		111
		112
89	def get_stats_label(samples):	113	def get_stats_label(samples):
90	avg = numpy.mean(samples)	114	avg = numpy.mean(samples)
91	med = numpy.median(samples)	115	med = numpy.median(samples)
@@ -172,11 +196,20 @@ class OverheadPlotter(defapp.App):
172	else:	196	else:
173	scale = 1.0 / self.options.cycles	197	scale = 1.0 / self.options.cycles
174		198
		199	take_off = self.options.take_off
		200	if conf['scheduler'] in self.options.outliers:
		201	n = int(conf['n'])
		202	for (i, t) in self.options.outliers[conf['scheduler']]:
		203	if i == n:
		204	take_off = t
		205	break
		206
175	data, max_idx, min_idx, iqr_max, iqr_min = get_data(datafile,	207	data, max_idx, min_idx, iqr_max, iqr_min = get_data(datafile,
176	scale,	208	scale,
177	extent=self.options.extent,	209	extent=self.options.extent,
178	cutoff=self.options.cutoff,	210	cutoff=self.options.cutoff,
179	stdev=self.options.use_std)	211	stdev=self.options.use_std,
		212	manual=take_off)
180		213
181	samples = data[min_idx:max_idx]	214	samples = data[min_idx:max_idx]
182	discarded = (len(data) - len(samples)) / float(len(data)) * 100	215	discarded = (len(data) - len(samples)) / float(len(data)) * 100
@@ -187,9 +220,13 @@ class OverheadPlotter(defapp.App):
187	samples_label = "samples: total=%d filtered=%d (%.2f%%)" % \	220	samples_label = "samples: total=%d filtered=%d (%.2f%%)" % \
188	(len(data), len(data) - len(samples), discarded)	221	(len(data), len(data) - len(samples), discarded)
189		222
		223
190	if self.options.extent:	224	if self.options.extent:
191	iqr_label = "IQR: extent=%d threshold=%.2fus" % \	225	iqr_label = "IQR: extent=%d threshold=%.2fus" % \
192	(self.options.extent, iqr_max)	226	(self.options.extent, iqr_max)
		227	elif take_off:
		228	iqr_label = "%s outlier%s manually removed" % \
		229	(take_off, '' if take_off == 1 else 's')
193	elif discarded > 0:	230	elif discarded > 0:
194	iqr_label = "manual threshold=1000us [IQR not applied]"	231	iqr_label = "manual threshold=1000us [IQR not applied]"
195	else:	232	else:
@@ -336,6 +373,9 @@ class OverheadPlotter(defapp.App):
336	self.tmpfiles = []	373	self.tmpfiles = []
337		374
338	def default(self, _):	375	def default(self, _):
		376	if self.options.outlier_file:
		377	self.options.outliers = load_outliers(self.options.outlier_file)
		378
339	for i, datafile in enumerate(self.args):	379	for i, datafile in enumerate(self.args):
340	self.out("[%d/%d] Processing %s ..." % (i + 1, len(self.args), datafile))	380	self.out("[%d/%d] Processing %s ..." % (i + 1, len(self.args), datafile))
341	self.plot_file(datafile)	381	self.plot_file(datafile)