From 5e5fc221d60f46951a360f3e4f637e0edc084daf Mon Sep 17 00:00:00 2001
From: "Bjoern B. Brandenburg" <bbb@cs.unc.edu>
Date: Sat, 19 Feb 2011 21:45:54 -0500
Subject: improve data and visualization mangling in oplot.py

Also, avoid duplicating the binary data parsing code: oplot.py now
imports get_data() from binary_data.py instead of defining its own copy.
---
 binary_data.py | 18 ++++++++++++++++--
 oplot.py       | 60 ++++++++++++++++++++++++++++++++++++----------------------
 2 files changed, 53 insertions(+), 25 deletions(-)

diff --git a/binary_data.py b/binary_data.py
index d4be159..6108816 100644
--- a/binary_data.py
+++ b/binary_data.py
@@ -3,18 +3,32 @@ import numpy
 from util import load_binary_file
 from stats import iqr_remove_outliers, iqr_cutoff
 
-def compact_file(fname, scale=None, extend=1.5):
+
+def get_data(fname, scale, extent, cutoff=None, maxval=1000.0):
     data = load_binary_file(fname)
 
+    if cutoff and len(data) > cutoff:
+        data = data[:cutoff]
+
     if not scale is None:
         data *= scale
 
     data.sort()
 
-    iqr_min, iqr_max = iqr_cutoff(data, extend)
+    if extent:
+        iqr_min, iqr_max = iqr_cutoff(data, extent)
+    else:
+        iqr_min = 0
+        iqr_max = maxval
 
     min_idx, max_idx = numpy.searchsorted(data, [iqr_min, iqr_max])
 
+    return [data, max_idx, min_idx, iqr_max, iqr_min]
+
+
+def compact_file(*args, **kargs):
+    data, max_idx, min_idx, iqr_max, iqr_min = get_data(*args, **kargs)
+
     samples = data[min_idx:max_idx]
 
     filtered = len(data) - len(samples)
diff --git a/oplot.py b/oplot.py
index e0b4c69..89e0ef0 100755
--- a/oplot.py
+++ b/oplot.py
@@ -5,6 +5,8 @@ from plot import decode
 from util import load_csv_file, load_binary_file, write_csv_file
 from stats import iqr_cutoff
 
+from binary_data import get_data
+
 from math import ceil
 
 import numpy
@@ -24,9 +26,21 @@ options = [
     o('-i', '--iqr-extent', action='store', dest='extent', type='float',
       help='what extent to use for outlier removal'),
 
+    o('-n', '--normalize', action='store_true', dest='normalize',
+      help='use normalize counts'),
+
     o('-c', '--cut-off', action='store', dest='cutoff', type='int',
       help='max number of samples to use'),
 
+    o('-x', '--xmax', action='store', dest='xmax', type='int',
+      help='determines x-axis range'),
+
+    o('-y', '--ymax', action='store', dest='ymax', type='float',
+      help='determines y-axis range'),
+
+    o('-b', '--binsize', action='store', dest='binsize', type='float',
+      help='set binsize of histogram'),
+
     ]
 
 defaults = {
@@ -39,9 +53,14 @@ defaults = {
     'cycles' : 2128, # per usec
     'extent' : 3,
     'cutoff' : None,
+    'normalize' : False,
 
     # formatting options
     'binsize' : 0.25,
+
+    'xmax'    : None,
+    'ymax'    : None,
+
     }
 
 
@@ -55,19 +74,6 @@ TXT = {
     'TICK' : 'timer tick overhead',
 }
 
-def get_data(fname, scale, extend):
-    data = load_binary_file(fname)
-
-    if not scale is None:
-        data *= scale
-
-    data.sort()
-
-    iqr_min, iqr_max = iqr_cutoff(data, extend)
-    min_idx, max_idx = numpy.searchsorted(data, [iqr_min, iqr_max])
-
-    return [data, max_idx, min_idx, iqr_max, iqr_min]
-
 
 def get_stats_label(samples):
     avg = numpy.mean(samples)
@@ -114,6 +120,8 @@ class OverheadPlotter(defapp.App):
 
     def write_histogram(self, samples, name, labels=10):
         max = ceil(numpy.amax(samples))
+        if self.options.xmax:
+            max = self.options.xmax
         bin_size = self.options.binsize
         num_bins  = int(max / bin_size)
         (bins, edges) = numpy.histogram(samples, bins=num_bins,
@@ -127,6 +135,10 @@ class OverheadPlotter(defapp.App):
             cumulative += bins[i]
             data[i, 2]  = cumulative
 
+        if self.options.normalize:
+            data[:, 1] /= len(samples)
+            data[:, 2] /= len(samples)
+
         label_rate = len(bins) / labels
         if not label_rate:
             label_rate = 1
@@ -135,7 +147,7 @@ class OverheadPlotter(defapp.App):
             label = '%.2f' % row[0] if i % label_rate == 0 else ''
             for_file.append([row[0], row[1], row[2], label])
 
-        return (data, self.write(for_file, name, ext='hist'))
+        return (data, self.write(for_file, name, ext='hist'), edges)
 
     def render(self, p):
         if self.options.save_script:
@@ -151,10 +163,8 @@ class OverheadPlotter(defapp.App):
 
         data, max_idx, min_idx, iqr_max, iqr_min = get_data(datafile,
                                                             scale,
-                                                            self.options.extent)
-
-        if self.options.cutoff and len(data) > self.options.cutoff:
-            data = data[:self.options.cutoff]
+                                                            self.options.extent,
+                                                            self.options.cutoff)
 
         samples = data[min_idx:max_idx]
         discarded = (len(data) - len(samples)) / float(len(data)) * 100
@@ -175,20 +185,24 @@ class OverheadPlotter(defapp.App):
                     label(0.98, 0.95, iqr_label,
                           coord=['graph', 'graph'], align='right')]
 
-        (hist, fname) = self.write_histogram(samples, name)
+        (hist, fname, edges) = self.write_histogram(samples, name)
 
         p.setup_histogram(gap=1, boxwidth=1.0)
 
         p.title = "%s: measured %s for %s tasks per processor (host=%s)" \
             % (conf['scheduler'], TXT[conf['overhead']], conf['n'], conf['host'])
 
-        p.ylabel = "number of samples"
+        if self.options.normalize:
+            p.ylabel = "fraction of samples"
+        else:
+            p.ylabel = "number of samples"
         p.xlabel = "overhead in microseconds (bin size = %.2fus)" \
             % self.options.binsize
-#            p.xrange = (0, ceil(max_cost))
+
+        if self.options.ymax:
+            p.yrange = (0, self.options.ymax)
+#        p.yrange = (0, (ceil(numpy.amax(hist[:,1]) / 100.0) * 100))
         p.xticks = (0, 10)
-#            p.yticks = (0, 1)
-        p.yrange = (0, (ceil(numpy.amax(hist[:,1]) / 100.0) * 100))
         p.curves = [curve(histogram=fname, col=2, labels_col=4)]
 
         #### Styling.
-- 
cgit v1.2.2