smt_analysis/computeSMTslowdown.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155

#!/usr/bin/python3
from typing import List, Any
import numpy as np
from scipy import stats
import sys
import plotille.plotille as plt
TIMING_ERROR = 1000 #ns
LEVEL_C_ANALYSIS = False
from libSMT import *

def print_usage():
    """Write the command-line synopsis for both analysis modes to stderr.

    The Level-A/B form takes an optional trailing flag: ``--cij`` prints the
    paired times instead of the slowdown values, and ``--cij-ok`` additionally
    prints ``0`` for pairs whose baseline lengths differ by 10x or more.
    """
    program = sys.argv[0]
    usage_lines = (
        "This program takes in the all-pairs and baseline SMT data and computes how much each program is slowed when SMT is enabled.",
        "Level-A/B usage: {} <file -A> <file -B> <baseline file> [--cij | --cij-ok]".format(program),
        "Level-C usage: {} <continuous pairs> <baseline file>".format(program),
    )
    for line in usage_lines:
        print(line, file=sys.stderr)

# Check that we got the right number of parameters: the shortest valid
# invocation is Level-C, i.e. the program name plus two file arguments.
if len(sys.argv) < 3:
    print_usage()
    # sys.exit() is used instead of the site-provided exit(), which is intended
    # for the interactive shell; a bad invocation is reported as a failure.
    sys.exit(1)

# Exactly two arguments select Level-C; anything longer selects Level-A/B.
LEVEL_C_ANALYSIS = len(sys.argv) <= 3
if LEVEL_C_ANALYSIS:
    print("Analyzing results using Level-C methodology...")
else:
    print("Analyzing results using Level-A/B methodology...")

# Every argument except the last one is handed to the input-file validator.
# NOTE(review): the final argument is never validated here; when no flag is
# given that final argument is the baseline file - confirm this is intended.
assert_valid_input_files(sys.argv[1:-1], print_usage)

# Optional flag in the fifth position: print Cij values rather than Mij.
# "--cij" is matched as a substring, so "--cij-ok" enables TIMES_ONLY as well.
TIMES_ONLY = len(sys.argv) > 4 and "--cij" in sys.argv[4]
OK_PAIRS_ONLY = len(sys.argv) > 4 and "--cij-ok" in sys.argv[4]

# Load the baseline and paired measurements for the selected methodology
if LEVEL_C_ANALYSIS:
    # Level-C: the paired times are stored using (an abuse of) the baseline file format
    baseline_times, baseline_sample_cnt, baseline_max_times = load_baseline(sys.argv[2])
    paired_times, name_to_idx, idx_to_name = load_fake_paired(sys.argv[1])
else:
    baseline_times, baseline_sample_cnt, baseline_max_times = load_baseline(sys.argv[3])
    baseline_benchmark_count = len(list(baseline_times.keys()))
    paired_times, paired_offsets, name_to_idx, idx_to_name = load_paired(
        sys.argv[1], sys.argv[2], baseline_benchmark_count
    )
    # Echo every baseline benchmark together with its largest sample
    for bench_name in baseline_times:
        print(bench_name, max(baseline_times[bench_name]))

# The analysis only works if the baseline covers the same benchmarks as the pairs
assert_base_and_pair_keys_match(baseline_times, name_to_idx)

# Keep only the benchmarks whose shortest baseline sample exceeds ten times
# the timing error (i.e. at least an order of magnitude above it).
min_reliable_time = TIMING_ERROR * 10
reliableNames = [
    idx_to_name[idx]
    for idx in range(len(name_to_idx))
    if min(baseline_times[idx_to_name[idx]]) > min_reliable_time
]

# Compute and print the SMT slowdown table.
# Layout: one row and one column per reliable benchmark.
# Level-A/B cell: M = (Cij - Ci) / Cj, where Ci is the longer and Cj the shorter
#                 of the two baseline samples and Cij is the paired sample.
# Level-C cell:   paired sample divided by the row benchmark's baseline sample.

# Header row (columns are wider when raw times are printed)
header_fmt = "{:<12.12}" if TIMES_ONLY else "{:<10.10}"
print("Bench          ", end=" ")
for name in reliableNames:
    print(header_fmt.format(name), end=" ")
print()

# Table body
sample_f = max # Change this to np.mean to use mean values in Mij generation
M_vals = []
for b1 in reliableNames:
    print("{:<14.14}:".format(b1), end=" ")
    row_idx = name_to_idx[b1]
    for b2 in reliableNames:
        if LEVEL_C_ANALYSIS:
            time_with_smt = sample_f(paired_times[row_idx][name_to_idx[b2]])
            time_wout_smt = sample_f(baseline_times[b1])
            M = time_with_smt / time_wout_smt
            M_vals.append(M)
            print("{:>10.3}".format(M), end=" ")
            continue

        base1 = sample_f(baseline_times[b1])
        base2 = sample_f(baseline_times[b2])
        Ci = max(base1, base2)
        Cj = min(base1, base2)
        Cij = sample_f(paired_times[row_idx][name_to_idx[b2]])
        M = (Cij - Ci) / Cj
        # Tasks with a 10x or larger difference in length are not paired
        within_10x = Cj * 10 > Ci
        beyond_10x = Cj * 10 <= Ci

        # Slowdown column (suppressed when only times are requested)
        if Cij and within_10x:
            M_vals.append(M)
            if not TIMES_ONLY:
                print("{:>10.3}".format(M), end=" ")
        elif not TIMES_ONLY:
            print("{:>10}".format("N/A"), end=" ")

        # Raw paired-time column
        if TIMES_ONLY and (not OK_PAIRS_ONLY or within_10x):
            print("{:>12}".format(Cij), end=" ")
        elif OK_PAIRS_ONLY and beyond_10x:
            print("{:>12}".format("0"), end=" ")
    print()
# Print some statistics about the distribution of the collected M values
Ms = np.asarray(M_vals, dtype=np.float32)
if not M_vals:
    # With no values the percentage lines below would divide by zero, so
    # report the situation and skip the summary instead of crashing.
    print("No M_i:j values were computed; skipping summary statistics.", file=sys.stderr)
    M_vals_to_plot = Ms
else:
    total = len(M_vals)
    print("Average: {:>5.3} with standard deviation {:>5.3} using `{}`".format(np.mean(M_vals), np.std(M_vals), sample_f.__name__))
    if not LEVEL_C_ANALYSIS:
        at_most_zero = np.sum(Ms <= 0)
        above_one = np.sum(Ms > 1)
        print(at_most_zero, "of", total, "M_i:j values are at most zero -", 100*at_most_zero/total, "percent")
        print(above_one, "of", total, "M_i:j values are greater than one -", 100*above_one/total, "percent")
        # Only values in (0, 1] go into the second average and the histogram
        M_vals_to_plot = Ms[np.logical_and(Ms > 0, Ms <= 1)]
    else:
        at_most_one = np.sum(Ms <= 1)
        above_two = np.sum(Ms > 2)
        print(at_most_one, "of", total, "M_i:j values are at most one -", 100*at_most_one/total, "percent")
        print(above_two, "of", total, "M_i:j values are greater than two -", 100*above_two/total, "percent")
        # Level-C keeps every value
        M_vals_to_plot = Ms

    print("Using Sim's analysis, average: {:>5.3} with standard deviation {:>5.3} using `{}`".format(np.mean(M_vals_to_plot), np.std(M_vals_to_plot), sample_f.__name__))
    # Text histogram of the retained values, rendered by plotille
    print(plt.hist(M_vals_to_plot, bins=10))

##### BELOW TEXT IS OLD OFFSET CODE (patched) #####
## This still works, but is hacky and deprecated ##
## PearsonR doesn't work though                  ##
# The trailing `and False` keeps this section permanently disabled.
if not LEVEL_C_ANALYSIS and False:
    benchmarkNames = idx_to_name
    benchmarkCount = len(benchmarkNames)
    numJobs = len(paired_times[0][0])
    # The last job is excluded from every pair because its timing is inaccurate
    lastJob = numJobs - 1

    # Hard-coded benchmark selection used only by this offset dump
    reliableNames = [
        "ndes", "cjpeg_wrbmp", "adpcm_enc", "cjpeg_transupp", "epic",
        "gsm_dec", "h264_dec", "huff_enc", "rijndael_enc", "rijndael_dec",
        "gsm_enc", "ammunition", "mpeg2",
    ]

    #stats.pearsonr(time[b1][b2], oList),

    with open("weakRelPairs_offset.csv", mode="w+") as f3:
        header = ("Benchmark1", "Benchmark2", "minOffset", "maxOffset", "meanOffset", "meddOffset", "stdOffset", "minLength", "maxLength")
        print(*header, sep=",", file=f3)
        for b1 in range(benchmarkCount):
            if benchmarkNames[b1] not in reliableNames:
                continue
            for b2 in range(benchmarkCount):
                if benchmarkNames[b2] not in reliableNames:
                    continue
                oList = paired_offsets[b1][b2][:lastJob]
                jList = paired_times[b1][b2][:lastJob]
#               plt.scatter(oList, jList)
#               plt.title(benchmarkNames[b1] + ", " + benchmarkNames[b2])
#               plt.show()
#               print(benchmarkNames[b1], benchmarkNames[b2], min(oList), max(oList), np.mean(oList), np.median(oList), np.std(oList), stats.pearsonr(jList, oList), stats.spearmanr(jList, oList),  sep=",", file=f3)
                # One CSV row per pair: offset statistics, then the shortest and longest paired time
                print(
                    benchmarkNames[b1], benchmarkNames[b2],
                    min(oList), max(oList), np.mean(oList), np.median(oList), np.std(oList),
                    min(jList), max(jList),
                    sep=",", file=f3,
                )
"""
#with open("reliableGraphs.csv", mode="x") as f3:
        for b1 in range(0, benchmarkCount):
            for b2 in range(0, benchmarkCount):
                if benchmarkNames[b1] in reliableNames and benchmarkNames[b2] in reliableNames:
                    oList = offset[b1][b2][:numJobs - 1]
                    jList=time[b1][b2][:numJobs-1]
                    # offset, time scatterplot
                    plt.scatter(oList, jList)
                    plt.title(benchmarkNames[b1] + " " + benchmarkNames[b2] + " Offsets v. Time")
                    plt.show()
                    #time histogram
                    #plt.hist(jList, bins=10)
                    #plt.title(benchmarkNames[b1] + benchmarkNames[b2] + "Completion Times")
                    #plt.show()
                    #offset histogram
                    #plt.hist(oList, bins=10)
                    #plt.title(benchmarkNames[b1] + benchmarkNames[b2] + "Offsets")
                    #plt.show()
"""