aboutsummaryrefslogtreecommitdiffstats
path: root/litmus/gpu_affinity.c
diff options
context:
space:
mode:
Diffstat (limited to 'litmus/gpu_affinity.c')
-rw-r--r--litmus/gpu_affinity.c238
1 files changed, 121 insertions, 117 deletions
diff --git a/litmus/gpu_affinity.c b/litmus/gpu_affinity.c
index 7d73105b4181..7a2b5005be45 100644
--- a/litmus/gpu_affinity.c
+++ b/litmus/gpu_affinity.c
@@ -9,33 +9,10 @@
9 9
10#define OBSERVATION_CAP ((lt_t)(2e9)) 10#define OBSERVATION_CAP ((lt_t)(2e9))
11 11
12// reason for skew: high outliers are less 12#define MIN_OBSERVATIONS 10
13// frequent and way out of bounds
14//#define HI_THRESHOLD 2
15//#define LO_THRESHOLD 4
16
17#define NUM_STDEV_NUM 1
18#define NUM_STDEV_DENOM 2
19 13
20#define MIN(a, b) ((a < b) ? a : b) 14#define MIN(a, b) ((a < b) ? a : b)
21 15
22static fp_t update_estimate(feedback_est_t* fb, fp_t a, fp_t b, lt_t observed)
23{
24 fp_t relative_err;
25 fp_t err, new;
26 fp_t actual = _integer_to_fp(observed);
27
28 err = _sub(actual, fb->est);
29 new = _add(_mul(a, err), _mul(b, fb->accum_err));
30
31 relative_err = _div(err, actual);
32
33 fb->est = new;
34 fb->accum_err = _add(fb->accum_err, err);
35
36 return relative_err;
37}
38
39lt_t varience(lt_t nums[], const lt_t avg, const uint16_t count) 16lt_t varience(lt_t nums[], const lt_t avg, const uint16_t count)
40{ 17{
41 /* brute force: takes about as much time as incremental running methods when 18 /* brute force: takes about as much time as incremental running methods when
@@ -74,127 +51,154 @@ lt_t isqrt(lt_t n)
74 return res; 51 return res;
75} 52}
76 53
77void update_gpu_estimate(struct task_struct *t, lt_t observed) 54static lt_t avg_method(struct task_struct *t, lt_t observed)
78{ 55{
79 //feedback_est_t *fb = &(tsk_rt(t)->gpu_migration_est[tsk_rt(t)->gpu_migration]); 56 avg_est_t *est = &(tsk_rt(t)->gpu_avg_est[tsk_rt(t)->gpu_migration]);
80 avg_est_t *est; 57
81 struct migration_info mig_info; 58 /* cut-off */
82 59 if (tsk_rt(t)->prediction_mode != SIMPLE_AVG &&
83 BUG_ON(tsk_rt(t)->gpu_migration > MIG_LAST); 60 est->history.count >= MIN_OBSERVATIONS) {
61 lt_t upper = est->center_line + est->std * est->sigmas;
62 lt_t lower = est->center_line - est->std * est->sigmas;
63 if (unlikely(observed < lower || observed > upper)) {
64 TRACE_TASK(t, "%llu out of range [%llu, %llu]\n",
65 observed, lower, upper);
66 return 0;
67 }
68 }
84 69
85 est = &(tsk_rt(t)->gpu_migration_est[tsk_rt(t)->gpu_migration]); 70 if (unlikely(est->history.count < est->history.window_size)) {
71 ++est->history.count;
72 }
73 else {
74 est->sum -= est->history.observations[est->history.idx];
86 75
87 if (unlikely(observed > OBSERVATION_CAP)) { 76 if (tsk_rt(t)->prediction_mode == CC_MR_AVG) {
88 TRACE_TASK(t, "Crazy observation greater than was dropped: %llu > %llu\n", 77 est->rsum -= est->history.range[est->history.ridx];
89 observed, 78 }
90 OBSERVATION_CAP);
91 return;
92 } 79 }
93 80
94#if 0 81 est->history.observations[est->history.idx] = observed;
95 // filter out values that are HI_THRESHOLDx or (1/LO_THRESHOLD)x out 82 est->sum += observed;
96 // of range of the average, but only filter if enough samples 83 est->center_line = est->sum/est->history.count;
97 // have been taken. 84
98 if (likely((est->count > MIN(10, AVG_EST_WINDOW_SIZE/2)))) { 85 if (tsk_rt(t)->prediction_mode == CC_BRUTE_AVG) {
99 if (unlikely(observed < est->avg/LO_THRESHOLD)) { 86 if (likely(est->history.count > 1)) {
100 TRACE_TASK(t, "Observation is too small: %llu\n", 87 est->std = isqrt(varience(est->history.observations, est->center_line, est->history.count));
101 observed);
102 return;
103 } 88 }
104 else if (unlikely(observed > est->avg*HI_THRESHOLD)) { 89 }
105 TRACE_TASK(t, "Observation is too large: %llu\n", 90 else if (tsk_rt(t)->prediction_mode == CC_MR_AVG) {
106 observed); 91 if (likely(est->history.count > 1)) {
107 return; 92 lt_t mr;
93 est->history.range[est->history.ridx] = (observed > est->history.last_observed) ?
94 observed - est->history.last_observed :
95 est->history.last_observed - observed;
96 est->rsum += est->history.range[est->history.ridx];
97 mr = est->rsum / (est->history.count - 1);
98 est->std = est->rsum + (128*est->rsum)/1000; // std = mr/d_2, w/ d_2 = 1.128
99 est->history.ridx = (est->history.ridx + 1) % (est->history.window_size - 1);
108 } 100 }
109#endif 101 est->history.last_observed = observed;
110 // filter values outside NUM_STDEVx the standard deviation, 102 }
111 // but only filter if enough samples have been taken.
112 if (likely((est->count > MIN(10, AVG_EST_WINDOW_SIZE/2)))) {
113 lt_t lower, upper;
114 103
115 lt_t range = (est->std*NUM_STDEV_NUM)/NUM_STDEV_DENOM; 104 est->history.idx = (est->history.idx + 1) % est->history.window_size;
116 lower = est->avg - MIN(range, est->avg); // no underflow.
117 105
118 if (unlikely(observed < lower)) { 106 return est->center_line;
119 TRACE_TASK(t, "Observation is too small: %llu\n", observed); 107}
120 return;
121 }
122 108
123 upper = est->avg + range; 109static lt_t ewma_method(struct task_struct *t, lt_t observed)
124 if (unlikely(observed > upper)) { 110{
125 TRACE_TASK(t, "Observation is too large: %llu\n", observed); 111 ewma_est_t *est = &(tsk_rt(t)->gpu_ewma_est[tsk_rt(t)->gpu_migration]);
126 return; 112
113 /* cut-off */
114 if (tsk_rt(t)->prediction_mode != SIMPLE_EWMA &&
115 est->history.count >= MIN_OBSERVATIONS) {
116 lt_t upper = est->center_line + est->std * est->sigmas;
117 lt_t lower = est->center_line - est->std * est->sigmas;
118 if (unlikely(observed < lower || observed > upper)) {
119 TRACE_TASK(t, "%llu out of range [%llu, %llu]\n",
120 observed, lower, upper);
121 return 0;
127 } 122 }
128 } 123 }
129 124
125 if (unlikely(est->history.count < est->history.window_size)) {
126 ++est->history.count;
127 }
128 else if (tsk_rt(t)->prediction_mode == CC_MR_EWMA) {
129 est->rsum -= est->history.range[est->history.ridx];
130 }
130 131
131 132 if (unlikely(est->history.count == 1)) {
132 if (unlikely(est->count < AVG_EST_WINDOW_SIZE)) { 133 est->center_line = observed;
133 ++est->count;
134 } 134 }
135 else { 135 else {
136 est->sum -= est->history[est->idx]; 136 est->center_line =
137 ((est->alpha_num * est->center_line)/est->alpha_denom) +
138 (((est->alpha_denom - est->alpha_num) * observed)/est->alpha_denom);
137 } 139 }
138 140
139 mig_info.observed = observed; 141 if (tsk_rt(t)->prediction_mode == CC_BRUTE_EWMA) {
140 mig_info.estimated = est->avg; 142 est->history.observations[est->history.idx] = observed;
141 mig_info.distance = tsk_rt(t)->gpu_migration; 143 est->std = isqrt(varience(est->history.observations, est->center_line, est->history.count));
142 sched_trace_migration(t, &mig_info); 144 est->history.idx = (est->history.idx + 1) % est->history.window_size;
143
144
145 est->history[est->idx] = observed;
146 est->sum += observed;
147 est->avg = est->sum/est->count;
148 est->std = isqrt(varience(est->history, est->avg, est->count));
149 est->idx = (est->idx + 1) % AVG_EST_WINDOW_SIZE;
150
151
152#if 0
153 if(unlikely(fb->est.val == 0)) {
154 // kludge-- cap observed values to prevent whacky estimations.
155 // whacky stuff happens during the first few jobs.
156 if(unlikely(observed > OBSERVATION_CAP)) {
157 TRACE_TASK(t, "Crazy observation was capped: %llu -> %llu\n",
158 observed, OBSERVATION_CAP);
159 observed = OBSERVATION_CAP;
160 }
161
162 // take the first observation as our estimate
163 // (initial value of 0 was bogus anyhow)
164 fb->est = _integer_to_fp(observed);
165 fb->accum_err = _div(fb->est, _integer_to_fp(2)); // ...seems to work.
166 } 145 }
167 else { 146 else if (tsk_rt(t)->prediction_mode == CC_MR_EWMA) {
168 fp_t rel_err = update_estimate(fb, 147 if (likely(est->history.count > 1)) {
169 tsk_rt(t)->gpu_fb_param_a[tsk_rt(t)->gpu_migration], 148 lt_t mr;
170 tsk_rt(t)->gpu_fb_param_b[tsk_rt(t)->gpu_migration], 149 est->history.range[est->history.ridx] = (observed > est->history.last_observed) ?
171 observed); 150 observed - est->history.last_observed :
172 151 est->history.last_observed - observed;
173 if(unlikely(_fp_to_integer(fb->est) <= 0)) { 152 est->rsum += est->history.range[est->history.ridx];
174 TRACE_TASK(t, "Invalid estimate. Patching.\n"); 153 mr = est->rsum / (est->history.count - 1);
175 fb->est = _integer_to_fp(observed); 154 est->std = est->rsum + (128*est->rsum)/1000; // std = mr/d_2, w/ d_2 = 1.128
176 fb->accum_err = _div(fb->est, _integer_to_fp(2)); // ...seems to work. 155 est->history.ridx = (est->history.ridx + 1) % (est->history.window_size - 1);
177 } 156 }
178 else { 157 est->history.last_observed = observed;
179 struct migration_info mig_info; 158 }
180 159
181 sched_trace_prediction_err(t, 160 return est->center_line;
182 &(tsk_rt(t)->gpu_migration), 161}
183 &rel_err);
184 162
185 mig_info.observed = observed; 163void update_gpu_estimate(struct task_struct *t, lt_t observed)
186 mig_info.estimated = get_gpu_estimate(t, tsk_rt(t)->gpu_migration); 164{
187 mig_info.distance = tsk_rt(t)->gpu_migration; 165 lt_t new_est;
166 struct migration_info mig_info;
188 167
189 sched_trace_migration(t, &mig_info); 168 BUG_ON(tsk_rt(t)->gpu_migration > MIG_LAST);
190 } 169
170 if (unlikely(observed > OBSERVATION_CAP)) {
171 TRACE_TASK(t, "Crazy observation greater than was dropped: %llu > %llu\n",
172 observed,
173 OBSERVATION_CAP);
174 return;
191 } 175 }
192#endif 176
177 switch (tsk_rt(t)->prediction_mode) {
178 case SIMPLE_AVG:
179 case CC_BRUTE_AVG:
180 case CC_MR_AVG:
181 new_est = avg_method(t, observed);
182 break;
183 case SIMPLE_EWMA:
184 case CC_BRUTE_EWMA:
185 case CC_MR_EWMA:
186 new_est = ewma_method(t, observed);
187 break;
188 default:
189 BUG();
190 break;
191 }
192
193 mig_info.observed = observed;
194 mig_info.estimated = new_est;
195 mig_info.distance = tsk_rt(t)->gpu_migration;
196 sched_trace_migration(t, &mig_info);
193 197
194 TRACE_TASK(t, "GPU est update after (dist = %d, obs = %llu): %llu\n", 198 TRACE_TASK(t, "GPU est update after (dist = %d, obs = %llu): %llu\n",
195 tsk_rt(t)->gpu_migration, 199 tsk_rt(t)->gpu_migration,
196 observed, 200 observed,
197 est->avg); 201 new_est);
198} 202}
199 203
200gpu_migration_dist_t gpu_migration_distance(int a, int b) 204gpu_migration_dist_t gpu_migration_distance(int a, int b)