aboutsummaryrefslogtreecommitdiffstats
path: root/litmus/gpu_affinity.c
diff options
context:
space:
mode:
Diffstat (limited to 'litmus/gpu_affinity.c')
-rw-r--r--litmus/gpu_affinity.c231
1 files changed, 231 insertions, 0 deletions
diff --git a/litmus/gpu_affinity.c b/litmus/gpu_affinity.c
new file mode 100644
index 000000000000..7d73105b4181
--- /dev/null
+++ b/litmus/gpu_affinity.c
@@ -0,0 +1,231 @@
1
2#ifdef CONFIG_LITMUS_NVIDIA
3
4#include <linux/sched.h>
5#include <litmus/litmus.h>
6#include <litmus/gpu_affinity.h>
7
8#include <litmus/sched_trace.h>
9
10#define OBSERVATION_CAP ((lt_t)(2e9))
11
12// reason for skew: high outliers are less
13// frequent and way out of bounds
14//#define HI_THRESHOLD 2
15//#define LO_THRESHOLD 4
16
17#define NUM_STDEV_NUM 1
18#define NUM_STDEV_DENOM 2
19
/* Minimum of two values.
 * Arguments and the whole expansion are fully parenthesized so that
 * operator precedence inside 'a'/'b' cannot change the result
 * (the original form broke on e.g. MIN(x & 1, y)).
 * NOTE: each argument may still be evaluated twice -- do not pass
 * expressions with side effects.
 */
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
21
22static fp_t update_estimate(feedback_est_t* fb, fp_t a, fp_t b, lt_t observed)
23{
24 fp_t relative_err;
25 fp_t err, new;
26 fp_t actual = _integer_to_fp(observed);
27
28 err = _sub(actual, fb->est);
29 new = _add(_mul(a, err), _mul(b, fb->accum_err));
30
31 relative_err = _div(err, actual);
32
33 fb->est = new;
34 fb->accum_err = _add(fb->accum_err, err);
35
36 return relative_err;
37}
38
39lt_t varience(lt_t nums[], const lt_t avg, const uint16_t count)
40{
41 /* brute force: takes about as much time as incremental running methods when
42 * count < 50 (on Bonham). Brute force also less prone to overflow.
43 */
44 lt_t sqdeviations = 0;
45 uint16_t i;
46 for(i = 0; i < count; ++i)
47 {
48 lt_t temp = (int64_t)nums[i] - (int64_t)avg;
49 sqdeviations += temp * temp;
50 }
51 return sqdeviations/count;
52}
53
/* Integer square root: returns floor(sqrt(n)).
 *
 * NOTE(review): despite the original comment, this is the bit-by-bit
 * ("binary digit-by-digit" / restoring) integer square root, not the
 * Babylonian/Newton iteration -- the algorithm shape (power-of-4 probe
 * bit walked down two bits per step) matches the binary method on the
 * Wikipedia "methods of computing square roots" page.
 */
lt_t isqrt(lt_t n)
{
	/* integer square root using babylonian method
	 * (algo taken from wikipedia */
	lt_t res = 0;
	/* start probe at the highest power of 4 representable in lt_t */
	lt_t bit = ((lt_t)1) << (sizeof(n)*8-2);
	/* lower the probe until it is <= n */
	while (bit > n) {
		bit >>= 2;
	}

	/* accumulate result one bit per iteration, subtracting matched
	 * squares out of n as we go */
	while (bit != 0) {
		if (n >= res + bit) {
			n -= res + bit;
			res = (res >> 1) + bit;
		}
		else {
			res >>= 1;
		}
		bit >>= 2;
	}
	return res;
}
76
/* Fold one observed GPU cost sample into task t's sliding-window average
 * estimate for the task's current migration distance.
 *
 * Pipeline: (1) drop samples above OBSERVATION_CAP; (2) once the window
 * has enough samples, drop outliers outside avg +/- (std * 1/2);
 * (3) insert the sample into the AVG_EST_WINDOW_SIZE ring buffer and
 * recompute sum, avg, and std (via varience()/isqrt()).
 *
 * Side effects: emits sched_trace_migration() for every accepted sample
 * and TRACE_TASK log lines for drops and the final estimate.
 */
void update_gpu_estimate(struct task_struct *t, lt_t observed)
{
	//feedback_est_t *fb = &(tsk_rt(t)->gpu_migration_est[tsk_rt(t)->gpu_migration]);
	avg_est_t *est;
	struct migration_info mig_info;

	BUG_ON(tsk_rt(t)->gpu_migration > MIG_LAST);

	/* per-task, per-migration-distance estimator slot */
	est = &(tsk_rt(t)->gpu_migration_est[tsk_rt(t)->gpu_migration]);

	/* hard cap: discard absurdly large observations outright */
	if (unlikely(observed > OBSERVATION_CAP)) {
		TRACE_TASK(t, "Crazy observation greater than was dropped: %llu > %llu\n",
			observed,
			OBSERVATION_CAP);
		return;
	}

	/* NOTE(review): the #if 0 block below is an abandoned ratio-based
	 * outlier filter; it contains an unbalanced '{' and must stay
	 * disabled unless repaired. */
#if 0
	// filter out values that are HI_THRESHOLDx or (1/LO_THRESHOLD)x out
	// of range of the average, but only filter if enough samples
	// have been taken.
	if (likely((est->count > MIN(10, AVG_EST_WINDOW_SIZE/2)))) {
		if (unlikely(observed < est->avg/LO_THRESHOLD)) {
			TRACE_TASK(t, "Observation is too small: %llu\n",
				observed);
			return;
		}
		else if (unlikely(observed > est->avg*HI_THRESHOLD)) {
			TRACE_TASK(t, "Observation is too large: %llu\n",
				observed);
			return;
		}
#endif
	// filter values outside NUM_STDEVx the standard deviation,
	// but only filter if enough samples have been taken.
	if (likely((est->count > MIN(10, AVG_EST_WINDOW_SIZE/2)))) {
		lt_t lower, upper;

		/* half a standard deviation (NUM_STDEV_NUM/NUM_STDEV_DENOM) */
		lt_t range = (est->std*NUM_STDEV_NUM)/NUM_STDEV_DENOM;
		lower = est->avg - MIN(range, est->avg); // no underflow.

		if (unlikely(observed < lower)) {
			TRACE_TASK(t, "Observation is too small: %llu\n", observed);
			return;
		}

		upper = est->avg + range;
		if (unlikely(observed > upper)) {
			TRACE_TASK(t, "Observation is too large: %llu\n", observed);
			return;
		}
	}



	/* ring buffer: grow until full, then evict the slot being
	 * overwritten from the running sum */
	if (unlikely(est->count < AVG_EST_WINDOW_SIZE)) {
		++est->count;
	}
	else {
		est->sum -= est->history[est->idx];
	}

	/* trace the accepted sample against the pre-update estimate */
	mig_info.observed = observed;
	mig_info.estimated = est->avg;
	mig_info.distance = tsk_rt(t)->gpu_migration;
	sched_trace_migration(t, &mig_info);


	/* insert sample and recompute window statistics */
	est->history[est->idx] = observed;
	est->sum += observed;
	est->avg = est->sum/est->count;
	est->std = isqrt(varience(est->history, est->avg, est->count));
	est->idx = (est->idx + 1) % AVG_EST_WINDOW_SIZE;


	/* dead code: earlier feedback-controller (PI) estimator, kept
	 * disabled for reference */
#if 0
	if(unlikely(fb->est.val == 0)) {
		// kludge-- cap observed values to prevent whacky estimations.
		// whacky stuff happens during the first few jobs.
		if(unlikely(observed > OBSERVATION_CAP)) {
			TRACE_TASK(t, "Crazy observation was capped: %llu -> %llu\n",
				observed, OBSERVATION_CAP);
			observed = OBSERVATION_CAP;
		}

		// take the first observation as our estimate
		// (initial value of 0 was bogus anyhow)
		fb->est = _integer_to_fp(observed);
		fb->accum_err = _div(fb->est, _integer_to_fp(2)); // ...seems to work.
	}
	else {
		fp_t rel_err = update_estimate(fb,
			tsk_rt(t)->gpu_fb_param_a[tsk_rt(t)->gpu_migration],
			tsk_rt(t)->gpu_fb_param_b[tsk_rt(t)->gpu_migration],
			observed);

		if(unlikely(_fp_to_integer(fb->est) <= 0)) {
			TRACE_TASK(t, "Invalid estimate. Patching.\n");
			fb->est = _integer_to_fp(observed);
			fb->accum_err = _div(fb->est, _integer_to_fp(2)); // ...seems to work.
		}
		else {
			struct migration_info mig_info;

			sched_trace_prediction_err(t,
				&(tsk_rt(t)->gpu_migration),
				&rel_err);

			mig_info.observed = observed;
			mig_info.estimated = get_gpu_estimate(t, tsk_rt(t)->gpu_migration);
			mig_info.distance = tsk_rt(t)->gpu_migration;

			sched_trace_migration(t, &mig_info);
		}
	}
#endif

	TRACE_TASK(t, "GPU est update after (dist = %d, obs = %llu): %llu\n",
		tsk_rt(t)->gpu_migration,
		observed,
		est->avg);
}
199
200gpu_migration_dist_t gpu_migration_distance(int a, int b)
201{
202 // GPUs organized in a binary hierarchy, no more than 2^MIG_FAR GPUs
203 int i;
204 int dist;
205
206 if(likely(a >= 0 && b >= 0)) {
207 for(i = 0; i <= MIG_FAR; ++i) {
208 if(a>>i == b>>i) {
209 dist = i;
210 goto out;
211 }
212 }
213 dist = MIG_NONE; // hopefully never reached.
214 TRACE_CUR("WARNING: GPU distance too far! %d -> %d\n", a, b);
215 }
216 else {
217 dist = MIG_NONE;
218 }
219
220out:
221 TRACE_CUR("Distance %d -> %d is %d\n",
222 a, b, dist);
223
224 return dist;
225}
226
227
228
229
230#endif
231