Diffstat (limited to 'litmus/gpu_affinity.c')
-rw-r--r--	litmus/gpu_affinity.c	238
1 file changed, 121 insertions(+), 117 deletions(-)
diff --git a/litmus/gpu_affinity.c b/litmus/gpu_affinity.c
index 7d73105b4181..7a2b5005be45 100644
--- a/litmus/gpu_affinity.c
+++ b/litmus/gpu_affinity.c
@@ -9,33 +9,10 @@
 
 #define OBSERVATION_CAP ((lt_t)(2e9))
 
-// reason for skew: high outliers are less
-// frequent and way out of bounds
-//#define HI_THRESHOLD 2
-//#define LO_THRESHOLD 4
-
-#define NUM_STDEV_NUM 1
-#define NUM_STDEV_DENOM 2
+#define MIN_OBSERVATIONS 10
 
 #define MIN(a, b) ((a < b) ? a : b)
 
-static fp_t update_estimate(feedback_est_t* fb, fp_t a, fp_t b, lt_t observed)
-{
-	fp_t relative_err;
-	fp_t err, new;
-	fp_t actual = _integer_to_fp(observed);
-
-	err = _sub(actual, fb->est);
-	new = _add(_mul(a, err), _mul(b, fb->accum_err));
-
-	relative_err = _div(err, actual);
-
-	fb->est = new;
-	fb->accum_err = _add(fb->accum_err, err);
-
-	return relative_err;
-}
-
 lt_t varience(lt_t nums[], const lt_t avg, const uint16_t count)
 {
 	/* brute force: takes about as much time as incremental running methods when
@@ -74,127 +51,154 @@ lt_t isqrt(lt_t n)
 	return res;
 }
 
-void update_gpu_estimate(struct task_struct *t, lt_t observed)
+static lt_t avg_method(struct task_struct *t, lt_t observed)
 {
-	//feedback_est_t *fb = &(tsk_rt(t)->gpu_migration_est[tsk_rt(t)->gpu_migration]);
-	avg_est_t *est;
-	struct migration_info mig_info;
-
-	BUG_ON(tsk_rt(t)->gpu_migration > MIG_LAST);
+	avg_est_t *est = &(tsk_rt(t)->gpu_avg_est[tsk_rt(t)->gpu_migration]);
+
+	/* cut-off */
+	if (tsk_rt(t)->prediction_mode != SIMPLE_AVG &&
+		est->history.count >= MIN_OBSERVATIONS) {
+		lt_t upper = est->center_line + est->std * est->sigmas;
+		lt_t lower = est->center_line - est->std * est->sigmas;
+		if (unlikely(observed < lower || observed > upper)) {
+			TRACE_TASK(t, "%llu out of range [%llu, %llu]\n",
+				observed, lower, upper);
+			return 0;
+		}
+	}
 
-	est = &(tsk_rt(t)->gpu_migration_est[tsk_rt(t)->gpu_migration]);
+	if (unlikely(est->history.count < est->history.window_size)) {
+		++est->history.count;
+	}
+	else {
+		est->sum -= est->history.observations[est->history.idx];
 
-	if (unlikely(observed > OBSERVATION_CAP)) {
-		TRACE_TASK(t, "Crazy observation greater than was dropped: %llu > %llu\n",
-			observed,
-			OBSERVATION_CAP);
-		return;
+		if (tsk_rt(t)->prediction_mode == CC_MR_AVG) {
+			est->rsum -= est->history.range[est->history.ridx];
+		}
 	}
 
-#if 0
-	// filter out values that are HI_THRESHOLDx or (1/LO_THRESHOLD)x out
-	// of range of the average, but only filter if enough samples
-	// have been taken.
-	if (likely((est->count > MIN(10, AVG_EST_WINDOW_SIZE/2)))) {
-		if (unlikely(observed < est->avg/LO_THRESHOLD)) {
-			TRACE_TASK(t, "Observation is too small: %llu\n",
-				observed);
-			return;
+	est->history.observations[est->history.idx] = observed;
+	est->sum += observed;
+	est->center_line = est->sum/est->history.count;
+
+	if (tsk_rt(t)->prediction_mode == CC_BRUTE_AVG) {
+		if (likely(est->history.count > 1)) {
+			est->std = isqrt(varience(est->history.observations, est->center_line, est->history.count));
 		}
-		else if (unlikely(observed > est->avg*HI_THRESHOLD)) {
-			TRACE_TASK(t, "Observation is too large: %llu\n",
-				observed);
-			return;
+	}
+	else if (tsk_rt(t)->prediction_mode == CC_MR_AVG) {
+		if (likely(est->history.count > 1)) {
+			lt_t mr;
+			est->history.range[est->history.ridx] = (observed > est->history.last_observed) ?
+				observed - est->history.last_observed :
+				est->history.last_observed - observed;
+			est->rsum += est->history.range[est->history.ridx];
+			mr = est->rsum / (est->history.count - 1);
+			est->std = est->rsum + (128*est->rsum)/1000; // std = mr/d_2, w/ d_2 = 1.128
+			est->history.ridx = (est->history.ridx + 1) % (est->history.window_size - 1);
		}
-#endif
-	// filter values outside NUM_STDEVx the standard deviation,
-	// but only filter if enough samples have been taken.
-	if (likely((est->count > MIN(10, AVG_EST_WINDOW_SIZE/2)))) {
-		lt_t lower, upper;
+		est->history.last_observed = observed;
+	}
 
-		lt_t range = (est->std*NUM_STDEV_NUM)/NUM_STDEV_DENOM;
-		lower = est->avg - MIN(range, est->avg); // no underflow.
+	est->history.idx = (est->history.idx + 1) % est->history.window_size;
 
-		if (unlikely(observed < lower)) {
-			TRACE_TASK(t, "Observation is too small: %llu\n", observed);
-			return;
-		}
+	return est->center_line;
+}
 
-		upper = est->avg + range;
-		if (unlikely(observed > upper)) {
-			TRACE_TASK(t, "Observation is too large: %llu\n", observed);
-			return;
+static lt_t ewma_method(struct task_struct *t, lt_t observed)
+{
+	ewma_est_t *est = &(tsk_rt(t)->gpu_ewma_est[tsk_rt(t)->gpu_migration]);
+
+	/* cut-off */
+	if (tsk_rt(t)->prediction_mode != SIMPLE_EWMA &&
+		est->history.count >= MIN_OBSERVATIONS) {
+		lt_t upper = est->center_line + est->std * est->sigmas;
+		lt_t lower = est->center_line - est->std * est->sigmas;
+		if (unlikely(observed < lower || observed > upper)) {
+			TRACE_TASK(t, "%llu out of range [%llu, %llu]\n",
+				observed, lower, upper);
+			return 0;
 		}
 	}
 
+	if (unlikely(est->history.count < est->history.window_size)) {
+		++est->history.count;
+	}
+	else if (tsk_rt(t)->prediction_mode == CC_MR_EWMA) {
+		est->rsum -= est->history.range[est->history.ridx];
+	}
 
-
-	if (unlikely(est->count < AVG_EST_WINDOW_SIZE)) {
-		++est->count;
+	if (unlikely(est->history.count == 1)) {
+		est->center_line = observed;
 	}
 	else {
-		est->sum -= est->history[est->idx];
+		est->center_line =
+			((est->alpha_num * est->center_line)/est->alpha_denom) +
+			(((est->alpha_denom - est->alpha_num) * observed)/est->alpha_denom);
 	}
 
-	mig_info.observed = observed;
-	mig_info.estimated = est->avg;
-	mig_info.distance = tsk_rt(t)->gpu_migration;
-	sched_trace_migration(t, &mig_info);
-
-
-	est->history[est->idx] = observed;
-	est->sum += observed;
-	est->avg = est->sum/est->count;
-	est->std = isqrt(varience(est->history, est->avg, est->count));
-	est->idx = (est->idx + 1) % AVG_EST_WINDOW_SIZE;
-
-
-#if 0
-	if(unlikely(fb->est.val == 0)) {
-		// kludge-- cap observed values to prevent whacky estimations.
-		// whacky stuff happens during the first few jobs.
-		if(unlikely(observed > OBSERVATION_CAP)) {
-			TRACE_TASK(t, "Crazy observation was capped: %llu -> %llu\n",
-				observed, OBSERVATION_CAP);
-			observed = OBSERVATION_CAP;
-		}
-
-		// take the first observation as our estimate
-		// (initial value of 0 was bogus anyhow)
-		fb->est = _integer_to_fp(observed);
-		fb->accum_err = _div(fb->est, _integer_to_fp(2)); // ...seems to work.
+	if (tsk_rt(t)->prediction_mode == CC_BRUTE_EWMA) {
+		est->history.observations[est->history.idx] = observed;
+		est->std = isqrt(varience(est->history.observations, est->center_line, est->history.count));
+		est->history.idx = (est->history.idx + 1) % est->history.window_size;
 	}
-	else {
-		fp_t rel_err = update_estimate(fb,
-			tsk_rt(t)->gpu_fb_param_a[tsk_rt(t)->gpu_migration],
-			tsk_rt(t)->gpu_fb_param_b[tsk_rt(t)->gpu_migration],
-			observed);
-
-		if(unlikely(_fp_to_integer(fb->est) <= 0)) {
-			TRACE_TASK(t, "Invalid estimate. Patching.\n");
-			fb->est = _integer_to_fp(observed);
-			fb->accum_err = _div(fb->est, _integer_to_fp(2)); // ...seems to work.
+	else if (tsk_rt(t)->prediction_mode == CC_MR_EWMA) {
+		if (likely(est->history.count > 1)) {
+			lt_t mr;
+			est->history.range[est->history.ridx] = (observed > est->history.last_observed) ?
+				observed - est->history.last_observed :
+				est->history.last_observed - observed;
+			est->rsum += est->history.range[est->history.ridx];
+			mr = est->rsum / (est->history.count - 1);
+			est->std = est->rsum + (128*est->rsum)/1000; // std = mr/d_2, w/ d_2 = 1.128
+			est->history.ridx = (est->history.ridx + 1) % (est->history.window_size - 1);
		}
-		else {
-			struct migration_info mig_info;
+		est->history.last_observed = observed;
+	}
 
-			sched_trace_prediction_err(t,
-				&(tsk_rt(t)->gpu_migration),
-				&rel_err);
+	return est->center_line;
+}
 
-			mig_info.observed = observed;
-			mig_info.estimated = get_gpu_estimate(t, tsk_rt(t)->gpu_migration);
-			mig_info.distance = tsk_rt(t)->gpu_migration;
+void update_gpu_estimate(struct task_struct *t, lt_t observed)
+{
+	lt_t new_est;
+	struct migration_info mig_info;
 
-			sched_trace_migration(t, &mig_info);
-		}
+	BUG_ON(tsk_rt(t)->gpu_migration > MIG_LAST);
+
+	if (unlikely(observed > OBSERVATION_CAP)) {
+		TRACE_TASK(t, "Crazy observation greater than was dropped: %llu > %llu\n",
+			observed,
+			OBSERVATION_CAP);
+		return;
 	}
-#endif
+
+	switch (tsk_rt(t)->prediction_mode) {
+	case SIMPLE_AVG:
+	case CC_BRUTE_AVG:
+	case CC_MR_AVG:
+		new_est = avg_method(t, observed);
+		break;
+	case SIMPLE_EWMA:
+	case CC_BRUTE_EWMA:
+	case CC_MR_EWMA:
+		new_est = ewma_method(t, observed);
+		break;
+	default:
+		BUG();
+		break;
+	}
+
+	mig_info.observed = observed;
+	mig_info.estimated = new_est;
+	mig_info.distance = tsk_rt(t)->gpu_migration;
+	sched_trace_migration(t, &mig_info);
 
 	TRACE_TASK(t, "GPU est update after (dist = %d, obs = %llu): %llu\n",
 		tsk_rt(t)->gpu_migration,
 		observed,
-		est->avg);
+		new_est);
 }
 
 gpu_migration_dist_t gpu_migration_distance(int a, int b)
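
Note on the statistics above (editorial addition, not part of the commit): the new CC_* modes, control-chart style judging by the center_line/sigmas naming, drop an observation as an outlier when it falls outside center_line +/- sigmas*std once MIN_OBSERVATIONS samples have accumulated. The CC_BRUTE_* modes recompute std directly via varience()/isqrt(); the CC_MR_* modes instead derive it from the mean moving range of consecutive observations, per the in-line comment "std = mr/d_2, w/ d_2 = 1.128" (d_2 being the standard bias-correction constant for moving ranges of two samples). The user-space sketch below illustrates that estimate and the cut-off test in the same integer-only style; the helper names (mr_sigma, in_control), the 3-sigma limit, and the sample data are illustrative assumptions, and it implements the comment's formula (MR-bar divided by d_2) rather than copying the kernel expression verbatim.

/*
 * Standalone sketch (not kernel code) of the moving-range sigma
 * estimate and the control-chart cut-off described above.
 */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t lt_t;	/* stand-in for litmus' nanosecond time type */

/* sigma ~= MR-bar / d_2, with d_2 = 1.128 for moving ranges of width 2;
 * integer-only: dividing by 1.128 becomes multiply-by-1000, divide-by-1128. */
static lt_t mr_sigma(lt_t rsum, unsigned int count)
{
	lt_t mrbar = rsum / (count - 1);	/* mean of the moving ranges */
	return (1000 * mrbar) / 1128;
}

/* cut-off test: accept only observations within center +/- sigmas*std */
static int in_control(lt_t x, lt_t center, lt_t std, lt_t sigmas)
{
	lt_t range = std * sigmas;
	lt_t lower = (center > range) ? center - range : 0;	/* no underflow */
	return x >= lower && x <= center + range;
}

int main(void)
{
	lt_t obs[] = { 100, 104, 98, 103, 99, 500 };	/* 500 is an outlier */
	lt_t sum = obs[0], rsum = 0;
	unsigned int i, n = 1;

	/* accumulate the running sum and the moving ranges |x_i - x_{i-1}| */
	for (i = 1; i < 5; ++i, ++n) {
		rsum += (obs[i] > obs[i-1]) ? obs[i] - obs[i-1]
					    : obs[i-1] - obs[i];
		sum += obs[i];
	}
	printf("center=%llu sigma~=%llu outlier accepted=%d\n",
	       (unsigned long long)(sum / n),
	       (unsigned long long)mr_sigma(rsum, n),
	       in_control(obs[5], sum / n, mr_sigma(rsum, n), 3));
	return 0;
}

With this sample data the program prints "center=100 sigma~=3 outlier accepted=0": the moving-range estimate tracks the tight cluster and rejects the 500 spike, which is exactly the role the cut-off plays at the top of avg_method() and ewma_method().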