simple average tracking

author: Glenn Elliott <gelliott@cs.unc.edu> 2012-09-09 13:42:13 -0400
committer: Glenn Elliott <gelliott@cs.unc.edu> 2012-09-09 13:42:13 -0400
commit: 193a19c94a32f2e2a0e973f0a98cf4a098cefa15 (patch)
tree: 19c501d521fd7a265a54e7006aa331612aa3246e
parent: 00c173dc87b14b8422cea2aa129a2fc99689a05d (diff)
4 files changed, 83 insertions, 16 deletions
diff --git a/include/litmus/gpu_affinity.h b/include/litmus/gpu_affinity.h
index 6b3fb8b28745..d64a15cbf2a5 100644
--- a/include/litmus/gpu_affinity.h
+++ b/include/litmus/gpu_affinity.h
@@ -31,17 +31,18 @@ static inline lt_t get_gpu_time(struct task_struct* t)
 static inline lt_t get_gpu_estimate(struct task_struct* t, gpu_migration_dist_t dist)
 {
-        int i;
+//      int i;
-        fpbuf_t temp = _fp_to_integer(t->rt_param.gpu_migration_est[dist].est);
+//      fpbuf_t temp = _fp_to_integer(t->rt_param.gpu_migration_est[dist].est);
-        lt_t val = (temp >= 0) ? temp : 0;  // never allow negative estimates...
+//      lt_t val = (temp >= 0) ? temp : 0;  // never allow negative estimates...
+        lt_t val = t->rt_param.gpu_migration_est[dist].avg;
-        WARN_ON(temp < 0);
+//      WARN_ON(temp < 0);
        // lower-bound a distant migration to be at least equal to the level
        // below it.
-        for(i = dist-1; (val == 0) && (i >= MIG_LOCAL); --i) {
+//      for(i = dist-1; (val == 0) && (i >= MIG_LOCAL); --i) {
-                val = _fp_to_integer(t->rt_param.gpu_migration_est[i].est);
+//              val = _fp_to_integer(t->rt_param.gpu_migration_est[i].est);
-        }
+//      }
        return ((val > 0) ? val : dist+1);
 }
diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h
index 0198884eab86..a441badd30cc 100644
--- a/include/litmus/rt_param.h
+++ b/include/litmus/rt_param.h
@@ -144,6 +144,17 @@ typedef struct feedback_est{
        fp_t accum_err;
 } feedback_est_t;
+#define AVG_EST_WINDOW_SIZE 20
+typedef struct avg_est{
+        lt_t history[AVG_EST_WINDOW_SIZE];
+        uint16_t count;
+        uint16_t idx;
+        lt_t sum;
+        lt_t avg;
+} avg_est_t;
 /*      RT task parameters for scheduling extensions
 *      These parameters are inherited during clone and therefore must
 *      be explicitly set up before the task set is launched.
@@ -190,12 +201,10 @@ struct rt_param {
        long unsigned int                       held_gpus;  // bitmap of held GPUs.
 #ifdef CONFIG_LITMUS_AFFINITY_LOCKING
-        fp_t    gpu_fb_param_a[MIG_LAST+1];
+        avg_est_t gpu_migration_est[MIG_LAST+1];
-        fp_t    gpu_fb_param_b[MIG_LAST+1];
        gpu_migration_dist_t    gpu_migration;
        int                             last_gpu;
-        feedback_est_t  gpu_migration_est[MIG_LAST+1]; // local, near, med, far
        lt_t accum_gpu_time;
        lt_t gpu_time_stamp;
diff --git a/litmus/gpu_affinity.c b/litmus/gpu_affinity.c
index 55bb5e1128ec..2cdf18bc7dd6 100644
--- a/litmus/gpu_affinity.c
+++ b/litmus/gpu_affinity.c
@@ -7,7 +7,14 @@
 #include <litmus/sched_trace.h>
-#define OBSERVATION_CAP 2*1e9
+#define OBSERVATION_CAP ((lt_t)(2e9))
+// reason for skew: high outliers are less
+// frequent and way out of bounds
+#define HI_THRESHOLD 2
+#define LO_THRESHOLD 4
+#define MIN(a, b) ((a < b) ? a : b)
 static fp_t update_estimate(feedback_est_t* fb, fp_t a, fp_t b, lt_t observed)
 {
@@ -28,10 +35,59 @@ static fp_t update_estimate(feedback_est_t* fb, fp_t a, fp_t b, lt_t observed)
 void update_gpu_estimate(struct task_struct *t, lt_t observed)
 {
-        feedback_est_t *fb = &(tsk_rt(t)->gpu_migration_est[tsk_rt(t)->gpu_migration]);
+        //feedback_est_t *fb = &(tsk_rt(t)->gpu_migration_est[tsk_rt(t)->gpu_migration]);
+        avg_est_t *est;
+        struct migration_info mig_info;
        BUG_ON(tsk_rt(t)->gpu_migration > MIG_LAST);
+        est = &(tsk_rt(t)->gpu_migration_est[tsk_rt(t)->gpu_migration]);
+        if (unlikely(observed > OBSERVATION_CAP)) {
+                TRACE_TASK(t, "Crazy observation greater than was dropped: %llu > %llu\n",
+                        observed,
+                        OBSERVATION_CAP);
+                return;
+        }
+#if 0
+        // filter out values that are HI_THRESHOLDx or (1/LO_THRESHOLD)x out
+        // of range of the average, but only filter if enough samples
+        // have been taken.
+        if (likely((est->count > MIN(10, AVG_EST_WINDOW_SIZE/2)))) {
+                if (unlikely(observed < est->avg/LO_THRESHOLD)) {
+                        TRACE_TASK(t, "Observation is too small: %llu\n",
+                                                        observed);
+                        return;
+                }
+                else if (unlikely(observed > est->avg*HI_THRESHOLD)) {
+                        TRACE_TASK(t, "Observation is too large: %llu\n",
+                                                        observed);
+                        return;
+                }
+        }
+#endif
+        if (unlikely(est->count < AVG_EST_WINDOW_SIZE)) {
+                ++est->count;
+        }
+        else {
+                est->sum -= est->history[est->idx];
+        }
+        mig_info.observed = observed;
+        mig_info.estimated = est->avg;
+        mig_info.distance = tsk_rt(t)->gpu_migration;
+        sched_trace_migration(t, &mig_info);
+        est->history[est->idx] = observed;
+        est->sum += observed;
+        est->avg = est->sum/est->count;
+        est->idx = (est->idx + 1) % AVG_EST_WINDOW_SIZE;
+#if 0
        if(unlikely(fb->est.val == 0)) {
                // kludge-- cap observed values to prevent whacky estimations.
                // whacky stuff happens during the first few jobs.
@@ -71,12 +127,12 @@ void update_gpu_estimate(struct task_struct *t, lt_t observed)
                        sched_trace_migration(t, &mig_info);
                }
        }
+#endif
-        TRACE_TASK(t, "GPU est update after (dist = %d, obs = %llu): %d.%d\n",
+        TRACE_TASK(t, "GPU est update after (dist = %d, obs = %llu): %llu\n",
                           tsk_rt(t)->gpu_migration,
                           observed,
-                           _fp_to_integer(fb->est),
+                           est->avg);
-                           _point(fb->est));
 }
 gpu_migration_dist_t gpu_migration_distance(int a, int b)
diff --git a/litmus/litmus.c b/litmus/litmus.c
index d1f836c8af6e..91ec65894379 100644
--- a/litmus/litmus.c
+++ b/litmus/litmus.c
@@ -321,6 +321,7 @@ void init_gpu_affinity_state(struct task_struct* p)
        //p->rt_param.gpu_fb_param_a = _frac(14008, 10000);
        //p->rt_param.gpu_fb_param_b = _frac(16024, 10000);
+#if 0           
        // empirical;
        p->rt_param.gpu_fb_param_a[0] = _frac(7550, 10000);
        p->rt_param.gpu_fb_param_b[0] = _frac(45800, 10000);
@@ -333,7 +334,7 @@ void init_gpu_affinity_state(struct task_struct* p)
        p->rt_param.gpu_fb_param_a[3] = _frac(7580, 10000);
        p->rt_param.gpu_fb_param_b[3] = _frac(34590, 10000);
+#endif
        p->rt_param.gpu_migration = MIG_NONE;
        p->rt_param.last_gpu = -1;
 }
author	Glenn Elliott <gelliott@cs.unc.edu>	2012-09-09 13:42:13 -0400
committer	Glenn Elliott <gelliott@cs.unc.edu>	2012-09-09 13:42:13 -0400
commit	193a19c94a32f2e2a0e973f0a98cf4a098cefa15 (patch)
tree	19c501d521fd7a265a54e7006aa331612aa3246e
parent	00c173dc87b14b8422cea2aa129a2fc99689a05d (diff)

diff --git a/include/litmus/gpu_affinity.h b/include/litmus/gpu_affinity.h index 6b3fb8b28745..d64a15cbf2a5 100644 --- a/include/litmus/gpu_affinity.h +++ b/include/litmus/gpu_affinity.h
@@ -31,17 +31,18 @@ static inline lt_t get_gpu_time(struct task_struct* t)
31		31
32	static inline lt_t get_gpu_estimate(struct task_struct* t, gpu_migration_dist_t dist)	32	static inline lt_t get_gpu_estimate(struct task_struct* t, gpu_migration_dist_t dist)
33	{	33	{
34	int i;	34	// int i;
35	fpbuf_t temp = _fp_to_integer(t->rt_param.gpu_migration_est[dist].est);	35	// fpbuf_t temp = _fp_to_integer(t->rt_param.gpu_migration_est[dist].est);
36	lt_t val = (temp >= 0) ? temp : 0; // never allow negative estimates...	36	// lt_t val = (temp >= 0) ? temp : 0; // never allow negative estimates...
		37	lt_t val = t->rt_param.gpu_migration_est[dist].avg;
37		38
38	WARN_ON(temp < 0);	39	// WARN_ON(temp < 0);
39		40
40	// lower-bound a distant migration to be at least equal to the level	41	// lower-bound a distant migration to be at least equal to the level
41	// below it.	42	// below it.
42	for(i = dist-1; (val == 0) && (i >= MIG_LOCAL); --i) {	43	// for(i = dist-1; (val == 0) && (i >= MIG_LOCAL); --i) {
43	val = _fp_to_integer(t->rt_param.gpu_migration_est[i].est);	44	// val = _fp_to_integer(t->rt_param.gpu_migration_est[i].est);
44	}	45	// }
45		46
46	return ((val > 0) ? val : dist+1);	47	return ((val > 0) ? val : dist+1);
47	}	48	}


diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h index 0198884eab86..a441badd30cc 100644 --- a/include/litmus/rt_param.h +++ b/include/litmus/rt_param.h
@@ -144,6 +144,17 @@ typedef struct feedback_est{
144	fp_t accum_err;	144	fp_t accum_err;
145	} feedback_est_t;	145	} feedback_est_t;
146		146
		147
		148	#define AVG_EST_WINDOW_SIZE 20
		149
		150	typedef struct avg_est{
		151	lt_t history[AVG_EST_WINDOW_SIZE];
		152	uint16_t count;
		153	uint16_t idx;
		154	lt_t sum;
		155	lt_t avg;
		156	} avg_est_t;
		157
147	/* RT task parameters for scheduling extensions	158	/* RT task parameters for scheduling extensions
148	* These parameters are inherited during clone and therefore must	159	* These parameters are inherited during clone and therefore must
149	* be explicitly set up before the task set is launched.	160	* be explicitly set up before the task set is launched.
@@ -190,12 +201,10 @@ struct rt_param {
190	long unsigned int held_gpus; // bitmap of held GPUs.	201	long unsigned int held_gpus; // bitmap of held GPUs.
191		202
192	#ifdef CONFIG_LITMUS_AFFINITY_LOCKING	203	#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
193	fp_t gpu_fb_param_a[MIG_LAST+1];	204	avg_est_t gpu_migration_est[MIG_LAST+1];
194	fp_t gpu_fb_param_b[MIG_LAST+1];
195		205
196	gpu_migration_dist_t gpu_migration;	206	gpu_migration_dist_t gpu_migration;
197	int last_gpu;	207	int last_gpu;
198	feedback_est_t gpu_migration_est[MIG_LAST+1]; // local, near, med, far
199		208
200	lt_t accum_gpu_time;	209	lt_t accum_gpu_time;
201	lt_t gpu_time_stamp;	210	lt_t gpu_time_stamp;


diff --git a/litmus/gpu_affinity.c b/litmus/gpu_affinity.c index 55bb5e1128ec..2cdf18bc7dd6 100644 --- a/litmus/gpu_affinity.c +++ b/litmus/gpu_affinity.c
@@ -7,7 +7,14 @@
7		7
8	#include <litmus/sched_trace.h>	8	#include <litmus/sched_trace.h>
9		9
10	#define OBSERVATION_CAP 2*1e9	10	#define OBSERVATION_CAP ((lt_t)(2e9))
		11
		12	// reason for skew: high outliers are less
		13	// frequent and way out of bounds
		14	#define HI_THRESHOLD 2
		15	#define LO_THRESHOLD 4
		16
		17	#define MIN(a, b) ((a < b) ? a : b)
11		18
12	static fp_t update_estimate(feedback_est_t* fb, fp_t a, fp_t b, lt_t observed)	19	static fp_t update_estimate(feedback_est_t* fb, fp_t a, fp_t b, lt_t observed)
13	{	20	{
@@ -28,10 +35,59 @@ static fp_t update_estimate(feedback_est_t* fb, fp_t a, fp_t b, lt_t observed)
28		35
29	void update_gpu_estimate(struct task_struct *t, lt_t observed)	36	void update_gpu_estimate(struct task_struct *t, lt_t observed)
30	{	37	{
31	feedback_est_t *fb = &(tsk_rt(t)->gpu_migration_est[tsk_rt(t)->gpu_migration]);	38	//feedback_est_t *fb = &(tsk_rt(t)->gpu_migration_est[tsk_rt(t)->gpu_migration]);
		39	avg_est_t *est;
		40	struct migration_info mig_info;
32		41
33	BUG_ON(tsk_rt(t)->gpu_migration > MIG_LAST);	42	BUG_ON(tsk_rt(t)->gpu_migration > MIG_LAST);
34		43
		44	est = &(tsk_rt(t)->gpu_migration_est[tsk_rt(t)->gpu_migration]);
		45
		46	if (unlikely(observed > OBSERVATION_CAP)) {
		47	TRACE_TASK(t, "Crazy observation greater than was dropped: %llu > %llu\n",
		48	observed,
		49	OBSERVATION_CAP);
		50	return;
		51	}
		52
		53	#if 0
		54	// filter out values that are HI_THRESHOLDx or (1/LO_THRESHOLD)x out
		55	// of range of the average, but only filter if enough samples
		56	// have been taken.
		57	if (likely((est->count > MIN(10, AVG_EST_WINDOW_SIZE/2)))) {
		58	if (unlikely(observed < est->avg/LO_THRESHOLD)) {
		59	TRACE_TASK(t, "Observation is too small: %llu\n",
		60	observed);
		61	return;
		62	}
		63	else if (unlikely(observed > est->avg*HI_THRESHOLD)) {
		64	TRACE_TASK(t, "Observation is too large: %llu\n",
		65	observed);
		66	return;
		67	}
		68	}
		69	#endif
		70
		71	if (unlikely(est->count < AVG_EST_WINDOW_SIZE)) {
		72	++est->count;
		73	}
		74	else {
		75	est->sum -= est->history[est->idx];
		76	}
		77
		78	mig_info.observed = observed;
		79	mig_info.estimated = est->avg;
		80	mig_info.distance = tsk_rt(t)->gpu_migration;
		81	sched_trace_migration(t, &mig_info);
		82
		83
		84	est->history[est->idx] = observed;
		85	est->sum += observed;
		86	est->avg = est->sum/est->count;
		87	est->idx = (est->idx + 1) % AVG_EST_WINDOW_SIZE;
		88
		89
		90	#if 0
35	if(unlikely(fb->est.val == 0)) {	91	if(unlikely(fb->est.val == 0)) {
36	// kludge-- cap observed values to prevent whacky estimations.	92	// kludge-- cap observed values to prevent whacky estimations.
37	// whacky stuff happens during the first few jobs.	93	// whacky stuff happens during the first few jobs.
@@ -71,12 +127,12 @@ void update_gpu_estimate(struct task_struct *t, lt_t observed)
71	sched_trace_migration(t, &mig_info);	127	sched_trace_migration(t, &mig_info);
72	}	128	}
73	}	129	}
		130	#endif
74		131
75	TRACE_TASK(t, "GPU est update after (dist = %d, obs = %llu): %d.%d\n",	132	TRACE_TASK(t, "GPU est update after (dist = %d, obs = %llu): %llu\n",
76	tsk_rt(t)->gpu_migration,	133	tsk_rt(t)->gpu_migration,
77	observed,	134	observed,
78	_fp_to_integer(fb->est),	135	est->avg);
79	_point(fb->est));
80	}	136	}
81		137
82	gpu_migration_dist_t gpu_migration_distance(int a, int b)	138	gpu_migration_dist_t gpu_migration_distance(int a, int b)


diff --git a/litmus/litmus.c b/litmus/litmus.c index d1f836c8af6e..91ec65894379 100644 --- a/litmus/litmus.c +++ b/litmus/litmus.c
@@ -321,6 +321,7 @@ void init_gpu_affinity_state(struct task_struct* p)
321	//p->rt_param.gpu_fb_param_a = _frac(14008, 10000);	321	//p->rt_param.gpu_fb_param_a = _frac(14008, 10000);
322	//p->rt_param.gpu_fb_param_b = _frac(16024, 10000);	322	//p->rt_param.gpu_fb_param_b = _frac(16024, 10000);
323		323
		324	#if 0
324	// empirical;	325	// empirical;
325	p->rt_param.gpu_fb_param_a[0] = _frac(7550, 10000);	326	p->rt_param.gpu_fb_param_a[0] = _frac(7550, 10000);
326	p->rt_param.gpu_fb_param_b[0] = _frac(45800, 10000);	327	p->rt_param.gpu_fb_param_b[0] = _frac(45800, 10000);
@@ -333,7 +334,7 @@ void init_gpu_affinity_state(struct task_struct* p)
333		334
334	p->rt_param.gpu_fb_param_a[3] = _frac(7580, 10000);	335	p->rt_param.gpu_fb_param_a[3] = _frac(7580, 10000);
335	p->rt_param.gpu_fb_param_b[3] = _frac(34590, 10000);	336	p->rt_param.gpu_fb_param_b[3] = _frac(34590, 10000);
336		337	#endif
337	p->rt_param.gpu_migration = MIG_NONE;	338	p->rt_param.gpu_migration = MIG_NONE;
338	p->rt_param.last_gpu = -1;	339	p->rt_param.last_gpu = -1;
339	}	340	}