author     Peter Zijlstra <a.p.zijlstra@chello.nl>   2008-04-19 13:45:00 -0400
committer  Ingo Molnar <mingo@elte.hu>               2008-04-19 13:45:00 -0400
commit     18d95a2832c1392a2d63227a7a6d433cb9f2037e (patch)
tree       fa85b700aa3caac5b1309edd8e31d9b957957a83 /kernel/sched_fair.c
parent     1d3504fcf5606579d60b649d19f44b3871c1ddae (diff)
sched: fair-group: SMP-nice for group scheduling
Implement SMP nice support for the full group hierarchy.
On each load-balance action, compile a sched_domain-wide view of the full
task_group tree. We compute the domain-wide view when walking down the
hierarchy, and readjust the weights when walking back up.
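For illustration only, here is a rough userspace sketch of what such a sched_domain-wide view could look like, assuming each group tracks per-CPU weights much like tg->cfs_rq[cpu] does in the diff below; the struct layout and the helper name are hypothetical and are not the aggregate machinery this patch actually adds to kernel/sched.c:

#define NR_CPUS 4

/* Hypothetical, simplified stand-in for a task_group with per-CPU queues. */
struct group {
	long cpu_task_weight[NR_CPUS];	/* weight of the tasks queued per CPU  */
	long cpu_rq_weight[NR_CPUS];	/* per-CPU runqueue weight             */
	long agg_task_weight;		/* sched_domain-wide sums ("the view") */
	long agg_rq_weight;
	struct group *children;		/* first child                         */
	struct group *sibling;		/* next child of the same parent       */
};

/*
 * Walking down the hierarchy: for every group, sum its per-CPU weights over
 * the CPUs of the sched_domain, giving the domain-wide view the balancer
 * works with. (The patch also readjusts the group shares from this view on
 * the way back up; that step is left out of this sketch.)
 */
void compile_domain_view(struct group *g, const int *cpus, int nr)
{
	g->agg_task_weight = 0;
	g->agg_rq_weight = 0;
	for (int i = 0; i < nr; i++) {
		g->agg_task_weight += g->cpu_task_weight[cpus[i]];
		g->agg_rq_weight += g->cpu_rq_weight[cpus[i]];
	}

	for (struct group *c = g->children; c; c = c->sibling)
		compile_domain_view(c, cpus, nr);
}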
After collecting and readjusting the domain-wide view, we try to balance the
tasks within the task_groups. The current approach is to naively balance each
task group until we've moved the targeted amount of load.
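As a rough sketch of this naive per-group balancing pass, the fragment below mirrors the arithmetic of the new load_balance_fair() loop in the diff: rescale the remaining load into a group's local weight units, move tasks, then rescale what was moved back into CPU-load units. The struct agg fields follow aggregate(tg, sd) in the hunk; move_tasks() and the sample numbers in main() are hypothetical stand-ins:

#include <stdio.h>

/* Hypothetical domain-wide view of one task_group, as in aggregate(tg, sd). */
struct agg {
	long task_weight;	/* weight of the group's tasks in this domain */
	long rq_weight;		/* sum of the group's per-CPU queue weights   */
	long load;		/* the group's contribution to the CPU load   */
};

/* Stand-in for actually migrating tasks; returns the weight it moved. */
static long move_tasks(long max_load) { return max_load / 2; }

static long balance_groups(struct agg *groups, int nr, long max_load_move)
{
	long rem_load_move = max_load_move;

	for (int i = 0; i < nr; i++) {
		struct agg *a = &groups[i];
		long rem_load, moved_load;

		if (!a->task_weight)		/* empty group */
			continue;

		/* convert the remaining load into this group's weight units */
		rem_load = rem_load_move * a->rq_weight / (a->load + 1);

		/* the patch also caps this by the inter-CPU imbalance; skipped here */
		moved_load = move_tasks(rem_load);
		if (!moved_load)
			continue;

		/* convert what was actually moved back into CPU-load units */
		moved_load = moved_load * a->load / (a->rq_weight + 1);

		rem_load_move -= moved_load;
		if (rem_load_move < 0)
			break;
	}

	return max_load_move - rem_load_move;
}

int main(void)
{
	struct agg g[] = { { 2048, 2048, 1024 }, { 1024, 1024, 512 } };

	printf("moved %ld\n", balance_groups(g, 2, 1024));
	return 0;
}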
Inspired by Srivatsa Vaddagiri's previous code and Abhishek Chandra's H-SMP
paper.
XXX: there will be some numerical issues due to the limited nature of
SCHED_LOAD_SCALE wrt representing a task_group's influence on the
total weight. When the tree is deep enough, or the task weight small
enough, we'll run out of bits.
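To make the concern concrete, a tiny worked example, assuming the usual SCHED_LOAD_SCALE of 1024 (a 10-bit fixed-point unit); the nesting depth, group fraction and task weight are made-up numbers:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024L	/* assumed 10-bit fixed-point unit */

int main(void)
{
	/*
	 * A group's influence on the total weight is a fraction carried in
	 * SCHED_LOAD_SCALE units. Every extra level of nesting multiplies
	 * fractions, so with integer math the result quickly rounds to 0.
	 */
	long weight = 15;				/* a very light task (roughly nice 19) */
	long group_fraction = SCHED_LOAD_SCALE / 8;	/* group holds 1/8 of its parent */

	for (int depth = 1; depth <= 5; depth++) {
		weight = weight * group_fraction / SCHED_LOAD_SCALE;
		printf("depth %d: effective weight %ld\n", depth, weight);
	}
	/* by depth 2 the effective weight is already 0 -> "run out of bits" */
	return 0;
}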
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Abhishek Chandra <chandra@cs.umn.edu>
CC: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched_fair.c')
 kernel/sched_fair.c | 124
 1 file changed, 80 insertions(+), 44 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index b43748efaa7f..b89fec93a237 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -492,10 +492,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+	cfs_rq->task_weight += weight;
+}
+#else
+static inline void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+}
+#endif
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
+	if (!parent_entity(se))
+		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
+	if (entity_is_task(se))
+		add_cfs_task_weight(cfs_rq, se->load.weight);
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
 }
@@ -504,6 +521,10 @@ static void
 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_sub(&cfs_rq->load, se->load.weight);
+	if (!parent_entity(se))
+		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
+	if (entity_is_task(se))
+		add_cfs_task_weight(cfs_rq, -se->load.weight);
 	cfs_rq->nr_running--;
 	se->on_rq = 0;
 }
@@ -1286,75 +1307,90 @@ static struct task_struct *load_balance_next_fair(void *arg)
 	return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+static unsigned long
+__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		unsigned long max_load_move, struct sched_domain *sd,
+		enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
+		struct cfs_rq *cfs_rq)
 {
-	struct sched_entity *curr;
-	struct task_struct *p;
-
-	if (!cfs_rq->nr_running || !first_fair(cfs_rq))
-		return MAX_PRIO;
-
-	curr = cfs_rq->curr;
-	if (!curr)
-		curr = __pick_next_entity(cfs_rq);
+	struct rq_iterator cfs_rq_iterator;
 
-	p = task_of(curr);
+	cfs_rq_iterator.start = load_balance_start_fair;
+	cfs_rq_iterator.next = load_balance_next_fair;
+	cfs_rq_iterator.arg = cfs_rq;
 
-	return p->prio;
+	return balance_tasks(this_rq, this_cpu, busiest,
+			max_load_move, sd, idle, all_pinned,
+			this_best_prio, &cfs_rq_iterator);
 }
-#endif
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
 		  int *all_pinned, int *this_best_prio)
 {
-	struct cfs_rq *busy_cfs_rq;
 	long rem_load_move = max_load_move;
-	struct rq_iterator cfs_rq_iterator;
+	int busiest_cpu = cpu_of(busiest);
+	struct task_group *tg;
 
-	cfs_rq_iterator.start = load_balance_start_fair;
-	cfs_rq_iterator.next = load_balance_next_fair;
-
-	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
-#ifdef CONFIG_FAIR_GROUP_SCHED
-		struct cfs_rq *this_cfs_rq;
+	rcu_read_lock();
+	list_for_each_entry(tg, &task_groups, list) {
 		long imbalance;
-		unsigned long maxload;
+		unsigned long this_weight, busiest_weight;
+		long rem_load, max_load, moved_load;
+
+		/*
+		 * empty group
+		 */
+		if (!aggregate(tg, sd)->task_weight)
+			continue;
+
+		rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
+		rem_load /= aggregate(tg, sd)->load + 1;
 
-		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+		this_weight = tg->cfs_rq[this_cpu]->task_weight;
+		busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
 
-		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
-		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
-		if (imbalance <= 0)
+		imbalance = (busiest_weight - this_weight) / 2;
+
+		if (imbalance < 0)
+			imbalance = busiest_weight;
+
+		max_load = max(rem_load, imbalance);
+		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
+				max_load, sd, idle, all_pinned, this_best_prio,
+				tg->cfs_rq[busiest_cpu]);
+
+		if (!moved_load)
 			continue;
 
-		/* Don't pull more than imbalance/2 */
-		imbalance /= 2;
-		maxload = min(rem_load_move, imbalance);
+		move_group_shares(tg, sd, busiest_cpu, this_cpu);
 
-		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
-#else
-# define maxload rem_load_move
-#endif
-		/*
-		 * pass busy_cfs_rq argument into
-		 * load_balance_[start|next]_fair iterators
-		 */
-		cfs_rq_iterator.arg = busy_cfs_rq;
-		rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
-					       maxload, sd, idle, all_pinned,
-					       this_best_prio,
-					       &cfs_rq_iterator);
+		moved_load *= aggregate(tg, sd)->load;
+		moved_load /= aggregate(tg, sd)->rq_weight + 1;
 
-		if (rem_load_move <= 0)
+		rem_load_move -= moved_load;
+		if (rem_load_move < 0)
 			break;
 	}
+	rcu_read_unlock();
 
 	return max_load_move - rem_load_move;
 }
+#else
+static unsigned long
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		unsigned long max_load_move,
+		struct sched_domain *sd, enum cpu_idle_type idle,
+		int *all_pinned, int *this_best_prio)
+{
+	return __load_balance_fair(this_rq, this_cpu, busiest,
+			max_load_move, sd, idle, all_pinned,
+			this_best_prio, &busiest->cfs);
+}
+#endif
 
 static int
 move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,