author     Ingo Molnar <mingo@elte.hu>  2008-05-29 05:28:57 -0400
committer  Ingo Molnar <mingo@elte.hu>  2008-05-29 05:28:57 -0400
commit     6363ca57c76b7b83639ca8c83fc285fa26a7880e (patch)
tree       b8630b4af286409efdd648920a546fae24d4db88 /kernel/sched_fair.c
parent     4285f594f84d1f0641fc962d00e6638dec4a19c4 (diff)
revert ("sched: fair-group: SMP-nice for group scheduling")
Yanmin Zhang reported:

  Comparing with 2.6.25, volanoMark has big regression with kernel
  2.6.26-rc1. It's about 50% on my 8-core stoakley, 16-core tigerton,
  and Itanium Montecito.

  With bisect, I located the following patch:

  | 18d95a2832c1392a2d63227a7a6d433cb9f2037e is first bad commit
  | commit 18d95a2832c1392a2d63227a7a6d433cb9f2037e
  | Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
  | Date: Sat Apr 19 19:45:00 2008 +0200
  |
  |     sched: fair-group: SMP-nice for group scheduling

Revert it so that we get v2.6.25 behavior.

Bisected-by: Yanmin Zhang <yanmin_zhang@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
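For reference, a minimal standalone sketch (not part of this commit) of the per-runqueue pull limit that the restored load_balance_fair() computes from cfs_rq->load.weight: never pull unless the busiest runqueue is heavier, never more than half the difference, and never more than the remaining load-move budget. The max_pull() helper and the plain integer weights below are illustrative only, not kernel code.

/*
 * Illustrative sketch of the restored imbalance arithmetic; plain
 * unsigned integers stand in for cfs_rq->load.weight values.
 */
#include <stdio.h>

static unsigned long max_pull(unsigned long busy_weight,
			      unsigned long this_weight,
			      unsigned long rem_load_move)
{
	long imbalance = (long)busy_weight - (long)this_weight;

	/* Don't pull if this runqueue already has more load than the busy one. */
	if (imbalance <= 0)
		return 0;

	/* Don't pull more than half the imbalance ... */
	imbalance /= 2;

	/* ... and never more than the remaining load-move budget. */
	return rem_load_move < (unsigned long)imbalance ?
		rem_load_move : (unsigned long)imbalance;
}

int main(void)
{
	/* busiest weight 3072, local weight 1024, budget 2048 -> pull 1024 */
	printf("max pull: %lu\n", max_pull(3072, 1024, 2048));
	return 0;
}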
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--  kernel/sched_fair.c  124
1 file changed, 44 insertions(+), 80 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0eb0ae879542..f0f25fc12d0a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -510,27 +510,10 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-static void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-	cfs_rq->task_weight += weight;
-}
-#else
-static inline void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-}
-#endif
-
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
-	if (!parent_entity(se))
-		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
-	if (entity_is_task(se))
-		add_cfs_task_weight(cfs_rq, se->load.weight);
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
 	list_add(&se->group_node, &cfs_rq->tasks);
@@ -540,10 +523,6 @@ static void
 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_sub(&cfs_rq->load, se->load.weight);
-	if (!parent_entity(se))
-		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
-	if (entity_is_task(se))
-		add_cfs_task_weight(cfs_rq, -se->load.weight);
 	cfs_rq->nr_running--;
 	se->on_rq = 0;
 	list_del_init(&se->group_node);
@@ -1327,90 +1306,75 @@ static struct task_struct *load_balance_next_fair(void *arg)
 	return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
 }
 
-static unsigned long
-__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		unsigned long max_load_move, struct sched_domain *sd,
-		enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
-		struct cfs_rq *cfs_rq)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
 {
-	struct rq_iterator cfs_rq_iterator;
+	struct sched_entity *curr;
+	struct task_struct *p;
 
-	cfs_rq_iterator.start = load_balance_start_fair;
-	cfs_rq_iterator.next = load_balance_next_fair;
-	cfs_rq_iterator.arg = cfs_rq;
+	if (!cfs_rq->nr_running || !first_fair(cfs_rq))
+		return MAX_PRIO;
+
+	curr = cfs_rq->curr;
+	if (!curr)
+		curr = __pick_next_entity(cfs_rq);
 
-	return balance_tasks(this_rq, this_cpu, busiest,
-			max_load_move, sd, idle, all_pinned,
-			this_best_prio, &cfs_rq_iterator);
+	p = task_of(curr);
+
+	return p->prio;
 }
+#endif
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		unsigned long max_load_move,
 		struct sched_domain *sd, enum cpu_idle_type idle,
 		int *all_pinned, int *this_best_prio)
 {
+	struct cfs_rq *busy_cfs_rq;
 	long rem_load_move = max_load_move;
-	int busiest_cpu = cpu_of(busiest);
-	struct task_group *tg;
-
-	rcu_read_lock();
-	list_for_each_entry(tg, &task_groups, list) {
-		long imbalance;
-		unsigned long this_weight, busiest_weight;
-		long rem_load, max_load, moved_load;
-
-		/*
-		 * empty group
-		 */
-		if (!aggregate(tg, sd)->task_weight)
-			continue;
-
-		rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
-		rem_load /= aggregate(tg, sd)->load + 1;
-
-		this_weight = tg->cfs_rq[this_cpu]->task_weight;
-		busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
+	struct rq_iterator cfs_rq_iterator;
 
-		imbalance = (busiest_weight - this_weight) / 2;
+	cfs_rq_iterator.start = load_balance_start_fair;
+	cfs_rq_iterator.next = load_balance_next_fair;
 
-		if (imbalance < 0)
-			imbalance = busiest_weight;
+	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		struct cfs_rq *this_cfs_rq;
+		long imbalance;
+		unsigned long maxload;
 
-		max_load = max(rem_load, imbalance);
-		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
-				max_load, sd, idle, all_pinned, this_best_prio,
-				tg->cfs_rq[busiest_cpu]);
+		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
 
-		if (!moved_load)
+		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
+		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
+		if (imbalance <= 0)
 			continue;
 
-		move_group_shares(tg, sd, busiest_cpu, this_cpu);
+		/* Don't pull more than imbalance/2 */
+		imbalance /= 2;
+		maxload = min(rem_load_move, imbalance);
 
-		moved_load *= aggregate(tg, sd)->load;
-		moved_load /= aggregate(tg, sd)->rq_weight + 1;
+		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
+#else
+# define maxload rem_load_move
+#endif
+		/*
+		 * pass busy_cfs_rq argument into
+		 * load_balance_[start|next]_fair iterators
+		 */
+		cfs_rq_iterator.arg = busy_cfs_rq;
+		rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
+					       maxload, sd, idle, all_pinned,
+					       this_best_prio,
+					       &cfs_rq_iterator);
 
-		rem_load_move -= moved_load;
-		if (rem_load_move < 0)
+		if (rem_load_move <= 0)
 			break;
 	}
-	rcu_read_unlock();
 
 	return max_load_move - rem_load_move;
 }
-#else
-static unsigned long
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		unsigned long max_load_move,
-		struct sched_domain *sd, enum cpu_idle_type idle,
-		int *all_pinned, int *this_best_prio)
-{
-	return __load_balance_fair(this_rq, this_cpu, busiest,
-			max_load_move, sd, idle, all_pinned,
-			this_best_prio, &busiest->cfs);
-}
-#endif
 
 static int
 move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,