From 58b26c4c025778c09c7a1438ff185080e11b7d0a Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venki@google.com>
Date: Fri, 10 Sep 2010 18:19:17 -0700
Subject: sched: Increment cache_nice_tries only on periodic lb

scheduler uses cache_nice_tries as an indicator to do cache_hot and
active load balance, when normal load balance fails. Currently,
this value is changed on any failed load balance attempt. That ends
up being not so nice to workloads that enter/exit idle often, as
they do more frequent new_idle balance and that pretty soon results
in cache hot tasks being pulled in.

Making the cache_nice_tries ignore failed new_idle balance seems to
make better sense. With that only the failed load balance in
periodic load balance gets accounted and the rate of accumulation
of cache_nice_tries will not depend on idle entry/exit (short
running sleep-wakeup kind of tasks). This reduces movement of
cache_hot tasks.

schedstat diff (after-before) excerpt from a workload that has
frequent and short wakeup-idle pattern (:2 in cpu col below refers
to NEWIDLE idx) This snapshot was across ~400 seconds.

Without this change:
domainstats:  domain0
 cpu     cnt      bln      fld      imb     gain    hgain  nobusyq  nobusyg
 0:2  306487   219575    73167  110069413    44583    19070     1172   218403
 1:2  292139   194853    81421  120893383    50745    21902     1259   193594
 2:2  283166   174607    91359  129699642    54931    23688     1287   173320
 3:2  273998   161788    93991  132757146    57122    24351     1366   160422
 4:2  289851   215692    62190  83398383    36377    13680      851   214841
 5:2  316312   222146    77605  117582154    49948    20281      988   221158
 6:2  297172   195596    83623  122133390    52801    21301      929   194667
 7:2  283391   178078    86378  126622761    55122    22239      928   177150
 8:2  297655   210359    72995  110246694    45798    19777     1125   209234
 9:2  297357   202011    79363  119753474    50953    22088     1089   200922
10:2  278797   178703    83180  122514385    52969    22726     1128   177575
11:2  272661   167669    86978  127342327    55857    24342     1195   166474
12:2  293039   204031    73211  110282059    47285    19651      948   203083
13:2  289502   196762    76803  114712942    49339    20547     1016   195746
14:2  264446   169609    78292  115715605    50459    21017      982   168627
15:2  260968   163660    80142  116811793    51483    21281     1064   162596

With this change:
domainstats:  domain0
 cpu     cnt      bln      fld      imb     gain    hgain  nobusyq  nobusyg
 0:2  272347   187380    77455  105420270    24975        1      953   186427
 1:2  267276   172360    86234  116242264    28087        6     1028   171332
 2:2  259769   156777    93281  123243134    30555        1     1043   155734
 3:2  250870   143129    97627  127370868    32026        6     1188   141941
 4:2  248422   177116    64096  78261112    22202        2      757   176359
 5:2  275595   180683    84950  116075022    29400        6      778   179905
 6:2  262418   162609    88944  119256898    31056        4      817   161792
 7:2  252204   147946    92646  122388300    32879        4      824   147122
 8:2  262335   172239    81631  110477214    26599        4      864   171375
 9:2  261563   164775    88016  117203621    28331        3      849   163926
10:2  243389   140949    93379  121353071    29585        2      909   140040
11:2  242795   134651    98310  124768957    30895        2     1016   133635
12:2  255234   166622    79843  104696912    26483        4      746   165876
13:2  244944   151595    83855  109808099    27787        3      801   150794
14:2  241301   140982    89935  116954383    30403        6      845   140137
15:2  232271   128564    92821  119185207    31207        4     1416   127148

Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1284167957-3675-1-git-send-email-venki@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a171138a9402..aa16cf1eb8fe 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3031,7 +3031,14 @@ redo:
 
 	if (!ld_moved) {
 		schedstat_inc(sd, lb_failed[idle]);
-		sd->nr_balance_failed++;
+		/*
+		 * Increment the failure counter only on periodic balance.
+		 * We do not want newidle balance, which can be very
+		 * frequent, pollute the failure counter causing
+		 * excessive cache_hot migrations and active balances.
+		 */
+		if (idle != CPU_NEWLY_IDLE)
+			sd->nr_balance_failed++;
 
 		if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
 					this_cpu)) {
-- 
cgit v1.2.2


From b0a0f667a349247bd7f05f806b662a25653822bc Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 6 Oct 2010 17:32:51 -0700
Subject: sched: suppress RCU lockdep splat in task_fork_fair

> ===================================================
> [ INFO: suspicious rcu_dereference_check() usage. ]
> ---------------------------------------------------
> /home/greearb/git/linux.wireless-testing/kernel/sched.c:618 invoked rcu_dereference_check() without protection!
>
> other info that might help us debug this:
>
> rcu_scheduler_active = 1, debug_locks = 1
> 1 lock held by ifup/23517:
>   #0:  (&rq->lock){-.-.-.}, at: [<c042f782>] task_fork_fair+0x3b/0x108
>
> stack backtrace:
> Pid: 23517, comm: ifup Not tainted 2.6.36-rc6-wl+ #5
> Call Trace:
>   [<c075e219>] ? printk+0xf/0x16
>   [<c0455842>] lockdep_rcu_dereference+0x74/0x7d
>   [<c0426854>] task_group+0x6d/0x79
>   [<c042686e>] set_task_rq+0xe/0x57
>   [<c042f79e>] task_fork_fair+0x57/0x108
>   [<c042e965>] sched_fork+0x82/0xf9
>   [<c04334b3>] copy_process+0x569/0xe8e
>   [<c0433ef0>] do_fork+0x118/0x262
>   [<c076302f>] ? do_page_fault+0x16a/0x2cf
>   [<c044b80c>] ? up_read+0x16/0x2a
>   [<c04085ae>] sys_clone+0x1b/0x20
>   [<c04030a5>] ptregs_clone+0x15/0x30
>   [<c0402f1c>] ? sysenter_do_call+0x12/0x38

Here a newly created task is having its runqueue assigned.  The new task
is not yet on the tasklist, so cannot go away.  This is therefore a false
positive, suppress with an RCU read-side critical section.

Reported-by: Ben Greear <greearb@candelatech.com
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Ben Greear <greearb@candelatech.com
---
 kernel/sched_fair.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index db3f674ca49d..5f996d36ac5d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3751,8 +3751,11 @@ static void task_fork_fair(struct task_struct *p)
 
 	update_rq_clock(rq);
 
-	if (unlikely(task_cpu(p) != this_cpu))
+	if (unlikely(task_cpu(p) != this_cpu)) {
+		rcu_read_lock();
 		__set_task_cpu(p, this_cpu);
+		rcu_read_unlock();
+	}
 
 	update_curr(cfs_rq);
 
-- 
cgit v1.2.2


From 864616ee6785d9fac7a2cd80c01a2da89579f2e4 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Thu, 14 Oct 2010 16:09:13 +0900
Subject: sched: Comment updates: fix default latency and granularity numbers

Targeted preemption latency and minimal preemption granularity
for CPU-bound tasks have been changed.

This patch updates the comments about these values.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <20101014160913.eb24fef4.yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 623e9aceef8f..bf87192e97fe 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -25,7 +25,7 @@
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
- * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * NOTE: this latency value is not the same as the concept of
  * 'timeslice length' - timeslices in CFS are of variable length
@@ -52,7 +52,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
 
 /*
  * Minimal preemption granularity for CPU-bound tasks:
- * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
 unsigned int sysctl_sched_min_granularity = 750000ULL;
 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
-- 
cgit v1.2.2


From 2582f0eba54066b5e98ff2b27ef0cfa833b59f54 Mon Sep 17 00:00:00 2001
From: Nikhil Rao <ncrao@google.com>
Date: Wed, 13 Oct 2010 12:09:36 -0700
Subject: sched: Set group_imb only a task can be pulled from the busiest cpu

When cycling through sched groups to determine the busiest group, set
group_imb only if the busiest cpu has more than 1 runnable task. This patch
fixes the case where two cpus in a group have one runnable task each, but there
is a large weight differential between these two tasks. The load balancer is
unable to migrate any task from this group, and hence do not consider this
group to be imbalanced.

Signed-off-by: Nikhil Rao <ncrao@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1286996978-7007-3-git-send-email-ncrao@google.com>
[ small code readability edits ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index bf87192e97fe..3656480e0f79 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2378,7 +2378,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 			int local_group, const struct cpumask *cpus,
 			int *balance, struct sg_lb_stats *sgs)
 {
-	unsigned long load, max_cpu_load, min_cpu_load;
+	unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
 	int i;
 	unsigned int balance_cpu = -1, first_idle_cpu = 0;
 	unsigned long avg_load_per_task = 0;
@@ -2389,6 +2389,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	/* Tally up the load of all CPUs in the group */
 	max_cpu_load = 0;
 	min_cpu_load = ~0UL;
+	max_nr_running = 0;
 
 	for_each_cpu_and(i, sched_group_cpus(group), cpus) {
 		struct rq *rq = cpu_rq(i);
@@ -2406,8 +2407,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 			load = target_load(i, load_idx);
 		} else {
 			load = source_load(i, load_idx);
-			if (load > max_cpu_load)
+			if (load > max_cpu_load) {
 				max_cpu_load = load;
+				max_nr_running = rq->nr_running;
+			}
 			if (min_cpu_load > load)
 				min_cpu_load = load;
 		}
@@ -2447,11 +2450,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	if (sgs->sum_nr_running)
 		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
-	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
 		sgs->group_imb = 1;
 
-	sgs->group_capacity =
-		DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
 	if (!sgs->group_capacity)
 		sgs->group_capacity = fix_small_capacity(sd, group);
 }
-- 
cgit v1.2.2


From fab476228ba37907ad75216d0fd9732ada9c119e Mon Sep 17 00:00:00 2001
From: Nikhil Rao <ncrao@google.com>
Date: Fri, 15 Oct 2010 13:12:29 -0700
Subject: sched: Force balancing on newidle balance if local group has capacity

This patch forces a load balance on a newly idle cpu when the local group has
extra capacity and the busiest group does not have any. It improves system
utilization when balancing tasks with a large weight differential.

Under certain situations, such as a niced down task (i.e. nice = -15) in the
presence of nr_cpus NICE0 tasks, the niced task lands on a sched group and
kicks away other tasks because of its large weight. This leads to sub-optimal
utilization of the machine. Even though the sched group has capacity, it does
not pull tasks because sds.this_load >> sds.max_load, and f_b_g() returns NULL.

With this patch, if the local group has extra capacity, we shortcut the checks
in f_b_g() and try to pull a task over. A sched group has extra capacity if the
group capacity is greater than the number of running tasks in that group.

Thanks to Mike Galbraith for discussions leading to this patch and for the
insight to reuse SD_NEWIDLE_BALANCE.

Signed-off-by: Nikhil Rao <ncrao@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1287173550-30365-4-git-send-email-ncrao@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3656480e0f79..032b548be0fc 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1764,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
 	set_task_cpu(p, this_cpu);
 	activate_task(this_rq, p, 0);
 	check_preempt_curr(this_rq, p, 0);
+
+	/* re-arm NEWIDLE balancing when moving tasks */
+	src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
+	this_rq->idle_stamp = 0;
 }
 
 /*
@@ -2030,12 +2034,14 @@ struct sd_lb_stats {
 	unsigned long this_load;
 	unsigned long this_load_per_task;
 	unsigned long this_nr_running;
+	unsigned long this_has_capacity;
 
 	/* Statistics of the busiest group */
 	unsigned long max_load;
 	unsigned long busiest_load_per_task;
 	unsigned long busiest_nr_running;
 	unsigned long busiest_group_capacity;
+	unsigned long busiest_has_capacity;
 
 	int group_imb; /* Is there imbalance in this sd */
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2058,6 +2064,7 @@ struct sg_lb_stats {
 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
 	unsigned long group_capacity;
 	int group_imb; /* Is there an imbalance in the group ? */
+	int group_has_capacity; /* Is there extra capacity in the group? */
 };
 
 /**
@@ -2456,6 +2463,9 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
 	if (!sgs->group_capacity)
 		sgs->group_capacity = fix_small_capacity(sd, group);
+
+	if (sgs->group_capacity > sgs->sum_nr_running)
+		sgs->group_has_capacity = 1;
 }
 
 /**
@@ -2554,12 +2564,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 			sds->this = sg;
 			sds->this_nr_running = sgs.sum_nr_running;
 			sds->this_load_per_task = sgs.sum_weighted_load;
+			sds->this_has_capacity = sgs.group_has_capacity;
 		} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
 			sds->max_load = sgs.avg_load;
 			sds->busiest = sg;
 			sds->busiest_nr_running = sgs.sum_nr_running;
 			sds->busiest_group_capacity = sgs.group_capacity;
 			sds->busiest_load_per_task = sgs.sum_weighted_load;
+			sds->busiest_has_capacity = sgs.group_has_capacity;
 			sds->group_imb = sgs.group_imb;
 		}
 
@@ -2756,6 +2768,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 		return fix_small_imbalance(sds, this_cpu, imbalance);
 
 }
+
 /******* find_busiest_group() helpers end here *********************/
 
 /**
@@ -2807,6 +2820,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	 * 4) This group is more busy than the avg busieness at this
 	 *    sched_domain.
 	 * 5) The imbalance is within the specified limit.
+	 *
+	 * Note: when doing newidle balance, if the local group has excess
+	 * capacity (i.e. nr_running < group_capacity) and the busiest group
+	 * does not have any capacity, we force a load balance to pull tasks
+	 * to the local group. In this case, we skip past checks 3, 4 and 5.
 	 */
 	if (!(*balance))
 		goto ret;
@@ -2818,6 +2836,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (!sds.busiest || sds.busiest_nr_running == 0)
 		goto out_balanced;
 
+	/*  SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+	if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
+			!sds.busiest_has_capacity)
+		goto force_balance;
+
 	if (sds.this_load >= sds.max_load)
 		goto out_balanced;
 
@@ -2829,6 +2852,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
 		goto out_balanced;
 
+force_balance:
 	/* Looks like there is an imbalance. Compute it */
 	calculate_imbalance(&sds, this_cpu, imbalance);
 	return sds.busiest;
@@ -3162,10 +3186,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 		interval = msecs_to_jiffies(sd->balance_interval);
 		if (time_after(next_balance, sd->last_balance + interval))
 			next_balance = sd->last_balance + interval;
-		if (pulled_task) {
-			this_rq->idle_stamp = 0;
+		if (pulled_task)
 			break;
-		}
 	}
 
 	raw_spin_lock(&this_rq->lock);
-- 
cgit v1.2.2


From 75dd321d79d495a0ee579e6249ebc38ddbb2667f Mon Sep 17 00:00:00 2001
From: Nikhil Rao <ncrao@google.com>
Date: Fri, 15 Oct 2010 13:12:30 -0700
Subject: sched: Drop group_capacity to 1 only if local group has extra
 capacity

When SD_PREFER_SIBLING is set on a sched domain, drop group_capacity to 1
only if the local group has extra capacity. The extra check prevents the case
where you always pull from the heaviest group when it is already under-utilized
(possible with a large weight task outweighs the tasks on the system).

For example, consider a 16-cpu quad-core quad-socket machine with MC and NUMA
scheduling domains. Let's say we spawn 15 nice0 tasks and one nice-15 task,
and each task is running on one core. In this case, we observe the following
events when balancing at the NUMA domain:

- find_busiest_group() will always pick the sched group containing the niced
  task to be the busiest group.
- find_busiest_queue() will then always pick one of the cpus running the
  nice0 task (never picks the cpu with the nice -15 task since
  weighted_cpuload > imbalance).
- The load balancer fails to migrate the task since it is the running task
  and increments sd->nr_balance_failed.
- It repeats the above steps a few more times until sd->nr_balance_failed > 5,
  at which point it kicks off the active load balancer, wakes up the migration
  thread and kicks the nice 0 task off the cpu.

The load balancer doesn't stop until we kick out all nice 0 tasks from
the sched group, leaving you with 3 idle cpus and one cpu running the
nice -15 task.

When balancing at the NUMA domain, we drop sgs.group_capacity to 1 if the child
domain (in this case MC) has SD_PREFER_SIBLING set.  Subsequent load checks are
not relevant because the niced task has a very large weight.

In this patch, we add an extra condition to the "if(prefer_sibling)" check in
update_sd_lb_stats(). We drop the capacity of a group only if the local group
has extra capacity, ie. nr_running < group_capacity. This patch preserves the
original intent of the prefer_siblings check (to spread tasks across the system
in low utilization scenarios) and fixes the case above.

It helps in the following ways:
- In low utilization cases (where nr_tasks << nr_cpus), we still drop
  group_capacity down to 1 if we prefer siblings.
- On very busy systems (where nr_tasks >> nr_cpus), sgs.nr_running will most
  likely be > sgs.group_capacity.
- When balancing large weight tasks, if the local group does not have extra
  capacity, we do not pick the group with the niced task as the busiest group.
  This prevents failed balances, active migration and the under-utilization
  described above.

Signed-off-by: Nikhil Rao <ncrao@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1287173550-30365-5-git-send-email-ncrao@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 032b548be0fc..f1c615ff39d6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2554,9 +2554,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 		/*
 		 * In case the child domain prefers tasks go to siblings
 		 * first, lower the sg capacity to one so that we'll try
-		 * and move all the excess tasks away.
+		 * and move all the excess tasks away. We lower the capacity
+		 * of a group only if the local group has the capacity to fit
+		 * these excess tasks, i.e. nr_running < group_capacity. The
+		 * extra check prevents the case where you always pull from the
+		 * heaviest group when it is already under-utilized (possible
+		 * with a large weight task outweighs the tasks on the system).
 		 */
-		if (prefer_sibling)
+		if (prefer_sibling && !local_group && sds->this_has_capacity)
 			sgs.group_capacity = min(sgs.group_capacity, 1UL);
 
 		if (local_group) {
-- 
cgit v1.2.2


From 305e6835e05513406fa12820e40e4a8ecb63743c Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venki@google.com>
Date: Mon, 4 Oct 2010 17:03:21 -0700
Subject: sched: Do not account irq time to current task

Scheduler accounts both softirq and interrupt processing times to the
currently running task. This means, if the interrupt processing was
for some other task in the system, then the current task ends up being
penalized as it gets shorter runtime than otherwise.

Change sched task accounting to acoount only actual task time from
currently running task. Now update_curr(), modifies the delta_exec to
depend on rq->clock_task.

Note that this change only handles CONFIG_IRQ_TIME_ACCOUNTING case. We can
extend this to CONFIG_VIRT_CPU_ACCOUNTING with minimal effort. But, thats
for later.

This change will impact scheduling behavior in interrupt heavy conditions.

Tested on a 4-way system with eth0 handled by CPU 2 and a network heavy
task (nc) running on CPU 3 (and no RSS/RFS). With that I have CPU 2
spending 75%+ of its time in irq processing. CPU 3 spending around 35%
time running nc task.

Now, if I run another CPU intensive task on CPU 2, without this change
/proc/<pid>/schedstat shows 100% of time accounted to this task. With this
change, it rightly shows less than 25% accounted to this task as remaining
time is actually spent on irq processing.

Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1286237003-12406-7-git-send-email-venki@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f1c615ff39d6..c358d4081b81 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 static void update_curr(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr = cfs_rq->curr;
-	u64 now = rq_of(cfs_rq)->clock;
+	u64 now = rq_of(cfs_rq)->clock_task;
 	unsigned long delta_exec;
 
 	if (unlikely(!curr))
@@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	/*
 	 * We are starting a new run period:
 	 */
-	se->exec_start = rq_of(cfs_rq)->clock;
+	se->exec_start = rq_of(cfs_rq)->clock_task;
 }
 
 /**************************************************
@@ -1802,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	 * 2) too many balance attempts have failed.
 	 */
 
-	tsk_cache_hot = task_hot(p, rq->clock, sd);
+	tsk_cache_hot = task_hot(p, rq->clock_task, sd);
 	if (!tsk_cache_hot ||
 		sd->nr_balance_failed > sd->cache_nice_tries) {
 #ifdef CONFIG_SCHEDSTATS
-- 
cgit v1.2.2


From aa483808516ca5cacfa0e5849691f64fec25828e Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venki@google.com>
Date: Mon, 4 Oct 2010 17:03:22 -0700
Subject: sched: Remove irq time from available CPU power

The idea was suggested by Peter Zijlstra here:

  http://marc.info/?l=linux-kernel&m=127476934517534&w=2

irq time is technically not available to the tasks running on the CPU.
This patch removes irq time from CPU power piggybacking on
sched_rt_avg_update().

Tested this by keeping CPU X busy with a network intensive task having 75%
oa a single CPU irq processing (hard+soft) on a 4-way system. And start seven
cycle soakers on the system. Without this change, there will be two tasks on
each CPU. With this change, there is a single task on irq busy CPU X and
remaining 7 tasks are spread around among other 3 CPUs.

Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1286237003-12406-8-git-send-email-venki@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c358d4081b81..74cccfae87a8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2275,7 +2275,13 @@ unsigned long scale_rt_power(int cpu)
 	u64 total, available;
 
 	total = sched_avg_period() + (rq->clock - rq->age_stamp);
-	available = total - rq->rt_avg;
+
+	if (unlikely(total < rq->rt_avg)) {
+		/* Ensures that power won't end up being negative */
+		available = 0;
+	} else {
+		available = total - rq->rt_avg;
+	}
 
 	if (unlikely((s64)total < SCHED_LOAD_SCALE))
 		total = SCHED_LOAD_SCALE;
-- 
cgit v1.2.2


From b2b5ce022acf5e9f52f7b78c5579994fdde191d4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 15 Oct 2010 15:24:15 +0200
Subject: sched, cgroup: Fixup broken cgroup movement

Dima noticed that we fail to correct the ->vruntime of sleeping tasks
when we move them between cgroups.

Reported-by: Dima Zavin <dima@android.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Mike Galbraith <efault@gmx.de>
LKML-Reference: <1287150604.29097.1513.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 74cccfae87a8..3acc2a487c18 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3866,13 +3866,26 @@ static void set_curr_task_fair(struct rq *rq)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void moved_group_fair(struct task_struct *p, int on_rq)
+static void task_move_group_fair(struct task_struct *p, int on_rq)
 {
-	struct cfs_rq *cfs_rq = task_cfs_rq(p);
-
-	update_curr(cfs_rq);
+	/*
+	 * If the task was not on the rq at the time of this cgroup movement
+	 * it must have been asleep, sleeping tasks keep their ->vruntime
+	 * absolute on their old rq until wakeup (needed for the fair sleeper
+	 * bonus in place_entity()).
+	 *
+	 * If it was on the rq, we've just 'preempted' it, which does convert
+	 * ->vruntime to a relative base.
+	 *
+	 * Make sure both cases convert their relative position when migrating
+	 * to another cgroup's rq. This does somewhat interfere with the
+	 * fair sleeper stuff for the first placement, but who cares.
+	 */
+	if (!on_rq)
+		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+	set_task_rq(p, task_cpu(p));
 	if (!on_rq)
-		place_entity(cfs_rq, &p->se, 1);
+		p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
 }
 #endif
 
@@ -3924,7 +3937,7 @@ static const struct sched_class fair_sched_class = {
 	.get_rr_interval	= get_rr_interval_fair,
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	.moved_group		= moved_group_fair,
+	.task_move_group	= task_move_group_fair,
 #endif
 };
 
-- 
cgit v1.2.2


From aae6d3ddd8b90f5b2c8d79a2b914d1706d124193 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Fri, 17 Sep 2010 15:02:32 -0700
Subject: sched: Use group weight, idle cpu metrics to fix imbalances during
 idle

Currently we consider a sched domain to be well balanced when the imbalance
is less than the domain's imablance_pct. As the number of cores and threads
are increasing, current values of imbalance_pct (for example 25% for a
NUMA domain) are not enough to detect imbalances like:

a) On a WSM-EP system (two sockets, each having 6 cores and 12 logical threads),
24 cpu-hogging tasks get scheduled as 13 on one socket and 11 on another
socket. Leading to an idle HT cpu.

b) On a hypothetial 2 socket NHM-EX system (each socket having 8 cores and
16 logical threads), 16 cpu-hogging tasks can get scheduled as 9 on one
socket and 7 on another socket. Leaving one core in a socket idle
whereas in another socket we have a core having both its HT siblings busy.

While this issue can be fixed by decreasing the domain's imbalance_pct
(by making it a function of number of logical cpus in the domain), it
can potentially cause more task migrations across sched groups in an
overloaded case.

Fix this by using imbalance_pct only during newly_idle and busy
load balancing. And during idle load balancing, check if there
is an imbalance in number of idle cpu's across the busiest and this
sched_group or if the busiest group has more tasks than its weight that
the idle cpu in this_group can pull.

Reported-by: Nikhil Rao <ncrao@google.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1284760952.2676.11.camel@sbsiddha-MOBL3.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 34 +++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f4f6a8326dd0..034c4f410b36 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2035,13 +2035,16 @@ struct sd_lb_stats {
 	unsigned long this_load_per_task;
 	unsigned long this_nr_running;
 	unsigned long this_has_capacity;
+	unsigned int  this_idle_cpus;
 
 	/* Statistics of the busiest group */
+	unsigned int  busiest_idle_cpus;
 	unsigned long max_load;
 	unsigned long busiest_load_per_task;
 	unsigned long busiest_nr_running;
 	unsigned long busiest_group_capacity;
 	unsigned long busiest_has_capacity;
+	unsigned int  busiest_group_weight;
 
 	int group_imb; /* Is there imbalance in this sd */
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2063,6 +2066,8 @@ struct sg_lb_stats {
 	unsigned long sum_nr_running; /* Nr tasks running in the group */
 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
 	unsigned long group_capacity;
+	unsigned long idle_cpus;
+	unsigned long group_weight;
 	int group_imb; /* Is there an imbalance in the group ? */
 	int group_has_capacity; /* Is there extra capacity in the group? */
 };
@@ -2431,7 +2436,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 		sgs->group_load += load;
 		sgs->sum_nr_running += rq->nr_running;
 		sgs->sum_weighted_load += weighted_cpuload(i);
-
+		if (idle_cpu(i))
+			sgs->idle_cpus++;
 	}
 
 	/*
@@ -2469,6 +2475,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
 	if (!sgs->group_capacity)
 		sgs->group_capacity = fix_small_capacity(sd, group);
+	sgs->group_weight = group->group_weight;
 
 	if (sgs->group_capacity > sgs->sum_nr_running)
 		sgs->group_has_capacity = 1;
@@ -2576,13 +2583,16 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 			sds->this_nr_running = sgs.sum_nr_running;
 			sds->this_load_per_task = sgs.sum_weighted_load;
 			sds->this_has_capacity = sgs.group_has_capacity;
+			sds->this_idle_cpus = sgs.idle_cpus;
 		} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
 			sds->max_load = sgs.avg_load;
 			sds->busiest = sg;
 			sds->busiest_nr_running = sgs.sum_nr_running;
+			sds->busiest_idle_cpus = sgs.idle_cpus;
 			sds->busiest_group_capacity = sgs.group_capacity;
 			sds->busiest_load_per_task = sgs.sum_weighted_load;
 			sds->busiest_has_capacity = sgs.group_has_capacity;
+			sds->busiest_group_weight = sgs.group_weight;
 			sds->group_imb = sgs.group_imb;
 		}
 
@@ -2860,8 +2870,26 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (sds.this_load >= sds.avg_load)
 		goto out_balanced;
 
-	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
-		goto out_balanced;
+	/*
+	 * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
+	 * And to check for busy balance use !idle_cpu instead of
+	 * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
+	 * even when they are idle.
+	 */
+	if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
+		if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+			goto out_balanced;
+	} else {
+		/*
+		 * This cpu is idle. If the busiest group load doesn't
+		 * have more tasks than the number of available cpu's and
+		 * there is no imbalance between this and busiest group
+		 * wrt to idle cpu's, it is balanced.
+		 */
+		if ((sds.this_idle_cpus  <= sds.busiest_idle_cpus + 1) &&
+		    sds.busiest_nr_running <= sds.busiest_group_weight)
+			goto out_balanced;
+	}
 
 force_balance:
 	/* Looks like there is an imbalance. Compute it */
-- 
cgit v1.2.2


From 1e5a74059f9059d330744eac84873b1b99657008 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sun, 31 Oct 2010 12:37:04 +0100
Subject: sched: Fix cross-sched-class wakeup preemption

Instead of dealing with sched classes inside each check_preempt_curr()
implementation, pull out this logic into the generic wakeup preemption
path.

This fixes a hang in KVM (and others) where we are waiting for the
stop machine thread to run ...

Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Tested-by: Marcelo Tosatti <mtosatti@redhat.com>
Tested-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1288891946.2039.31.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 034c4f410b36..52ab113d8bb9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1654,12 +1654,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	int scale = cfs_rq->nr_running >= sched_nr_latency;
 
-	if (unlikely(rt_prio(p->prio)))
-		goto preempt;
-
-	if (unlikely(p->sched_class != &fair_sched_class))
-		return;
-
 	if (unlikely(se == pse))
 		return;
 
-- 
cgit v1.2.2


From b5482cfa1c95a188b3054fa33274806add91bbe5 Mon Sep 17 00:00:00 2001
From: Alex Shi <alex.shi@intel.com>
Date: Tue, 16 Nov 2010 17:34:02 +0800
Subject: sched: Fix volanomark performance regression

Commit fab4762 triggers excessive idle balancing, causing a ~30% loss in
volanomark throughput. Remove idle balancing throttle reset.

Originally-by: Alex Shi <alex.shi@intel.com>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Nikhil Rao <ncrao@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1289928732.5169.211.camel@maggy.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 52ab113d8bb9..ba0556dc7c06 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1758,10 +1758,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
 	set_task_cpu(p, this_cpu);
 	activate_task(this_rq, p, 0);
 	check_preempt_curr(this_rq, p, 0);
-
-	/* re-arm NEWIDLE balancing when moving tasks */
-	src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
-	this_rq->idle_stamp = 0;
 }
 
 /*
-- 
cgit v1.2.2


From d5ad140bc1505a98c0f040937125bfcbb508078f Mon Sep 17 00:00:00 2001
From: Nikhil Rao <ncrao@google.com>
Date: Wed, 17 Nov 2010 11:42:04 -0800
Subject: sched: Fix idle balancing

An earlier commit reverts idle balancing throttling reset to fix a 30%
regression in volanomark throughput. We still need to reset idle_stamp
when we pull a task in newidle balance.

Reported-by: Alex Shi <alex.shi@intel.com>
Signed-off-by: Nikhil Rao <ncrao@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1290022924-3548-1-git-send-email-ncrao@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ba0556dc7c06..00ebd7686676 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3215,8 +3215,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 		interval = msecs_to_jiffies(sd->balance_interval);
 		if (time_after(next_balance, sd->last_balance + interval))
 			next_balance = sd->last_balance + interval;
-		if (pulled_task)
+		if (pulled_task) {
+			this_rq->idle_stamp = 0;
 			break;
+		}
 	}
 
 	raw_spin_lock(&this_rq->lock);
-- 
cgit v1.2.2


From 2069dd75c7d0f49355939e5586daf5a9ab216db7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 15 Nov 2010 15:47:00 -0800
Subject: sched: Rewrite tg_shares_up)

By tracking a per-cpu load-avg for each cfs_rq and folding it into a
global task_group load on each tick we can rework tg_shares_up to be
strictly per-cpu.

This should improve cpu-cgroup performance for smp systems
significantly.

[ Paul: changed to use queueing cfs_rq + bug fixes ]

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20101115234937.580480400@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 164 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 107 insertions(+), 57 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f4f6a8326dd0..d86544b4151c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -417,7 +417,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 	WRT_SYSCTL(sched_min_granularity);
 	WRT_SYSCTL(sched_latency);
 	WRT_SYSCTL(sched_wakeup_granularity);
-	WRT_SYSCTL(sched_shares_ratelimit);
 #undef WRT_SYSCTL
 
 	return 0;
@@ -633,7 +632,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		list_add(&se->group_node, &cfs_rq->tasks);
 	}
 	cfs_rq->nr_running++;
-	se->on_rq = 1;
 }
 
 static void
@@ -647,9 +645,89 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		list_del_init(&se->group_node);
 	}
 	cfs_rq->nr_running--;
-	se->on_rq = 0;
 }
 
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void update_cfs_load(struct cfs_rq *cfs_rq)
+{
+	u64 period = sched_avg_period();
+	u64 now, delta;
+
+	if (!cfs_rq)
+		return;
+
+	now = rq_of(cfs_rq)->clock;
+	delta = now - cfs_rq->load_stamp;
+
+	cfs_rq->load_stamp = now;
+	cfs_rq->load_period += delta;
+	cfs_rq->load_avg += delta * cfs_rq->load.weight;
+
+	while (cfs_rq->load_period > period) {
+		/*
+		 * Inline assembly required to prevent the compiler
+		 * optimising this loop into a divmod call.
+		 * See __iter_div_u64_rem() for another example of this.
+		 */
+		asm("" : "+rm" (cfs_rq->load_period));
+		cfs_rq->load_period /= 2;
+		cfs_rq->load_avg /= 2;
+	}
+}
+
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+			    unsigned long weight)
+{
+	if (se->on_rq)
+		account_entity_dequeue(cfs_rq, se);
+
+	update_load_set(&se->load, weight);
+
+	if (se->on_rq)
+		account_entity_enqueue(cfs_rq, se);
+}
+
+static void update_cfs_shares(struct cfs_rq *cfs_rq)
+{
+	struct task_group *tg;
+	struct sched_entity *se;
+	long load_weight, load, shares;
+
+	if (!cfs_rq)
+		return;
+
+	tg = cfs_rq->tg;
+	se = tg->se[cpu_of(rq_of(cfs_rq))];
+	if (!se)
+		return;
+
+	load = cfs_rq->load.weight;
+
+	load_weight = atomic_read(&tg->load_weight);
+	load_weight -= cfs_rq->load_contribution;
+	load_weight += load;
+
+	shares = (tg->shares * load);
+	if (load_weight)
+		shares /= load_weight;
+
+	if (shares < MIN_SHARES)
+		shares = MIN_SHARES;
+	if (shares > tg->shares)
+		shares = tg->shares;
+
+	reweight_entity(cfs_rq_of(se), se, shares);
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+static inline void update_cfs_load(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+{
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 #ifdef CONFIG_SCHEDSTATS
@@ -771,7 +849,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
+	update_cfs_load(cfs_rq);
 	account_entity_enqueue(cfs_rq, se);
+	update_cfs_shares(cfs_rq);
 
 	if (flags & ENQUEUE_WAKEUP) {
 		place_entity(cfs_rq, se, 0);
@@ -782,6 +862,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	check_spread(cfs_rq, se);
 	if (se != cfs_rq->curr)
 		__enqueue_entity(cfs_rq, se);
+	se->on_rq = 1;
 }
 
 static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +906,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
+	se->on_rq = 0;
+	update_cfs_load(cfs_rq);
 	account_entity_dequeue(cfs_rq, se);
 	update_min_vruntime(cfs_rq);
+	update_cfs_shares(cfs_rq);
 
 	/*
 	 * Normalize the entity after updating the min_vruntime because the
@@ -1055,6 +1139,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		flags = ENQUEUE_WAKEUP;
 	}
 
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		update_cfs_load(cfs_rq);
+		update_cfs_shares(cfs_rq);
+	}
+
 	hrtick_update(rq);
 }
 
@@ -1071,12 +1162,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
+
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight)
 			break;
 		flags |= DEQUEUE_SLEEP;
 	}
 
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		update_cfs_load(cfs_rq);
+		update_cfs_shares(cfs_rq);
+	}
+
 	hrtick_update(rq);
 }
 
@@ -1143,51 +1242,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
  * Adding load to a group doesn't make a group heavier, but can cause movement
  * of group shares between cpus. Assuming the shares were perfectly aligned one
  * can calculate the shift in shares.
- *
- * The problem is that perfectly aligning the shares is rather expensive, hence
- * we try to avoid doing that too often - see update_shares(), which ratelimits
- * this change.
- *
- * We compensate this by not only taking the current delta into account, but
- * also considering the delta between when the shares were last adjusted and
- * now.
- *
- * We still saw a performance dip, some tracing learned us that between
- * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
- * significantly. Therefore try to bias the error in direction of failing
- * the affine wakeup.
- *
  */
-static long effective_load(struct task_group *tg, int cpu,
-		long wl, long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
 
 	if (!tg->parent)
 		return wl;
 
-	/*
-	 * By not taking the decrease of shares on the other cpu into
-	 * account our error leans towards reducing the affine wakeups.
-	 */
-	if (!wl && sched_feat(ASYM_EFF_LOAD))
-		return wl;
-
 	for_each_sched_entity(se) {
 		long S, rw, s, a, b;
-		long more_w;
-
-		/*
-		 * Instead of using this increment, also add the difference
-		 * between when the shares were last updated and now.
-		 */
-		more_w = se->my_q->load.weight - se->my_q->rq_weight;
-		wl += more_w;
-		wg += more_w;
 
 		S = se->my_q->tg->shares;
-		s = se->my_q->shares;
-		rw = se->my_q->rq_weight;
+		s = se->load.weight;
+		rw = se->my_q->load.weight;
 
 		a = S*(rw + wl);
 		b = S*rw + s*wg;
@@ -1508,23 +1576,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 			sd = tmp;
 	}
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	if (sched_feat(LB_SHARES_UPDATE)) {
-		/*
-		 * Pick the largest domain to update shares over
-		 */
-		tmp = sd;
-		if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
-			tmp = affine_sd;
-
-		if (tmp) {
-			raw_spin_unlock(&rq->lock);
-			update_shares(tmp);
-			raw_spin_lock(&rq->lock);
-		}
-	}
-#endif
-
 	if (affine_sd) {
 		if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
 			return select_idle_sibling(p, cpu);
@@ -3014,7 +3065,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
-	update_shares(sd);
 	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
 				   cpus, balance);
 
@@ -3156,8 +3206,6 @@ out_one_pinned:
 	else
 		ld_moved = 0;
 out:
-	if (ld_moved)
-		update_shares(sd);
 	return ld_moved;
 }
 
@@ -3549,6 +3597,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 	int update_next_balance = 0;
 	int need_serialize;
 
+	update_shares(cpu);
+
 	for_each_domain(cpu, sd) {
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;
-- 
cgit v1.2.2


From 3d4b47b4b040c9d77dd68104cfc1055d89a55afd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 15 Nov 2010 15:47:01 -0800
Subject: sched: Implement on-demand (active) cfs_rq list

Make certain load-balance actions scale per number of active cgroups
instead of the number of existing cgroups.

This makes wakeup/sleep paths more expensive, but is a win for systems
where the vast majority of existing cgroups are idle.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20101115234937.666535048@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 46 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 40 insertions(+), 6 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index d86544b4151c..0560e72bd732 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -143,6 +143,24 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 	return cfs_rq->tg->cfs_rq[this_cpu];
 }
 
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	if (!cfs_rq->on_list) {
+		list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+				&rq_of(cfs_rq)->leaf_cfs_rq_list);
+
+		cfs_rq->on_list = 1;
+	}
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->on_list) {
+		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+		cfs_rq->on_list = 0;
+	}
+}
+
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
@@ -246,6 +264,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 	return &cpu_rq(this_cpu)->cfs;
 }
 
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
@@ -648,7 +674,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-static void update_cfs_load(struct cfs_rq *cfs_rq)
+static void update_cfs_load(struct cfs_rq *cfs_rq, int lb)
 {
 	u64 period = sched_avg_period();
 	u64 now, delta;
@@ -673,6 +699,11 @@ static void update_cfs_load(struct cfs_rq *cfs_rq)
 		cfs_rq->load_period /= 2;
 		cfs_rq->load_avg /= 2;
 	}
+
+	if (lb && !cfs_rq->nr_running) {
+		if (cfs_rq->load_avg < (period / 8))
+			list_del_leaf_cfs_rq(cfs_rq);
+	}
 }
 
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
@@ -719,7 +750,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
 	reweight_entity(cfs_rq_of(se), se, shares);
 }
 #else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_load(struct cfs_rq *cfs_rq)
+static inline void update_cfs_load(struct cfs_rq *cfs_rq, int lb)
 {
 }
 
@@ -849,7 +880,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
-	update_cfs_load(cfs_rq);
+	update_cfs_load(cfs_rq, 0);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
 
@@ -863,6 +894,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (se != cfs_rq->curr)
 		__enqueue_entity(cfs_rq, se);
 	se->on_rq = 1;
+
+	if (cfs_rq->nr_running == 1)
+		list_add_leaf_cfs_rq(cfs_rq);
 }
 
 static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -907,7 +941,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	se->on_rq = 0;
-	update_cfs_load(cfs_rq);
+	update_cfs_load(cfs_rq, 0);
 	account_entity_dequeue(cfs_rq, se);
 	update_min_vruntime(cfs_rq);
 	update_cfs_shares(cfs_rq);
@@ -1142,7 +1176,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
-		update_cfs_load(cfs_rq);
+		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
 	}
 
@@ -1172,7 +1206,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
-		update_cfs_load(cfs_rq);
+		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
 	}
 
-- 
cgit v1.2.2


From 9e3081ca61147b29f52fddb4f7c6b6b82ea5eb7a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 15 Nov 2010 15:47:02 -0800
Subject: sched: Make tg_shares_up() walk on-demand

Make tg_shares_up() use the active cgroup list, this means we cannot
do a strict bottom-up walk of the hierarchy, but assuming its a very
wide tree with a small number of active groups it should be a win.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20101115234937.754159484@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0560e72bd732..46ff6587dc16 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2004,6 +2004,60 @@ out:
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * update tg->load_weight by folding this cpu's load_avg
+ */
+static int tg_shares_up(struct task_group *tg, int cpu)
+{
+	struct cfs_rq *cfs_rq;
+	unsigned long flags;
+	struct rq *rq;
+	long load_avg;
+
+	if (!tg->se[cpu])
+		return 0;
+
+	rq = cpu_rq(cpu);
+	cfs_rq = tg->cfs_rq[cpu];
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+
+	update_rq_clock(rq);
+	update_cfs_load(cfs_rq, 1);
+
+	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+	load_avg -= cfs_rq->load_contribution;
+	atomic_add(load_avg, &tg->load_weight);
+	cfs_rq->load_contribution += load_avg;
+
+	/*
+	 * We need to update shares after updating tg->load_weight in
+	 * order to adjust the weight of groups with long running tasks.
+	 */
+	update_cfs_shares(cfs_rq);
+
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	return 0;
+}
+
+static void update_shares(int cpu)
+{
+	struct cfs_rq *cfs_rq;
+	struct rq *rq = cpu_rq(cpu);
+
+	rcu_read_lock();
+	for_each_leaf_cfs_rq(rq, cfs_rq) {
+		struct task_group *tg = cfs_rq->tg;
+
+		do {
+			tg_shares_up(tg, cpu);
+			tg = tg->parent;
+		} while (tg);
+	}
+	rcu_read_unlock();
+}
+
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
@@ -2051,6 +2105,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	return max_load_move - rem_load_move;
 }
 #else
+static inline void update_shares(int cpu)
+{
+}
+
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
-- 
cgit v1.2.2


From f0d7442a5924a802b66eef79b3708f77297bfb35 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Mon, 15 Nov 2010 15:47:03 -0800
Subject: sched: Fix load corruption from update_cfs_shares()

As part of enqueue_entity both a new entity weight and its contribution to the
queuing cfs_rq / rq are updated.  Since update_cfs_shares will only update the
queueing weights when the entity is on_rq (which in this case it is not yet),
there's a dependency loop here:

update_cfs_shares needs account_entity_enqueue to update cfs_rq->load.weight
account_entity_enqueue needs the updated weight for the queuing cfs_rq load[*]

Fix this and avoid spurious dequeue/enqueues by issuing update_cfs_shares as
if we had accounted the enqueue already.

This was also resulting in rq->load corruption previously.

[*]: this dependency also exists when using the group cfs_rq w/
     update_cfs_shares as the weight of the enqueued entity changes
     without the load being updated.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20101115234937.844900206@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 46ff6587dc16..d52b97a04e7a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -718,7 +718,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 		account_entity_enqueue(cfs_rq, se);
 }
 
-static void update_cfs_shares(struct cfs_rq *cfs_rq)
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
 {
 	struct task_group *tg;
 	struct sched_entity *se;
@@ -732,7 +732,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
 	if (!se)
 		return;
 
-	load = cfs_rq->load.weight;
+	load = cfs_rq->load.weight + weight_delta;
 
 	load_weight = atomic_read(&tg->load_weight);
 	load_weight -= cfs_rq->load_contribution;
@@ -754,7 +754,7 @@ static inline void update_cfs_load(struct cfs_rq *cfs_rq, int lb)
 {
 }
 
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
 {
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -881,8 +881,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	update_curr(cfs_rq);
 	update_cfs_load(cfs_rq, 0);
+	update_cfs_shares(cfs_rq, se->load.weight);
 	account_entity_enqueue(cfs_rq, se);
-	update_cfs_shares(cfs_rq);
 
 	if (flags & ENQUEUE_WAKEUP) {
 		place_entity(cfs_rq, se, 0);
@@ -944,7 +944,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	update_cfs_load(cfs_rq, 0);
 	account_entity_dequeue(cfs_rq, se);
 	update_min_vruntime(cfs_rq);
-	update_cfs_shares(cfs_rq);
+	update_cfs_shares(cfs_rq, 0);
 
 	/*
 	 * Normalize the entity after updating the min_vruntime because the
@@ -1177,7 +1177,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 		update_cfs_load(cfs_rq, 0);
-		update_cfs_shares(cfs_rq);
+		update_cfs_shares(cfs_rq, 0);
 	}
 
 	hrtick_update(rq);
@@ -1207,7 +1207,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 		update_cfs_load(cfs_rq, 0);
-		update_cfs_shares(cfs_rq);
+		update_cfs_shares(cfs_rq, 0);
 	}
 
 	hrtick_update(rq);
@@ -2034,7 +2034,7 @@ static int tg_shares_up(struct task_group *tg, int cpu)
 	 * We need to update shares after updating tg->load_weight in
 	 * order to adjust the weight of groups with long running tasks.
 	 */
-	update_cfs_shares(cfs_rq);
+	update_cfs_shares(cfs_rq, 0);
 
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 
-- 
cgit v1.2.2


From e33078baa4d30ad1d0e46d1f62b9e5a63a3e6ee3 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Mon, 15 Nov 2010 15:47:04 -0800
Subject: sched: Fix update_cfs_load() synchronization

Using cfs_rq->nr_running is not sufficient to synchronize update_cfs_load with
the put path since nr_running accounting occurs at deactivation.

It's also not safe to make the removal decision based on load_avg as this fails
with both high periods and low shares.  Resolve this by clipping history after
4 periods without activity.

Note: the above will always occur from update_shares() since in the
last-task-sleep-case that task will still be cfs_rq->curr when update_cfs_load
is called.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20101115234937.933428187@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index d52b97a04e7a..a543a5b202a4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -674,10 +674,11 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-static void update_cfs_load(struct cfs_rq *cfs_rq, int lb)
+static void update_cfs_load(struct cfs_rq *cfs_rq)
 {
 	u64 period = sched_avg_period();
 	u64 now, delta;
+	unsigned long load = cfs_rq->load.weight;
 
 	if (!cfs_rq)
 		return;
@@ -685,9 +686,19 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int lb)
 	now = rq_of(cfs_rq)->clock;
 	delta = now - cfs_rq->load_stamp;
 
+	/* truncate load history at 4 idle periods */
+	if (cfs_rq->load_stamp > cfs_rq->load_last &&
+	    now - cfs_rq->load_last > 4 * period) {
+		cfs_rq->load_period = 0;
+		cfs_rq->load_avg = 0;
+	}
+
 	cfs_rq->load_stamp = now;
 	cfs_rq->load_period += delta;
-	cfs_rq->load_avg += delta * cfs_rq->load.weight;
+	if (load) {
+		cfs_rq->load_last = now;
+		cfs_rq->load_avg += delta * load;
+	}
 
 	while (cfs_rq->load_period > period) {
 		/*
@@ -700,10 +711,8 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int lb)
 		cfs_rq->load_avg /= 2;
 	}
 
-	if (lb && !cfs_rq->nr_running) {
-		if (cfs_rq->load_avg < (period / 8))
-			list_del_leaf_cfs_rq(cfs_rq);
-	}
+	if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
+		list_del_leaf_cfs_rq(cfs_rq);
 }
 
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
@@ -750,7 +759,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
 	reweight_entity(cfs_rq_of(se), se, shares);
 }
 #else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_load(struct cfs_rq *cfs_rq, int lb)
+static inline void update_cfs_load(struct cfs_rq *cfs_rq)
 {
 }
 
@@ -880,7 +889,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
-	update_cfs_load(cfs_rq, 0);
+	update_cfs_load(cfs_rq);
 	update_cfs_shares(cfs_rq, se->load.weight);
 	account_entity_enqueue(cfs_rq, se);
 
@@ -941,7 +950,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	se->on_rq = 0;
-	update_cfs_load(cfs_rq, 0);
+	update_cfs_load(cfs_rq);
 	account_entity_dequeue(cfs_rq, se);
 	update_min_vruntime(cfs_rq);
 	update_cfs_shares(cfs_rq, 0);
@@ -1176,7 +1185,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
-		update_cfs_load(cfs_rq, 0);
+		update_cfs_load(cfs_rq);
 		update_cfs_shares(cfs_rq, 0);
 	}
 
@@ -1206,7 +1215,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
-		update_cfs_load(cfs_rq, 0);
+		update_cfs_load(cfs_rq);
 		update_cfs_shares(cfs_rq, 0);
 	}
 
@@ -2023,7 +2032,7 @@ static int tg_shares_up(struct task_group *tg, int cpu)
 	raw_spin_lock_irqsave(&rq->lock, flags);
 
 	update_rq_clock(rq);
-	update_cfs_load(cfs_rq, 1);
+	update_cfs_load(cfs_rq);
 
 	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
 	load_avg -= cfs_rq->load_contribution;
-- 
cgit v1.2.2


From 67e86250f8ea7b8f7da53ac25ea73c6bd71f5cd9 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Mon, 15 Nov 2010 15:47:05 -0800
Subject: sched: Introduce hierarchal order on shares update list

Avoid duplicate shares update calls by ensuring children always appear before
parents in rq->leaf_cfs_rq_list.

This allows us to do a single in-order traversal for update_shares().

Since we always enqueue in bottom-up order this reduces to 2 cases:

1) Our parent is already in the list, e.g.

   root
     \
      b
      /\
      c d* (root->b->c already enqueued)

Since d's parent is enqueued we push it to the head of the list, implicitly ahead of b.

2) Our parent does not appear in the list (or we have no parent)

In this case we enqueue to the tail of the list, if our parent is subsequently enqueued
(bottom-up) it will appear to our right by the same rule.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20101115234938.022488865@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a543a5b202a4..b320753aa6c9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -146,8 +146,20 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	if (!cfs_rq->on_list) {
-		list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+		/*
+		 * Ensure we either appear before our parent (if already
+		 * enqueued) or force our parent to appear after us when it is
+		 * enqueued.  The fact that we always enqueue bottom-up
+		 * reduces this to two cases.
+		 */
+		if (cfs_rq->tg->parent &&
+		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
+			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+				&rq_of(cfs_rq)->leaf_cfs_rq_list);
+		} else {
+			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
+		}
 
 		cfs_rq->on_list = 1;
 	}
@@ -2016,7 +2028,7 @@ out:
 /*
  * update tg->load_weight by folding this cpu's load_avg
  */
-static int tg_shares_up(struct task_group *tg, int cpu)
+static int update_shares_cpu(struct task_group *tg, int cpu)
 {
 	struct cfs_rq *cfs_rq;
 	unsigned long flags;
@@ -2056,14 +2068,8 @@ static void update_shares(int cpu)
 	struct rq *rq = cpu_rq(cpu);
 
 	rcu_read_lock();
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
-		struct task_group *tg = cfs_rq->tg;
-
-		do {
-			tg_shares_up(tg, cpu);
-			tg = tg->parent;
-		} while (tg);
-	}
+	for_each_leaf_cfs_rq(rq, cfs_rq)
+		update_shares_cpu(cfs_rq->tg, cpu);
 	rcu_read_unlock();
 }
 
-- 
cgit v1.2.2


From a7a4f8a752ec734b2eab904fc863d5dc873de338 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Mon, 15 Nov 2010 15:47:06 -0800
Subject: sched: Add sysctl_sched_shares_window

Introduce a new sysctl for the shares window and disambiguate it from
sched_time_avg.

A 10ms window appears to be a good compromise between accuracy and performance.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20101115234938.112173964@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index b320753aa6c9..6c84439ce987 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
+/*
+ * The exponential sliding  window over which load is averaged for shares
+ * distribution.
+ * (default: 10msec)
+ */
+unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+
 static const struct sched_class fair_sched_class;
 
 /**************************************************************
@@ -688,7 +695,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
 static void update_cfs_load(struct cfs_rq *cfs_rq)
 {
-	u64 period = sched_avg_period();
+	u64 period = sysctl_sched_shares_window;
 	u64 now, delta;
 	unsigned long load = cfs_rq->load.weight;
 
-- 
cgit v1.2.2


From c66eaf619c0c7937e9ded160ae83b5a7a6b19b56 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Mon, 15 Nov 2010 15:47:07 -0800
Subject: sched: Update shares on idle_balance

Since shares updates are no longer expensive and effectively local, update them
at idle_balance().  This allows us to more quickly redistribute shares to
another cpu when our load becomes idle.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20101115234938.204191702@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 6c84439ce987..33f941dcf88c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3343,6 +3343,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 	 */
 	raw_spin_unlock(&this_rq->lock);
 
+	update_shares(this_cpu);
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
 		int balance = 1;
-- 
cgit v1.2.2


From 3b3d190ec3683d568fd2ebaead5e1ec7f97b6e37 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Mon, 15 Nov 2010 15:47:08 -0800
Subject: sched: Implement demand based update_cfs_load()

When the system is busy, dilation of rq->next_balance makes lb->update_shares()
insufficiently frequent for threads which don't sleep (no dequeue/enqueue
updates).  Adjust for this by making demand based updates based on the
accumulation of execution time sufficient to wrap our averaging window.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20101115234938.291159744@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 33f941dcf88c..e7e2f08e6d01 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -539,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
 }
 
+static void update_cfs_load(struct cfs_rq *cfs_rq);
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
+
 /*
  * Update the current task's runtime statistics. Skip current tasks that
  * are not in our scheduling class.
@@ -558,6 +561,14 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 
 	curr->vruntime += delta_exec_weighted;
 	update_min_vruntime(cfs_rq);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	cfs_rq->load_unacc_exec_time += delta_exec;
+	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
+		update_cfs_load(cfs_rq);
+		update_cfs_shares(cfs_rq, 0);
+	}
+#endif
 }
 
 static void update_curr(struct cfs_rq *cfs_rq)
@@ -713,6 +724,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq)
 	}
 
 	cfs_rq->load_stamp = now;
+	cfs_rq->load_unacc_exec_time = 0;
 	cfs_rq->load_period += delta;
 	if (load) {
 		cfs_rq->load_last = now;
-- 
cgit v1.2.2


From d6b5591829bd348a5fbe1c428d28dea00621cdba Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Mon, 15 Nov 2010 15:47:09 -0800
Subject: sched: Allow update_cfs_load() to update global load

Refactor the global load updates from update_shares_cpu() so that
update_cfs_load() can update global load when it is more than ~10%
out of sync.

The new global_load parameter allows us to force an update, regardless of
the error factor so that we can synchronize w/ update_shares().

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20101115234938.377473595@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 44 +++++++++++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 15 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e7e2f08e6d01..390ce30ff2d0 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -539,7 +539,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
 }
 
-static void update_cfs_load(struct cfs_rq *cfs_rq);
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
 static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
 
 /*
@@ -565,7 +565,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	cfs_rq->load_unacc_exec_time += delta_exec;
 	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
-		update_cfs_load(cfs_rq);
+		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq, 0);
 	}
 #endif
@@ -704,7 +704,22 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-static void update_cfs_load(struct cfs_rq *cfs_rq)
+static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
+					    int global_update)
+{
+	struct task_group *tg = cfs_rq->tg;
+	long load_avg;
+
+	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+	load_avg -= cfs_rq->load_contribution;
+
+	if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
+		atomic_add(load_avg, &tg->load_weight);
+		cfs_rq->load_contribution += load_avg;
+	}
+}
+
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 {
 	u64 period = sysctl_sched_shares_window;
 	u64 now, delta;
@@ -731,6 +746,11 @@ static void update_cfs_load(struct cfs_rq *cfs_rq)
 		cfs_rq->load_avg += delta * load;
 	}
 
+	/* consider updating load contribution on each fold or truncate */
+	if (global_update || cfs_rq->load_period > period
+	    || !cfs_rq->load_period)
+		update_cfs_rq_load_contribution(cfs_rq, global_update);
+
 	while (cfs_rq->load_period > period) {
 		/*
 		 * Inline assembly required to prevent the compiler
@@ -790,7 +810,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
 	reweight_entity(cfs_rq_of(se), se, shares);
 }
 #else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_load(struct cfs_rq *cfs_rq)
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 {
 }
 
@@ -920,7 +940,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
-	update_cfs_load(cfs_rq);
+	update_cfs_load(cfs_rq, 0);
 	update_cfs_shares(cfs_rq, se->load.weight);
 	account_entity_enqueue(cfs_rq, se);
 
@@ -981,7 +1001,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	se->on_rq = 0;
-	update_cfs_load(cfs_rq);
+	update_cfs_load(cfs_rq, 0);
 	account_entity_dequeue(cfs_rq, se);
 	update_min_vruntime(cfs_rq);
 	update_cfs_shares(cfs_rq, 0);
@@ -1216,7 +1236,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
-		update_cfs_load(cfs_rq);
+		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq, 0);
 	}
 
@@ -1246,7 +1266,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
-		update_cfs_load(cfs_rq);
+		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq, 0);
 	}
 
@@ -2052,7 +2072,6 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
 	struct cfs_rq *cfs_rq;
 	unsigned long flags;
 	struct rq *rq;
-	long load_avg;
 
 	if (!tg->se[cpu])
 		return 0;
@@ -2063,12 +2082,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
 	raw_spin_lock_irqsave(&rq->lock, flags);
 
 	update_rq_clock(rq);
-	update_cfs_load(cfs_rq);
-
-	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
-	load_avg -= cfs_rq->load_contribution;
-	atomic_add(load_avg, &tg->load_weight);
-	cfs_rq->load_contribution += load_avg;
+	update_cfs_load(cfs_rq, 1);
 
 	/*
 	 * We need to update shares after updating tg->load_weight in
-- 
cgit v1.2.2


From 70caf8a6c13c2279b35f2ad6b644815533d6c476 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 20 Nov 2010 00:53:51 +0100
Subject: sched: Fix UP build breakage

The recent cgroup-scheduling rework caused a UP build problem.

Cc: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 390ce30ff2d0..82fd884b4e33 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -562,7 +562,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 	curr->vruntime += delta_exec_weighted;
 	update_min_vruntime(cfs_rq);
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
 	cfs_rq->load_unacc_exec_time += delta_exec;
 	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
 		update_cfs_load(cfs_rq, 0);
-- 
cgit v1.2.2


From 43365bd7ff37979d2afdccbe953299ed64a4649b Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Wed, 15 Dec 2010 19:10:17 -0800
Subject: sched: Move periodic share updates to entity_tick()

Long running entities that do not block (dequeue) require periodic updates to
maintain accurate share values.  (Note: group entities with several threads are
quite likely to be non-blocking in many circumstances).

By virtue of being long-running however, we will see entity ticks (otherwise
the required update occurs in dequeue/put and we are done).  Thus we can move
the detection (and associated work) for these updates into the periodic path.

This restores the 'atomicity' of update_curr() with respect to accounting.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20101216031038.067028969@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c88671718bc9..16ee398d8a4e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -564,10 +564,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 
 #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
 	cfs_rq->load_unacc_exec_time += delta_exec;
-	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
-		update_cfs_load(cfs_rq, 0);
-		update_cfs_shares(cfs_rq, 0);
-	}
 #endif
 }
 
@@ -809,6 +805,14 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
 
 	reweight_entity(cfs_rq_of(se), se, shares);
 }
+
+static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
+		update_cfs_load(cfs_rq, 0);
+		update_cfs_shares(cfs_rq, 0);
+	}
+}
 #else /* CONFIG_FAIR_GROUP_SCHED */
 static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 {
@@ -817,6 +821,10 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
 {
 }
+
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -1133,6 +1141,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 */
 	update_curr(cfs_rq);
 
+	/*
+	 * Update share accounting for long-running entities.
+	 */
+	update_entity_shares_tick(cfs_rq);
+
 #ifdef CONFIG_SCHED_HRTICK
 	/*
 	 * queued ticks are scheduled to match the slice, so don't bother
-- 
cgit v1.2.2


From 19e5eebb8eaa5ca3ff8aa18cb57ccb7a9f67277d Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Wed, 15 Dec 2010 19:10:18 -0800
Subject: sched: Fix interactivity bug by charging unaccounted run-time on
 entity re-weight

Mike Galbraith reported poor interactivity[*] when the new shares distribution
code was combined with autogroups.

The root cause turns out to be a mis-ordering of accounting accrued execution
time and shares updates.  Since update_curr() is issued hierarchically,
updating the parent entity weights to reflect child enqueue/dequeue results in
the parent's unaccounted execution time then being accrued (vs vruntime) at the
new weight as opposed to the weight present at accumulation.

While this doesn't have much effect on processes with timeslices that cross a
tick, it is particularly problematic for an interactive process (e.g. Xorg)
which incurs many (tiny) timeslices.  In this scenario almost all updates are
at dequeue which can result in significant fairness perturbation (especially if
it is the only thread, resulting in potential {tg->shares, MIN_SHARES}
transitions).

Correct this by ensuring unaccounted time is accumulated prior to manipulating
an entity's weight.

[*] http://xkcd.com/619/ is perversely Nostradamian here.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <20101216031038.159704378@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 16ee398d8a4e..c62ebae65cf0 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -765,8 +765,12 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 			    unsigned long weight)
 {
-	if (se->on_rq)
+	if (se->on_rq) {
+		/* commit outstanding execution time */
+		if (cfs_rq->curr == se)
+			update_curr(cfs_rq);
 		account_entity_dequeue(cfs_rq, se);
+	}
 
 	update_load_set(&se->load, weight);
 
-- 
cgit v1.2.2


From 977dda7c9b540f48b228174346d8b31542c1e99f Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Fri, 14 Jan 2011 17:57:50 -0800
Subject: sched: Update effective_load() to use global share weights

Previously effective_load would approximate the global load weight present on
a group taking advantage of:

entity_weight = tg->shares ( lw / global_lw ), where entity_weight was provided
by tg_shares_up.

This worked (approximately) for an 'empty' (at tg level) cpu since we would
place boost load representative of what a newly woken task would receive.

However, now that load is instantaneously updated this assumption is no longer
true and the load calculation is rather incorrect in this case.

Fix this (and improve the general case) by re-writing effective_load to take
advantage of the new shares distribution code.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110115015817.069769529@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c62ebae65cf0..414145cf5344 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1362,27 +1362,27 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 		return wl;
 
 	for_each_sched_entity(se) {
-		long S, rw, s, a, b;
+		long lw, w;
 
-		S = se->my_q->tg->shares;
-		s = se->load.weight;
-		rw = se->my_q->load.weight;
+		tg = se->my_q->tg;
+		w = se->my_q->load.weight;
 
-		a = S*(rw + wl);
-		b = S*rw + s*wg;
+		/* use this cpu's instantaneous contribution */
+		lw = atomic_read(&tg->load_weight);
+		lw -= se->my_q->load_contribution;
+		lw += w + wg;
 
-		wl = s*(a-b);
+		wl += w;
 
-		if (likely(b))
-			wl /= b;
+		if (lw > 0 && wl < lw)
+			wl = (wl * tg->shares) / lw;
+		else
+			wl = tg->shares;
 
-		/*
-		 * Assume the group is already running and will
-		 * thus already be accounted for in the weight.
-		 *
-		 * That is, moving shares between CPUs, does not
-		 * alter the group weight.
-		 */
+		/* zero point is MIN_SHARES */
+		if (wl < MIN_SHARES)
+			wl = MIN_SHARES;
+		wl -= se->load.weight;
 		wg = 0;
 	}
 
-- 
cgit v1.2.2


From d7d8294415f0ce4254827d4a2a5ee88b00be52a8 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 5 Jan 2011 05:41:17 +0100
Subject: sched: Fix signed unsigned comparison in check_preempt_tick()

Signed unsigned comparison may lead to superfluous resched if leftmost
is right of the current task, wasting a few cycles, and inadvertently
_lengthening_ the current task's slice.

Reported-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1294202477.9384.5.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 414145cf5344..77e9166d7bbf 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1062,6 +1062,9 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 		struct sched_entity *se = __pick_next_entity(cfs_rq);
 		s64 delta = curr->vruntime - se->vruntime;
 
+		if (delta < 0)
+			return;
+
 		if (delta > ideal_runtime)
 			resched_task(rq_of(cfs_rq)->curr);
 	}
-- 
cgit v1.2.2


From 3ff6dcac735704824c1dff64dc6863c390d364cc Mon Sep 17 00:00:00 2001
From: Yong Zhang <yong.zhang0@gmail.com>
Date: Mon, 24 Jan 2011 15:33:52 +0800
Subject: sched: Fix poor interactivity on UP systems due to group scheduler
 nice tune bug

Michael Witten and Christian Kujau reported that the autogroup
scheduling feature hurts interactivity on their UP systems.

It turns out that this is an older bug in the group scheduling code,
and the wider appeal provided by the autogroup feature exposed it
more prominently.

When on UP with FAIR_GROUP_SCHED enabled, tune shares
only affect tg->shares, but is not reflected in
tg->se->load. The reason is that update_cfs_shares()
does nothing on UP.

So introduce update_cfs_shares() for UP && FAIR_GROUP_SCHED.

This issue was found when enable autogroup scheduling was enabled,
but it is an older bug that also exists on cgroup.cpu on UP.

Reported-and-Tested-by: Michael Witten <mfwitten@gmail.com>
Reported-and-Tested-by: Christian Kujau <christian@nerdbynature.de>
Signed-off-by: Yong Zhang <yong.zhang0@gmail.com>
Acked-by: Pekka Enberg <penberg@kernel.org>
Acked-by: Mike Galbraith <efault@gmx.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <20110124073352.GA24186@windriver.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 78 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 53 insertions(+), 25 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 77e9166d7bbf..354769979c02 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -699,7 +699,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	cfs_rq->nr_running--;
 }
 
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_FAIR_GROUP_SCHED
+# ifdef CONFIG_SMP
 static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
 					    int global_update)
 {
@@ -762,6 +763,51 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 		list_del_leaf_cfs_rq(cfs_rq);
 }
 
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
+				long weight_delta)
+{
+	long load_weight, load, shares;
+
+	load = cfs_rq->load.weight + weight_delta;
+
+	load_weight = atomic_read(&tg->load_weight);
+	load_weight -= cfs_rq->load_contribution;
+	load_weight += load;
+
+	shares = (tg->shares * load);
+	if (load_weight)
+		shares /= load_weight;
+
+	if (shares < MIN_SHARES)
+		shares = MIN_SHARES;
+	if (shares > tg->shares)
+		shares = tg->shares;
+
+	return shares;
+}
+
+static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
+		update_cfs_load(cfs_rq, 0);
+		update_cfs_shares(cfs_rq, 0);
+	}
+}
+# else /* CONFIG_SMP */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+}
+
+static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
+				long weight_delta)
+{
+	return tg->shares;
+}
+
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
+# endif /* CONFIG_SMP */
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 			    unsigned long weight)
 {
@@ -782,7 +828,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
 {
 	struct task_group *tg;
 	struct sched_entity *se;
-	long load_weight, load, shares;
+	long shares;
 
 	if (!cfs_rq)
 		return;
@@ -791,32 +837,14 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
 	se = tg->se[cpu_of(rq_of(cfs_rq))];
 	if (!se)
 		return;
-
-	load = cfs_rq->load.weight + weight_delta;
-
-	load_weight = atomic_read(&tg->load_weight);
-	load_weight -= cfs_rq->load_contribution;
-	load_weight += load;
-
-	shares = (tg->shares * load);
-	if (load_weight)
-		shares /= load_weight;
-
-	if (shares < MIN_SHARES)
-		shares = MIN_SHARES;
-	if (shares > tg->shares)
-		shares = tg->shares;
+#ifndef CONFIG_SMP
+	if (likely(se->load.weight == tg->shares))
+		return;
+#endif
+	shares = calc_cfs_shares(cfs_rq, tg, weight_delta);
 
 	reweight_entity(cfs_rq_of(se), se, shares);
 }
-
-static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
-	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
-		update_cfs_load(cfs_rq, 0);
-		update_cfs_shares(cfs_rq, 0);
-	}
-}
 #else /* CONFIG_FAIR_GROUP_SCHED */
 static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 {
-- 
cgit v1.2.2


From e37b6a7b27b400c3aa488db8c6629a05095bc79c Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Fri, 21 Jan 2011 20:44:59 -0800
Subject: sched: Fix sign under-flows in wake_affine

While care is taken around the zero-point in effective_load to not exceed
the instantaneous rq->weight, it's still possible (e.g. using wake_idx != 0)
for (load + effective_load) to underflow.

In this case the comparing the unsigned values can result in incorrect balanced
decisions.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110122044851.734245014@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 354769979c02..ccecfec02a70 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1432,7 +1432,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
 
 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 {
-	unsigned long this_load, load;
+	s64 this_load, load;
 	int idx, this_cpu, prev_cpu;
 	unsigned long tl_per_task;
 	struct task_group *tg;
@@ -1471,8 +1471,8 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 	 * Otherwise check if either cpus are near enough in load to allow this
 	 * task to be woken on this_cpu.
 	 */
-	if (this_load) {
-		unsigned long this_eff_load, prev_eff_load;
+	if (this_load > 0) {
+		s64 this_eff_load, prev_eff_load;
 
 		this_eff_load = 100;
 		this_eff_load *= power_of(prev_cpu);
-- 
cgit v1.2.2


From b815f1963e47b9b69bb17e0588bd5af5b1114ae0 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Fri, 21 Jan 2011 20:45:00 -0800
Subject: sched: Fix/remove redundant cfs_rq checks

Since updates are against an entity's queuing cfs_rq it's not possible to
enter update_cfs_{shares,load} with a NULL cfs_rq.  (Indeed, update_cfs_load
would crash prior to the check if we did anyway since we load is examined
during the initializers).

Also, in the update_cfs_load case there's no point
in maintaining averages for rq->cfs_rq since we don't perform shares
distribution at that level -- NULL check is replaced accordingly.

Thanks to Dan Carpenter for pointing out the deference before NULL check.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110122044851.825284940@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ccecfec02a70..1997383ba4d6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -722,7 +722,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 	u64 now, delta;
 	unsigned long load = cfs_rq->load.weight;
 
-	if (!cfs_rq)
+	if (cfs_rq->tg == &root_task_group)
 		return;
 
 	now = rq_of(cfs_rq)->clock;
@@ -830,9 +830,6 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
 	struct sched_entity *se;
 	long shares;
 
-	if (!cfs_rq)
-		return;
-
 	tg = cfs_rq->tg;
 	se = tg->se[cpu_of(rq_of(cfs_rq))];
 	if (!se)
-- 
cgit v1.2.2


From 05ca62c6ca17f39b88fa956d5ebc1fa6e93ad5e3 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Fri, 21 Jan 2011 20:45:02 -0800
Subject: sched: Use rq->clock_task instead of rq->clock for correctly
 maintaining load averages

The delta in clock_task is a more fair attribution of how much time a tg has
been contributing load to the current cpu.

While not really important it also means we're more in sync (by magnitude)
with respect to periodic updates (since __update_curr deltas are clock_task
based).

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110122044852.007092349@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 1997383ba4d6..0c26e2df450e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -725,7 +725,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 	if (cfs_rq->tg == &root_task_group)
 		return;
 
-	now = rq_of(cfs_rq)->clock;
+	now = rq_of(cfs_rq)->clock_task;
 	delta = now - cfs_rq->load_stamp;
 
 	/* truncate load history at 4 idle periods */
-- 
cgit v1.2.2


From 6d5ab2932a21ea54406ab95c43ecff90a3eddfda Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Fri, 21 Jan 2011 20:45:01 -0800
Subject: sched: Simplify update_cfs_shares parameters

Re-visiting this: Since update_cfs_shares will now only ever re-weight an
entity that is a relative parent of the current entity in enqueue_entity; we
can safely issue the account_entity_enqueue relative to that cfs_rq and avoid
the requirement for special handling of the enqueue case in update_cfs_shares.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110122044851.915214637@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0c26e2df450e..0c550c841eee 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -540,7 +540,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
-static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
+static void update_cfs_shares(struct cfs_rq *cfs_rq);
 
 /*
  * Update the current task's runtime statistics. Skip current tasks that
@@ -763,16 +763,15 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 		list_del_leaf_cfs_rq(cfs_rq);
 }
 
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
-				long weight_delta)
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 {
 	long load_weight, load, shares;
 
-	load = cfs_rq->load.weight + weight_delta;
+	load = cfs_rq->load.weight;
 
 	load_weight = atomic_read(&tg->load_weight);
-	load_weight -= cfs_rq->load_contribution;
 	load_weight += load;
+	load_weight -= cfs_rq->load_contribution;
 
 	shares = (tg->shares * load);
 	if (load_weight)
@@ -790,7 +789,7 @@ static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
 {
 	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
 		update_cfs_load(cfs_rq, 0);
-		update_cfs_shares(cfs_rq, 0);
+		update_cfs_shares(cfs_rq);
 	}
 }
 # else /* CONFIG_SMP */
@@ -798,8 +797,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 {
 }
 
-static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
-				long weight_delta)
+static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 {
 	return tg->shares;
 }
@@ -824,7 +822,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 		account_entity_enqueue(cfs_rq, se);
 }
 
-static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+static void update_cfs_shares(struct cfs_rq *cfs_rq)
 {
 	struct task_group *tg;
 	struct sched_entity *se;
@@ -838,7 +836,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
 	if (likely(se->load.weight == tg->shares))
 		return;
 #endif
-	shares = calc_cfs_shares(cfs_rq, tg, weight_delta);
+	shares = calc_cfs_shares(cfs_rq, tg);
 
 	reweight_entity(cfs_rq_of(se), se, shares);
 }
@@ -847,7 +845,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 {
 }
 
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 {
 }
 
@@ -978,8 +976,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	update_curr(cfs_rq);
 	update_cfs_load(cfs_rq, 0);
-	update_cfs_shares(cfs_rq, se->load.weight);
 	account_entity_enqueue(cfs_rq, se);
+	update_cfs_shares(cfs_rq);
 
 	if (flags & ENQUEUE_WAKEUP) {
 		place_entity(cfs_rq, se, 0);
@@ -1041,7 +1039,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	update_cfs_load(cfs_rq, 0);
 	account_entity_dequeue(cfs_rq, se);
 	update_min_vruntime(cfs_rq);
-	update_cfs_shares(cfs_rq, 0);
+	update_cfs_shares(cfs_rq);
 
 	/*
 	 * Normalize the entity after updating the min_vruntime because the
@@ -1282,7 +1280,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 		update_cfs_load(cfs_rq, 0);
-		update_cfs_shares(cfs_rq, 0);
+		update_cfs_shares(cfs_rq);
 	}
 
 	hrtick_update(rq);
@@ -1312,7 +1310,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 		update_cfs_load(cfs_rq, 0);
-		update_cfs_shares(cfs_rq, 0);
+		update_cfs_shares(cfs_rq);
 	}
 
 	hrtick_update(rq);
@@ -2123,7 +2121,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
 	 * We need to update shares after updating tg->load_weight in
 	 * order to adjust the weight of groups with long running tasks.
 	 */
-	update_cfs_shares(cfs_rq, 0);
+	update_cfs_shares(cfs_rq);
 
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 
-- 
cgit v1.2.2


From f07333bf6ee66d9b49286cec4371cf375e745b7a Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Fri, 21 Jan 2011 20:45:03 -0800
Subject: sched: Avoid expensive initial update_cfs_load()

Since cfs->{load_stamp,load_last} are zero-initalized the initial load update
will consider the delta to be 'since the beginning of time'.

This results in a lot of pointless divisions to bring this large period to be
within the sysctl_sched_shares_window.

Fix this by initializing load_stamp to be 1 at cfs_rq initialization, this
allows for an initial load_stamp > load_last which then lets standard idle
truncation proceed.

We avoid spinning (and slightly improve consistency) by fixing delta to be
[period - 1] in this path resulting in a slightly more predictable shares ramp.
(Previously the amount of idle time preserved by the overflow would range between
[period/2,period-1].)

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110122044852.102126037@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0c550c841eee..4cbc9121094c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -733,6 +733,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 	    now - cfs_rq->load_last > 4 * period) {
 		cfs_rq->load_period = 0;
 		cfs_rq->load_avg = 0;
+		delta = period - 1;
 	}
 
 	cfs_rq->load_stamp = now;
-- 
cgit v1.2.2


From da7a735e51f9622eb3e1672594d4a41da01d7e4f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 17 Jan 2011 17:03:27 +0100
Subject: sched: Fix switch_from_fair()

When a task is taken out of the fair class we must ensure the vruntime
is properly normalized because when we put it back in it will assume
to be normalized.

The case that goes wrong is when changing away from the fair class
while sleeping. Sleeping tasks have non-normalized vruntime in order
to make sleeper-fairness work. So treat the switch away from fair as a
wakeup and preserve the relative vruntime.

Also update sysrq-n to call the ->switch_{to,from} methods.

Reported-by: Onkalo Samu <samu.p.onkalo@nokia.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 42 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 36 insertions(+), 6 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 4cbc9121094c..55040f3938d8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -4078,33 +4078,62 @@ static void task_fork_fair(struct task_struct *p)
  * Priority of the task has changed. Check to see if we preempt
  * the current task.
  */
-static void prio_changed_fair(struct rq *rq, struct task_struct *p,
-			      int oldprio, int running)
+static void
+prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
 {
+	if (!p->se.on_rq)
+		return;
+
 	/*
 	 * Reschedule if we are currently running on this runqueue and
 	 * our priority decreased, or if we are not currently running on
 	 * this runqueue and our priority is higher than the current's
 	 */
-	if (running) {
+	if (rq->curr == p) {
 		if (p->prio > oldprio)
 			resched_task(rq->curr);
 	} else
 		check_preempt_curr(rq, p, 0);
 }
 
+static void switched_from_fair(struct rq *rq, struct task_struct *p)
+{
+	struct sched_entity *se = &p->se;
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+	/*
+	 * Ensure the task's vruntime is normalized, so that when its
+	 * switched back to the fair class the enqueue_entity(.flags=0) will
+	 * do the right thing.
+	 *
+	 * If it was on_rq, then the dequeue_entity(.flags=0) will already
+	 * have normalized the vruntime, if it was !on_rq, then only when
+	 * the task is sleeping will it still have non-normalized vruntime.
+	 */
+	if (!se->on_rq && p->state != TASK_RUNNING) {
+		/*
+		 * Fix up our vruntime so that the current sleep doesn't
+		 * cause 'unlimited' sleep bonus.
+		 */
+		place_entity(cfs_rq, se, 0);
+		se->vruntime -= cfs_rq->min_vruntime;
+	}
+}
+
 /*
  * We switched to the sched_fair class.
  */
-static void switched_to_fair(struct rq *rq, struct task_struct *p,
-			     int running)
+static void switched_to_fair(struct rq *rq, struct task_struct *p)
 {
+	if (!p->se.on_rq)
+		return;
+
 	/*
 	 * We were most likely switched from sched_rt, so
 	 * kick off the schedule if running, otherwise just see
 	 * if we can still preempt the current task.
 	 */
-	if (running)
+	if (rq->curr == p)
 		resched_task(rq->curr);
 	else
 		check_preempt_curr(rq, p, 0);
@@ -4190,6 +4219,7 @@ static const struct sched_class fair_sched_class = {
 	.task_fork		= task_fork_fair,
 
 	.prio_changed		= prio_changed_fair,
+	.switched_from		= switched_from_fair,
 	.switched_to		= switched_to_fair,
 
 	.get_rr_interval	= get_rr_interval_fair,
-- 
cgit v1.2.2


From 725e7580aaf98e9f7b22b8ccfc640ad0c09e2b08 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Tue, 1 Feb 2011 09:47:15 -0500
Subject: sched: Check the right ->nr_running in yield_task_fair()

With CONFIG_FAIR_GROUP_SCHED, each task_group has its own cfs_rq.
Yielding to a task from another cfs_rq may be worthwhile, since
a process calling yield typically cannot use the CPU right now.

Therefor, we want to check the per-cpu nr_running, not the
cgroup local one.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110201094715.798c4f86@annuminas.surriel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 55040f3938d8..4de9905228c4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1331,7 +1331,7 @@ static void yield_task_fair(struct rq *rq)
 	/*
 	 * Are we the only task in the tree?
 	 */
-	if (unlikely(cfs_rq->nr_running == 1))
+	if (unlikely(rq->nr_running == 1))
 		return;
 
 	clear_buddies(cfs_rq, se);
-- 
cgit v1.2.2


From 2c13c919d9e9a3db9896143a501f83dcbbe1ced4 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Tue, 1 Feb 2011 09:48:37 -0500
Subject: sched: Limit the scope of clear_buddies

The clear_buddies function does not seem to play well with the concept
of hierarchical runqueues.  In the following tree, task groups are
represented by 'G', tasks by 'T', next by 'n' and last by 'l'.

     (nl)
    /    \
   G(nl)  G
   / \     \
 T(l) T(n)  T

This situation can arise when a task is woken up T(n), and the previously
running task T(l) is marked last.

When clear_buddies is called from either T(l) or T(n), the next and last
buddies of the group G(nl) will be cleared.  This is not the desired
result, since we would like to be able to find the other type of buddy
in many cases.

This especially a worry when implementing yield_task_fair through the
buddy system.

The fix is simple: only clear the buddy type that the task itself
is indicated to be.  As an added bonus, we stop walking up the tree
when the buddy has already been cleared or pointed elsewhere.

Signed-off-by: Rik van Riel <riel@redhat.coM>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110201094837.6b0962a9@annuminas.surriel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 4de9905228c4..a785e08202cf 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -995,19 +995,35 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		list_add_leaf_cfs_rq(cfs_rq);
 }
 
-static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void __clear_buddies_last(struct sched_entity *se)
 {
-	if (!se || cfs_rq->last == se)
-		cfs_rq->last = NULL;
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+		if (cfs_rq->last == se)
+			cfs_rq->last = NULL;
+		else
+			break;
+	}
+}
 
-	if (!se || cfs_rq->next == se)
-		cfs_rq->next = NULL;
+static void __clear_buddies_next(struct sched_entity *se)
+{
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+		if (cfs_rq->next == se)
+			cfs_rq->next = NULL;
+		else
+			break;
+	}
 }
 
 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	for_each_sched_entity(se)
-		__clear_buddies(cfs_rq_of(se), se);
+	if (cfs_rq->last == se)
+		__clear_buddies_last(se);
+
+	if (cfs_rq->next == se)
+		__clear_buddies_next(se);
 }
 
 static void
-- 
cgit v1.2.2


From ac53db596cc08ecb8040cfb6f71ae40c6f2041c4 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Tue, 1 Feb 2011 09:51:03 -0500
Subject: sched: Use a buddy to implement yield_task_fair()

Use the buddy mechanism to implement yield_task_fair.  This
allows us to skip onto the next highest priority se at every
level in the CFS tree, unless doing so would introduce gross
unfairness in CPU time distribution.

We order the buddy selection in pick_next_entity to check
yield first, then last, then next.  We need next to be able
to override yield, because it is possible for the "next" and
"yield" task to be different processen in the same sub-tree
of the CFS tree.  When they are, we need to go into that
sub-tree regardless of the "yield" hint, and pick the correct
entity once we get to the right level.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110201095103.3a79e92a@annuminas.surriel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 148 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 88 insertions(+), 60 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a785e08202cf..c0fbeb992833 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -68,14 +68,6 @@ static unsigned int sched_nr_latency = 8;
  */
 unsigned int sysctl_sched_child_runs_first __read_mostly;
 
-/*
- * sys_sched_yield() compat mode
- *
- * This option switches the agressive yield implementation of the
- * old scheduler back on.
- */
-unsigned int __read_mostly sysctl_sched_compat_yield;
-
 /*
  * SCHED_OTHER wake-up granularity.
  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
@@ -419,7 +411,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 }
 
-static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *left = cfs_rq->rb_leftmost;
 
@@ -429,6 +421,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
 	return rb_entry(left, struct sched_entity, run_node);
 }
 
+static struct sched_entity *__pick_next_entity(struct sched_entity *se)
+{
+	struct rb_node *next = rb_next(&se->run_node);
+
+	if (!next)
+		return NULL;
+
+	return rb_entry(next, struct sched_entity, run_node);
+}
+
+#ifdef CONFIG_SCHED_DEBUG
 static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -443,7 +446,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
  * Scheduling class statistics methods:
  */
 
-#ifdef CONFIG_SCHED_DEBUG
 int sched_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
@@ -1017,6 +1019,17 @@ static void __clear_buddies_next(struct sched_entity *se)
 	}
 }
 
+static void __clear_buddies_skip(struct sched_entity *se)
+{
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+		if (cfs_rq->skip == se)
+			cfs_rq->skip = NULL;
+		else
+			break;
+	}
+}
+
 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	if (cfs_rq->last == se)
@@ -1024,6 +1037,9 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 	if (cfs_rq->next == se)
 		__clear_buddies_next(se);
+
+	if (cfs_rq->skip == se)
+		__clear_buddies_skip(se);
 }
 
 static void
@@ -1099,7 +1115,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 		return;
 
 	if (cfs_rq->nr_running > 1) {
-		struct sched_entity *se = __pick_next_entity(cfs_rq);
+		struct sched_entity *se = __pick_first_entity(cfs_rq);
 		s64 delta = curr->vruntime - se->vruntime;
 
 		if (delta < 0)
@@ -1143,13 +1159,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static int
 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
 
+/*
+ * Pick the next process, keeping these things in mind, in this order:
+ * 1) keep things fair between processes/task groups
+ * 2) pick the "next" process, since someone really wants that to run
+ * 3) pick the "last" process, for cache locality
+ * 4) do not run the "skip" process, if something else is available
+ */
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
-	struct sched_entity *se = __pick_next_entity(cfs_rq);
+	struct sched_entity *se = __pick_first_entity(cfs_rq);
 	struct sched_entity *left = se;
 
-	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
-		se = cfs_rq->next;
+	/*
+	 * Avoid running the skip buddy, if running something else can
+	 * be done without getting too unfair.
+	 */
+	if (cfs_rq->skip == se) {
+		struct sched_entity *second = __pick_next_entity(se);
+		if (second && wakeup_preempt_entity(second, left) < 1)
+			se = second;
+	}
 
 	/*
 	 * Prefer last buddy, try to return the CPU to a preempted task.
@@ -1157,6 +1187,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
 		se = cfs_rq->last;
 
+	/*
+	 * Someone really wants this to run. If it's not unfair, run it.
+	 */
+	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+		se = cfs_rq->next;
+
 	clear_buddies(cfs_rq, se);
 
 	return se;
@@ -1333,52 +1369,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	hrtick_update(rq);
 }
 
-/*
- * sched_yield() support is very simple - we dequeue and enqueue.
- *
- * If compat_yield is turned on then we requeue to the end of the tree.
- */
-static void yield_task_fair(struct rq *rq)
-{
-	struct task_struct *curr = rq->curr;
-	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
-	struct sched_entity *rightmost, *se = &curr->se;
-
-	/*
-	 * Are we the only task in the tree?
-	 */
-	if (unlikely(rq->nr_running == 1))
-		return;
-
-	clear_buddies(cfs_rq, se);
-
-	if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
-		update_rq_clock(rq);
-		/*
-		 * Update run-time statistics of the 'current'.
-		 */
-		update_curr(cfs_rq);
-
-		return;
-	}
-	/*
-	 * Find the rightmost entry in the rbtree:
-	 */
-	rightmost = __pick_last_entity(cfs_rq);
-	/*
-	 * Already in the rightmost position?
-	 */
-	if (unlikely(!rightmost || entity_before(rightmost, se)))
-		return;
-
-	/*
-	 * Minimally necessary key value to be last in the tree:
-	 * Upon rescheduling, sched_class::put_prev_task() will place
-	 * 'current' within the tree based on its new key value.
-	 */
-	se->vruntime = rightmost->vruntime + 1;
-}
-
 #ifdef CONFIG_SMP
 
 static void task_waking_fair(struct rq *rq, struct task_struct *p)
@@ -1849,6 +1839,14 @@ static void set_next_buddy(struct sched_entity *se)
 	}
 }
 
+static void set_skip_buddy(struct sched_entity *se)
+{
+	if (likely(task_of(se)->policy != SCHED_IDLE)) {
+		for_each_sched_entity(se)
+			cfs_rq_of(se)->skip = se;
+	}
+}
+
 /*
  * Preempt the current task with a newly woken task if needed:
  */
@@ -1947,6 +1945,36 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
 	}
 }
 
+/*
+ * sched_yield() is very simple
+ *
+ * The magic of dealing with the ->skip buddy is in pick_next_entity.
+ */
+static void yield_task_fair(struct rq *rq)
+{
+	struct task_struct *curr = rq->curr;
+	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+	struct sched_entity *se = &curr->se;
+
+	/*
+	 * Are we the only task in the tree?
+	 */
+	if (unlikely(rq->nr_running == 1))
+		return;
+
+	clear_buddies(cfs_rq, se);
+
+	if (curr->policy != SCHED_BATCH) {
+		update_rq_clock(rq);
+		/*
+		 * Update run-time statistics of the 'current'.
+		 */
+		update_curr(cfs_rq);
+	}
+
+	set_skip_buddy(se);
+}
+
 #ifdef CONFIG_SMP
 /**************************************************
  * Fair scheduling class load-balancing methods:
-- 
cgit v1.2.2


From d95f412200652694e63e64bfd49f0ae274a54479 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 1 Feb 2011 09:50:51 -0500
Subject: sched: Add yield_to(task, preempt) functionality

Currently only implemented for fair class tasks.

Add a yield_to_task method() to the fair scheduling class. allowing the
caller of yield_to() to accelerate another thread in it's thread group,
task group.

Implemented via a scheduler hint, using cfs_rq->next to encourage the
target being selected.  We can rely on pick_next_entity to keep things
fair, so noone can accelerate a thread that has already used its fair
share of CPU time.

This also means callers should only call yield_to when they really
mean it.  Calling it too often can result in the scheduler just
ignoring the hint.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110201095051.4ddb7738@annuminas.surriel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c0fbeb992833..027024694043 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1975,6 +1975,25 @@ static void yield_task_fair(struct rq *rq)
 	set_skip_buddy(se);
 }
 
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
+{
+	struct sched_entity *se = &p->se;
+
+	if (!se->on_rq)
+		return false;
+
+	/* Tell the scheduler that we'd really like pse to run next. */
+	set_next_buddy(se);
+
+	/* Make p's CPU reschedule; pick_next_entity takes care of fairness. */
+	if (preempt)
+		resched_task(rq->curr);
+
+	yield_task_fair(rq);
+
+	return true;
+}
+
 #ifdef CONFIG_SMP
 /**************************************************
  * Fair scheduling class load-balancing methods:
@@ -4243,6 +4262,7 @@ static const struct sched_class fair_sched_class = {
 	.enqueue_task		= enqueue_task_fair,
 	.dequeue_task		= dequeue_task_fair,
 	.yield_task		= yield_task_fair,
+	.yield_to_task		= yield_to_task_fair,
 
 	.check_preempt_curr	= check_preempt_wakeup,
 
-- 
cgit v1.2.2


From 46e49b3836c7cd2ae5b5fe76fa981d0d292a52fe Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venki@google.com>
Date: Mon, 14 Feb 2011 14:38:50 -0800
Subject: sched: Wholesale removal of sd_idle logic

sd_idle logic was introduced way back in 2005 (commit 5969fe06),
as an HT optimization.

As per the discussion in the thread here:

  lkml - sched: Resolve sd_idle and first_idle_cpu Catch-22 - v1
  https://patchwork.kernel.org/patch/532501/

The capacity based logic in the load balancer right now handles this
in a much cleaner way, handling more than 2 SMT siblings etc, and sd_idle
does not seem to bring any additional benefits. sd_idle logic also has
some bugs that has performance impact. Here is the patch that removes
the sd_idle logic altogether.

Also, there was a dependency of sched_mc_power_savings == 2, with sd_idle
logic.

Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Acked-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1297723130-693-1-git-send-email-venki@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 53 +++++++++++------------------------------------------
 1 file changed, 11 insertions(+), 42 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 027024694043..d384e739ea95 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2672,7 +2672,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
  * @this_cpu: Cpu for which load balance is currently performed.
  * @idle: Idle status of this_cpu
  * @load_idx: Load index of sched_domain of this_cpu for load calc.
- * @sd_idle: Idle status of the sched_domain containing group.
  * @local_group: Does group contain this_cpu.
  * @cpus: Set of cpus considered for load balancing.
  * @balance: Should we balance.
@@ -2680,7 +2679,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
  */
 static inline void update_sg_lb_stats(struct sched_domain *sd,
 			struct sched_group *group, int this_cpu,
-			enum cpu_idle_type idle, int load_idx, int *sd_idle,
+			enum cpu_idle_type idle, int load_idx,
 			int local_group, const struct cpumask *cpus,
 			int *balance, struct sg_lb_stats *sgs)
 {
@@ -2700,9 +2699,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	for_each_cpu_and(i, sched_group_cpus(group), cpus) {
 		struct rq *rq = cpu_rq(i);
 
-		if (*sd_idle && rq->nr_running)
-			*sd_idle = 0;
-
 		/* Bias balancing toward cpus of our domain */
 		if (local_group) {
 			if (idle_cpu(i) && !first_idle_cpu) {
@@ -2817,15 +2813,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
  * @sd: sched_domain whose statistics are to be updated.
  * @this_cpu: Cpu for which load balance is currently performed.
  * @idle: Idle status of this_cpu
- * @sd_idle: Idle status of the sched_domain containing sg.
  * @cpus: Set of cpus considered for load balancing.
  * @balance: Should we balance.
  * @sds: variable to hold the statistics for this sched_domain.
  */
 static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
-			enum cpu_idle_type idle, int *sd_idle,
-			const struct cpumask *cpus, int *balance,
-			struct sd_lb_stats *sds)
+			enum cpu_idle_type idle, const struct cpumask *cpus,
+			int *balance, struct sd_lb_stats *sds)
 {
 	struct sched_domain *child = sd->child;
 	struct sched_group *sg = sd->groups;
@@ -2843,7 +2837,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 
 		local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
 		memset(&sgs, 0, sizeof(sgs));
-		update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
+		update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
 				local_group, cpus, balance, &sgs);
 
 		if (local_group && !(*balance))
@@ -3095,7 +3089,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
  * @imbalance: Variable which stores amount of weighted load which should
  *		be moved to restore balance/put a group to idle.
  * @idle: The idle status of this_cpu.
- * @sd_idle: The idleness of sd
  * @cpus: The set of CPUs under consideration for load-balancing.
  * @balance: Pointer to a variable indicating if this_cpu
  *	is the appropriate cpu to perform load balancing at this_level.
@@ -3108,7 +3101,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 static struct sched_group *
 find_busiest_group(struct sched_domain *sd, int this_cpu,
 		   unsigned long *imbalance, enum cpu_idle_type idle,
-		   int *sd_idle, const struct cpumask *cpus, int *balance)
+		   const struct cpumask *cpus, int *balance)
 {
 	struct sd_lb_stats sds;
 
@@ -3118,8 +3111,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	 * Compute the various statistics relavent for load balancing at
 	 * this level.
 	 */
-	update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
-					balance, &sds);
+	update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
 
 	/* Cases where imbalance does not exist from POV of this_cpu */
 	/* 1) this_cpu is not the appropriate cpu to perform load balancing
@@ -3255,7 +3247,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
 /* Working cpumask for load_balance and load_balance_newidle. */
 static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
 
-static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
+static int need_active_balance(struct sched_domain *sd, int idle,
 			       int busiest_cpu, int this_cpu)
 {
 	if (idle == CPU_NEWLY_IDLE) {
@@ -3287,10 +3279,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
 		 * move_tasks() will succeed.  ld_moved will be true and this
 		 * active balance code will not be triggered.
 		 */
-		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-		    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-			return 0;
-
 		if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
 			return 0;
 	}
@@ -3308,7 +3296,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 			struct sched_domain *sd, enum cpu_idle_type idle,
 			int *balance)
 {
-	int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
+	int ld_moved, all_pinned = 0, active_balance = 0;
 	struct sched_group *group;
 	unsigned long imbalance;
 	struct rq *busiest;
@@ -3317,20 +3305,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
 	cpumask_copy(cpus, cpu_active_mask);
 
-	/*
-	 * When power savings policy is enabled for the parent domain, idle
-	 * sibling can pick up load irrespective of busy siblings. In this case,
-	 * let the state of idle sibling percolate up as CPU_IDLE, instead of
-	 * portraying it as CPU_NOT_IDLE.
-	 */
-	if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
-	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		sd_idle = 1;
-
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
-	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
+	group = find_busiest_group(sd, this_cpu, &imbalance, idle,
 				   cpus, balance);
 
 	if (*balance == 0)
@@ -3392,8 +3370,7 @@ redo:
 		if (idle != CPU_NEWLY_IDLE)
 			sd->nr_balance_failed++;
 
-		if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
-					this_cpu)) {
+		if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
 			raw_spin_lock_irqsave(&busiest->lock, flags);
 
 			/* don't kick the active_load_balance_cpu_stop,
@@ -3448,10 +3425,6 @@ redo:
 			sd->balance_interval *= 2;
 	}
 
-	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		ld_moved = -1;
-
 	goto out;
 
 out_balanced:
@@ -3465,11 +3438,7 @@ out_one_pinned:
 			(sd->balance_interval < sd->max_interval))
 		sd->balance_interval *= 2;
 
-	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		ld_moved = -1;
-	else
-		ld_moved = 0;
+	ld_moved = 0;
 out:
 	return ld_moved;
 }
-- 
cgit v1.2.2


From c186fafe9aba87c1a93df8c7120a6ae01fe435ad Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 21 Feb 2011 18:52:53 +0100
Subject: sched: Clean up remnants of sd_idle

With the wholesale removal of the sd_idle SMT logic we can clean up
some more.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Nikhil Rao <ncrao@google.com>
Cc: Venkatesh Pallipadi <venki@google.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index d384e739ea95..cd18600a8a63 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3150,25 +3150,23 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (sds.this_load >= sds.avg_load)
 		goto out_balanced;
 
-	/*
-	 * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
-	 * And to check for busy balance use !idle_cpu instead of
-	 * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
-	 * even when they are idle.
-	 */
-	if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
-		if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
-			goto out_balanced;
-	} else {
+	if (idle == CPU_IDLE) {
 		/*
 		 * This cpu is idle. If the busiest group load doesn't
 		 * have more tasks than the number of available cpu's and
 		 * there is no imbalance between this and busiest group
 		 * wrt to idle cpu's, it is balanced.
 		 */
-		if ((sds.this_idle_cpus  <= sds.busiest_idle_cpus + 1) &&
+		if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
 		    sds.busiest_nr_running <= sds.busiest_group_weight)
 			goto out_balanced;
+	} else {
+		/*
+		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
+		 * imbalance_pct to be conservative.
+		 */
+		if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+			goto out_balanced;
 	}
 
 force_balance:
@@ -3862,8 +3860,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 			if (load_balance(cpu, rq, sd, idle, &balance)) {
 				/*
 				 * We've pulled tasks over so either we're no
-				 * longer idle, or one of our SMT siblings is
-				 * not idle.
+				 * longer idle.
 				 */
 				idle = CPU_NOT_IDLE;
 			}
-- 
cgit v1.2.2


From cc57aa8f4b3bece8c26c7929728edcc5fa6b5aed Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 21 Feb 2011 18:55:32 +0100
Subject: sched: Clean up some f_b_g() comments

The existing comment tends to grow state (as it already has), split it
up and place it near the actual tests.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Nikhil Rao <ncrao@google.com>
Cc: Venkatesh Pallipadi <venki@google.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index cd18600a8a63..03496ebc4553 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3113,19 +3113,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	 */
 	update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
 
-	/* Cases where imbalance does not exist from POV of this_cpu */
-	/* 1) this_cpu is not the appropriate cpu to perform load balancing
-	 *    at this level.
-	 * 2) There is no busy sibling group to pull from.
-	 * 3) This group is the busiest group.
-	 * 4) This group is more busy than the avg busieness at this
-	 *    sched_domain.
-	 * 5) The imbalance is within the specified limit.
-	 *
-	 * Note: when doing newidle balance, if the local group has excess
-	 * capacity (i.e. nr_running < group_capacity) and the busiest group
-	 * does not have any capacity, we force a load balance to pull tasks
-	 * to the local group. In this case, we skip past checks 3, 4 and 5.
+	/*
+	 * this_cpu is not the appropriate cpu to perform load balancing at
+	 * this level.
 	 */
 	if (!(*balance))
 		goto ret;
@@ -3134,19 +3124,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	    check_asym_packing(sd, &sds, this_cpu, imbalance))
 		return sds.busiest;
 
+	/* There is no busy sibling group to pull tasks from */
 	if (!sds.busiest || sds.busiest_nr_running == 0)
 		goto out_balanced;
 
-	/*  SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
 	if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
 			!sds.busiest_has_capacity)
 		goto force_balance;
 
+	/*
+	 * If the local group is more busy than the selected busiest group
+	 * don't try and pull any tasks.
+	 */
 	if (sds.this_load >= sds.max_load)
 		goto out_balanced;
 
+	/*
+	 * Don't pull any tasks if this group is already above the domain
+	 * average load.
+	 */
 	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
-
 	if (sds.this_load >= sds.avg_load)
 		goto out_balanced;
 
-- 
cgit v1.2.2


From 866ab43efd325fae8889ea77a744d03f2b957e38 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 21 Feb 2011 18:56:47 +0100
Subject: sched: Fix the group_imb logic

On a 2*6*2 machine something like:

 taskset -c 3-11 bash -c 'for ((i=0;i<9;i++)) do while :; do :; done & done'

_should_ result in 9 busy CPUs, each running 1 task.

However it didn't quite work reliably, most of the time one cpu of the
second socket (6-11) would be idle and one cpu of the first socket
(0-5) would have two tasks on it.

The group_imb logic is supposed to deal with this and detect when a
particular group is imbalanced (like in our case, 0-2 are idle but 3-5
will have 4 tasks on it).

The detection phase needed a bit of a tweak as it was too weak and
required more than 2 avg weight tasks difference between idle and busy
cpus in the group which won't trigger for our test-case. So cure that
to be one or more avg task weight difference between cpus.

Once the detection phase worked, it was then defeated by the f_b_g()
tests trying to avoid ping-pongs. In particular, this_load >= max_load
triggered because the pulling cpu (the (first) idle cpu in on the
second socket, say 6) would find this_load to be 5 and max_load to be
4 (there'd be 5 tasks running on our socket and only 4 on the other
socket).

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Nikhil Rao <ncrao@google.com>
Cc: Venkatesh Pallipadi <venki@google.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 03496ebc4553..3a88dee165c0 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2743,7 +2743,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 
 	/*
 	 * Consider the group unbalanced when the imbalance is larger
-	 * than the average weight of two tasks.
+	 * than the average weight of a task.
 	 *
 	 * APZ: with cgroup the avg task weight can vary wildly and
 	 *      might not be a suitable number - should we keep a
@@ -2753,7 +2753,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	if (sgs->sum_nr_running)
 		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
-	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
+	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
 		sgs->group_imb = 1;
 
 	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
@@ -3128,6 +3128,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (!sds.busiest || sds.busiest_nr_running == 0)
 		goto out_balanced;
 
+	/*
+	 * If the busiest group is imbalanced the below checks don't
+	 * work because they assumes all things are equal, which typically
+	 * isn't true due to cpus_allowed constraints and the like.
+	 */
+	if (sds.group_imb)
+		goto force_balance;
+
 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
 	if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
 			!sds.busiest_has_capacity)
-- 
cgit v1.2.2


From a2f5c9ab79f78e8b91ac993e0543d65b661dd19b Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhart@linux.intel.com>
Date: Tue, 22 Feb 2011 13:04:33 -0800
Subject: sched: Allow SCHED_BATCH to preempt SCHED_IDLE tasks

Perform the test for SCHED_IDLE before testing for SCHED_BATCH (and
ensure idle tasks don't preempt idle tasks) so the non-interactive,
but still important, SCHED_BATCH tasks will run in favor of the very
low priority SCHED_IDLE tasks.

Signed-off-by: Darren Hart <dvhart@linux.intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Mike Galbraith <efault@gmx.de>
Cc: Richard Purdie <richard.purdie@linuxfoundation.org>
LKML-Reference: <1298408674-3130-2-git-send-email-dvhart@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3a88dee165c0..1438e13cf8be 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1870,16 +1870,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	if (test_tsk_need_resched(curr))
 		return;
 
+	/* Idle tasks are by definition preempted by non-idle tasks. */
+	if (unlikely(curr->policy == SCHED_IDLE) &&
+	    likely(p->policy != SCHED_IDLE))
+		goto preempt;
+
 	/*
-	 * Batch and idle tasks do not preempt (their preemption is driven by
-	 * the tick):
+	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
+	 * is driven by the tick):
 	 */
 	if (unlikely(p->policy != SCHED_NORMAL))
 		return;
 
-	/* Idle tasks are by definition preempted by everybody. */
-	if (unlikely(curr->policy == SCHED_IDLE))
-		goto preempt;
 
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
-- 
cgit v1.2.2


From 6d1cafd8b56ea726c10a5a104de57cc3ed8fa953 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venki@google.com>
Date: Tue, 1 Mar 2011 16:28:21 -0800
Subject: sched: Resched proper CPU on yield_to()

yield_to_task_fair() has code to resched the CPU of yielding task when the
intention is to resched the CPU of the task that is being yielded to.

Change here fixes the problem and also makes the resched conditional on
rq != p_rq.

Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1299025701-22168-1-git-send-email-venki@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 1438e13cf8be..3f7ec9e27ee1 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1987,10 +1987,6 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 	/* Tell the scheduler that we'd really like pse to run next. */
 	set_next_buddy(se);
 
-	/* Make p's CPU reschedule; pick_next_entity takes care of fairness. */
-	if (preempt)
-		resched_task(rq->curr);
-
 	yield_task_fair(rq);
 
 	return true;
-- 
cgit v1.2.2


From 3436ae1298cb22d722a6520fc97f112dd767a9e1 Mon Sep 17 00:00:00 2001
From: Sisir Koppaka <sisir.koppaka@gmail.com>
Date: Sat, 26 Mar 2011 18:22:55 +0530
Subject: sched: Fix rebalance interval calculation

The interval for checking scheduling domains if they are due to be
balanced currently depends on boot state NR_CPUS, which may not
accurately reflect the number of online CPUs at the time of check.

Thus replace NR_CPUS with num_online_cpus().

 (ed: Should only affect those who set NR_CPUS really high, such as 4096
      or so :-)

Signed-off-by: Sisir Koppaka <sisir.koppaka@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <AANLkTikqHWid2Q93F5U5Qw5snJH8C5PXoa7J6=6hYO94@mail.gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3f7ec9e27ee1..c7ec5c8e7b44 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -22,6 +22,7 @@
 
 #include <linux/latencytop.h>
 #include <linux/sched.h>
+#include <linux/cpumask.h>
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
@@ -3850,8 +3851,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 		interval = msecs_to_jiffies(interval);
 		if (unlikely(!interval))
 			interval = 1;
-		if (interval > HZ*NR_CPUS/10)
-			interval = HZ*NR_CPUS/10;
+		if (interval > HZ*num_online_cpus()/10)
+			interval = HZ*num_online_cpus()/10;
 
 		need_serialize = sd->flags & SD_SERIALIZE;
 
-- 
cgit v1.2.2


From 25985edcedea6396277003854657b5f3cb31a628 Mon Sep 17 00:00:00 2001
From: Lucas De Marchi <lucas.demarchi@profusion.mobi>
Date: Wed, 30 Mar 2011 22:57:33 -0300
Subject: Fix common misspellings

Fixes generated by 'codespell' and manually reviewed.

Signed-off-by: Lucas De Marchi <lucas.demarchi@profusion.mobi>
---
 kernel/sched_fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3f7ec9e27ee1..3cb7f07887a1 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3061,7 +3061,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 
 	/*
 	 * if *imbalance is less than the average load per runnable task
-	 * there is no gaurantee that any tasks will be moved so we'll have
+	 * there is no guarantee that any tasks will be moved so we'll have
 	 * a think about bumping its value to force at least one task to be
 	 * moved
 	 */
-- 
cgit v1.2.2


From 49c022e657fbe661460d191fbe776a387132e2b3 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 5 Apr 2011 10:14:25 +0200
Subject: sched: Clean up rebalance_domains() load-balance interval calculation

Instead of the possible multiple-evaluation of num_online_cpus()
in rebalance_domains() that Linus reported, avoid it altogether
in the normal case since it's implemented with a Hamming weight
function over a cpu bitmask which can be darn expensive for those
with big iron.

This also makes it cleaner, smaller and documents the code.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1301991265.2225.12.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c7ec5c8e7b44..80ecd09452e0 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3820,6 +3820,17 @@ void select_nohz_load_balancer(int stop_tick)
 
 static DEFINE_SPINLOCK(balancing);
 
+static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+
+/*
+ * Scale the max load_balance interval with the number of CPUs in the system.
+ * This trades load-balance latency on larger machines for less cross talk.
+ */
+static void update_max_interval(void)
+{
+	max_load_balance_interval = HZ*num_online_cpus()/10;
+}
+
 /*
  * It checks each scheduling domain to see if it is due to be balanced,
  * and initiates a balancing operation if so.
@@ -3849,10 +3860,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 
 		/* scale ms to jiffies */
 		interval = msecs_to_jiffies(interval);
-		if (unlikely(!interval))
-			interval = 1;
-		if (interval > HZ*num_online_cpus()/10)
-			interval = HZ*num_online_cpus()/10;
+		interval = clamp(interval, 1UL, max_load_balance_interval);
 
 		need_serialize = sd->flags & SD_SERIALIZE;
 
-- 
cgit v1.2.2


From b0432d8f162c7d5d9537b4cb749d44076b76a783 Mon Sep 17 00:00:00 2001
From: Ken Chen <kenchen@google.com>
Date: Thu, 7 Apr 2011 17:23:22 -0700
Subject: sched: Fix sched-domain avg_load calculation

In function find_busiest_group(), the sched-domain avg_load isn't
calculated at all if there is a group imbalance within the domain. This
will cause erroneous imbalance calculation.

The reason is that calculate_imbalance() sees sds->avg_load = 0 and it
will dump entire sds->max_load into imbalance variable, which is used
later on to migrate entire load from busiest CPU to the puller CPU.

This has two really bad effect:

1. stampede of task migration, and they won't be able to break out
   of the bad state because of positive feedback loop: large load
   delta -> heavier load migration -> larger imbalance and the cycle
   goes on.

2. severe imbalance in CPU queue depth.  This causes really long
   scheduling latency blip which affects badly on application that
   has tight latency requirement.

The fix is to have kernel calculate domain avg_load in both cases. This
will ensure that imbalance calculation is always sensible and the target
is usually half way between busiest and puller CPU.

Signed-off-by: Ken Chen <kenchen@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/r/20110408002322.3A0D812217F@elm.corp.google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 7f00772e57c9..60f9d407c5ec 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3127,6 +3127,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (!sds.busiest || sds.busiest_nr_running == 0)
 		goto out_balanced;
 
+	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
+
 	/*
 	 * If the busiest group is imbalanced the below checks don't
 	 * work because they assumes all things are equal, which typically
@@ -3151,7 +3153,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	 * Don't pull any tasks if this group is already above the domain
 	 * average load.
 	 */
-	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
 	if (sds.this_load >= sds.avg_load)
 		goto out_balanced;
 
-- 
cgit v1.2.2


From b30aef17f71cf9e24b10c11cbb5e5f0ebe8a85ab Mon Sep 17 00:00:00 2001
From: Ken Chen <kenchen@google.com>
Date: Fri, 8 Apr 2011 12:20:16 -0700
Subject: sched: Fix erroneous all_pinned logic

The scheduler load balancer has specific code to deal with cases of
unbalanced system due to lots of unmovable tasks (for example because of
hard CPU affinity). In those situation, it excludes the busiest CPU that
has pinned tasks for load balance consideration such that it can perform
second 2nd load balance pass on the rest of the system.

This all works as designed if there is only one cgroup in the system.

However, when we have multiple cgroups, this logic has false positives and
triggers multiple load balance passes despite there are actually no pinned
tasks at all.

The reason it has false positives is that the all pinned logic is deep in
the lowest function of can_migrate_task() and is too low level:

load_balance_fair() iterates each task group and calls balance_tasks() to
migrate target load. Along the way, balance_tasks() will also set a
all_pinned variable. Given that task-groups are iterated, this all_pinned
variable is essentially the status of last group in the scanning process.
Task group can have number of reasons that no load being migrated, none
due to cpu affinity. However, this status bit is being propagated back up
to the higher level load_balance(), which incorrectly think that no tasks
were moved.  It kick off the all pinned logic and start multiple passes
attempt to move load onto puller CPU.

To fix this, move the all_pinned aggregation up at the iterator level.
This ensures that the status is aggregated over all task-groups, not just
last one in the list.

Signed-off-by: Ken Chen <kenchen@google.com>
Cc: stable@kernel.org
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/BANLkTi=ernzNawaR5tJZEsV_QVnfxqXmsQ@mail.gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 60f9d407c5ec..6fa833ab2cb8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2104,21 +2104,20 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	      enum cpu_idle_type idle, int *all_pinned,
 	      int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
 {
-	int loops = 0, pulled = 0, pinned = 0;
+	int loops = 0, pulled = 0;
 	long rem_load_move = max_load_move;
 	struct task_struct *p, *n;
 
 	if (max_load_move == 0)
 		goto out;
 
-	pinned = 1;
-
 	list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
 		if (loops++ > sysctl_sched_nr_migrate)
 			break;
 
 		if ((p->se.load.weight >> 1) > rem_load_move ||
-		    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
+		    !can_migrate_task(p, busiest, this_cpu, sd, idle,
+				      all_pinned))
 			continue;
 
 		pull_task(busiest, p, this_rq, this_cpu);
@@ -2153,9 +2152,6 @@ out:
 	 */
 	schedstat_add(sd, lb_gained[idle], pulled);
 
-	if (all_pinned)
-		*all_pinned = pinned;
-
 	return max_load_move - rem_load_move;
 }
 
@@ -3341,6 +3337,7 @@ redo:
 		 * still unbalanced. ld_moved simply stays zero, so it is
 		 * correctly treated as an imbalance.
 		 */
+		all_pinned = 1;
 		local_irq_save(flags);
 		double_rq_lock(this_rq, busiest);
 		ld_moved = move_tasks(this_rq, this_cpu, busiest,
-- 
cgit v1.2.2


From f4ad9bd208c98f32a6f9136618e0b8bebe3fb370 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Fri, 8 Apr 2011 12:53:09 +0800
Subject: sched: Eliminate dead code from wakeup_gran()

calc_delta_fair() checks NICE_0_LOAD already, delete duplicate check.

Signed-off-by: Shaohua Li<shaohua.li@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Link: http://lkml.kernel.org/r/1302238389.3981.92.camel@sli10-conroe
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 6fa833ab2cb8..4ee50f0af8d1 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1789,10 +1789,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
 	 * This is especially important for buddies when the leftmost
 	 * task is higher priority than the buddy.
 	 */
-	if (unlikely(se->load.weight != NICE_0_LOAD))
-		gran = calc_delta_fair(gran, se);
-
-	return gran;
+	return calc_delta_fair(gran, se);
 }
 
 /*
-- 
cgit v1.2.2


From dce840a08702bd13a9a186e07e63d1ef82256b5e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 7 Apr 2011 14:09:50 +0200
Subject: sched: Dynamically allocate sched_domain/sched_group data-structures

Instead of relying on static allocations for the sched_domain and
sched_group trees, dynamically allocate and RCU free them.

Allocating this dynamically also allows for some build_sched_groups()
simplification since we can now (like with other simplifications) rely
on the sched_domain tree instead of hard-coded knowledge.

One tricky to note is that detach_destroy_domains() needs to hold
rcu_read_lock() over the entire tear-down, per-cpu is not sufficient
since that can lead to partial sched_group existance (could possibly
be solved by doing the tear-down backwards but this is much more
robust).

A concequence of the above is that we can no longer print the
sched_domain debug stuff from cpu_attach_domain() since that might now
run with preemption disabled (due to classic RCU etc.) and
sched_domain_debug() does some GFP_KERNEL allocations.

Another thing to note is that we now fully rely on normal RCU and not
RCU-sched, this is because with the new and exiting RCU flavours we
grew over the years BH doesn't necessarily hold off RCU-sched grace
periods (-rt is known to break this). This would in fact already cause
us grief since we do sched_domain/sched_group iterations from softirq
context.

This patch is somewhat larger than I would like it to be, but I didn't
find any means of shrinking/splitting this.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/20110407122942.245307941@chello.nl
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 4ee50f0af8d1..4a8ac7c2a18e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1622,6 +1622,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	/*
 	 * Otherwise, iterate the domains and find an elegible idle cpu.
 	 */
+	rcu_read_lock();
 	for_each_domain(target, sd) {
 		if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
 			break;
@@ -1641,6 +1642,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
 		    cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
 			break;
 	}
+	rcu_read_unlock();
 
 	return target;
 }
@@ -1673,6 +1675,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 		new_cpu = prev_cpu;
 	}
 
+	rcu_read_lock();
 	for_each_domain(cpu, tmp) {
 		if (!(tmp->flags & SD_LOAD_BALANCE))
 			continue;
@@ -1723,9 +1726,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 
 	if (affine_sd) {
 		if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
-			return select_idle_sibling(p, cpu);
-		else
-			return select_idle_sibling(p, prev_cpu);
+			prev_cpu = cpu;
+
+		new_cpu = select_idle_sibling(p, prev_cpu);
+		goto unlock;
 	}
 
 	while (sd) {
@@ -1766,6 +1770,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 		}
 		/* while loop will break here if sd == NULL */
 	}
+unlock:
+	rcu_read_unlock();
 
 	return new_cpu;
 }
@@ -3462,6 +3468,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 	raw_spin_unlock(&this_rq->lock);
 
 	update_shares(this_cpu);
+	rcu_read_lock();
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
 		int balance = 1;
@@ -3483,6 +3490,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 			break;
 		}
 	}
+	rcu_read_unlock();
 
 	raw_spin_lock(&this_rq->lock);
 
@@ -3531,6 +3539,7 @@ static int active_load_balance_cpu_stop(void *data)
 	double_lock_balance(busiest_rq, target_rq);
 
 	/* Search for an sd spanning us and the target CPU. */
+	rcu_read_lock();
 	for_each_domain(target_cpu, sd) {
 		if ((sd->flags & SD_LOAD_BALANCE) &&
 		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
@@ -3546,6 +3555,7 @@ static int active_load_balance_cpu_stop(void *data)
 		else
 			schedstat_inc(sd, alb_failed);
 	}
+	rcu_read_unlock();
 	double_unlock_balance(busiest_rq, target_rq);
 out_unlock:
 	busiest_rq->active_balance = 0;
@@ -3672,6 +3682,7 @@ static int find_new_ilb(int cpu)
 {
 	struct sched_domain *sd;
 	struct sched_group *ilb_group;
+	int ilb = nr_cpu_ids;
 
 	/*
 	 * Have idle load balancer selection from semi-idle packages only
@@ -3687,20 +3698,25 @@ static int find_new_ilb(int cpu)
 	if (cpumask_weight(nohz.idle_cpus_mask) < 2)
 		goto out_done;
 
+	rcu_read_lock();
 	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
 		ilb_group = sd->groups;
 
 		do {
-			if (is_semi_idle_group(ilb_group))
-				return cpumask_first(nohz.grp_idle_mask);
+			if (is_semi_idle_group(ilb_group)) {
+				ilb = cpumask_first(nohz.grp_idle_mask);
+				goto unlock;
+			}
 
 			ilb_group = ilb_group->next;
 
 		} while (ilb_group != sd->groups);
 	}
+unlock:
+	rcu_read_unlock();
 
 out_done:
-	return nr_cpu_ids;
+	return ilb;
 }
 #else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
 static inline int find_new_ilb(int call_cpu)
@@ -3845,6 +3861,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 
 	update_shares(cpu);
 
+	rcu_read_lock();
 	for_each_domain(cpu, sd) {
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;
@@ -3890,6 +3907,7 @@ out:
 		if (!balance)
 			break;
 	}
+	rcu_read_unlock();
 
 	/*
 	 * next_balance will be updated only when there is a need.
-- 
cgit v1.2.2


From a6c75f2f8d988ecfecf971f98f1cb6fc4de522fe Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 7 Apr 2011 14:09:52 +0200
Subject: sched: Avoid using sd->level

Don't use sd->level for identifying properties of the domain.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/20110407122942.350174079@chello.nl
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 4a8ac7c2a18e..9c5679cfe3b0 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2651,7 +2651,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
 	/*
 	 * Only siblings can have significantly less than SCHED_LOAD_SCALE
 	 */
-	if (sd->level != SD_LV_SIBLING)
+	if (!(sd->flags & SD_SHARE_CPUPOWER))
 		return 0;
 
 	/*
-- 
cgit v1.2.2


From 7608dec2ce2004c234339bef8c8074e5e601d0e9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 5 Apr 2011 17:23:46 +0200
Subject: sched: Drop the rq argument to sched_class::select_task_rq()

In preparation of calling select_task_rq() without rq->lock held, drop
the dependency on the rq argument.

Reviewed-by: Frank Rowand <frank.rowand@am.sony.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/20110405152729.031077745@chello.nl
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 4ee50f0af8d1..96b2c95ac356 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1657,7 +1657,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
  * preempt must be disabled.
  */
 static int
-select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
 {
 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
 	int cpu = smp_processor_id();
-- 
cgit v1.2.2


From 74f8e4b2335de45485b8d5b31a504747f13c8070 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 5 Apr 2011 17:23:47 +0200
Subject: sched: Remove rq argument to sched_class::task_waking()

In preparation of calling this without rq->lock held, remove the
dependency on the rq argument.

Reviewed-by: Frank Rowand <frank.rowand@am.sony.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/20110405152729.071474242@chello.nl
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 96b2c95ac356..ad4c414f456d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1372,11 +1372,13 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 #ifdef CONFIG_SMP
 
-static void task_waking_fair(struct rq *rq, struct task_struct *p)
+static void task_waking_fair(struct task_struct *p)
 {
 	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
+	lockdep_assert_held(&task_rq(p)->lock);
+
 	se->vruntime -= cfs_rq->min_vruntime;
 }
 
-- 
cgit v1.2.2


From 3fe1698b7fe05aeb063564e71e40d09f28d8e80c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 5 Apr 2011 17:23:48 +0200
Subject: sched: Deal with non-atomic min_vruntime reads on 32bits

In order to avoid reading partial updated min_vruntime values on 32bit
implement a seqcount like solution.

Reviewed-by: Frank Rowand <frank.rowand@am.sony.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/20110405152729.111378493@chello.nl
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ad4c414f456d..054cebb81f7b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -358,6 +358,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
 	}
 
 	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+#ifndef CONFIG_64BIT
+	smp_wmb();
+	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
+#endif
 }
 
 /*
@@ -1376,10 +1380,21 @@ static void task_waking_fair(struct task_struct *p)
 {
 	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	u64 min_vruntime;
 
-	lockdep_assert_held(&task_rq(p)->lock);
+#ifndef CONFIG_64BIT
+	u64 min_vruntime_copy;
 
-	se->vruntime -= cfs_rq->min_vruntime;
+	do {
+		min_vruntime_copy = cfs_rq->min_vruntime_copy;
+		smp_rmb();
+		min_vruntime = cfs_rq->min_vruntime;
+	} while (min_vruntime != min_vruntime_copy);
+#else
+	min_vruntime = cfs_rq->min_vruntime;
+#endif
+
+	se->vruntime -= min_vruntime;
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-- 
cgit v1.2.2


From 69c80f3e9d3c569f8a3cee94ba1a324b5a7fa6b9 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venki@google.com>
Date: Wed, 13 Apr 2011 18:21:09 -0700
Subject: sched: Make set_*_buddy() work on non-task entities

Make set_*_buddy() work on non-task sched_entity, to facilitate the
use of next_buddy to cache a group entity in cases where one of the
tasks within that entity sleeps or gets preempted.

set_skip_buddy() was incorrectly comparing the policy of task that is
yielding to be not equal to SCHED_IDLE. Yielding should happen even
when task yielding is SCHED_IDLE. This change removes the policy check
on the yielding task.

Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1302744070-30079-2-git-send-email-venki@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 87445931a179..501ab637b94b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1846,26 +1846,26 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
 
 static void set_last_buddy(struct sched_entity *se)
 {
-	if (likely(task_of(se)->policy != SCHED_IDLE)) {
-		for_each_sched_entity(se)
-			cfs_rq_of(se)->last = se;
-	}
+	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+		return;
+
+	for_each_sched_entity(se)
+		cfs_rq_of(se)->last = se;
 }
 
 static void set_next_buddy(struct sched_entity *se)
 {
-	if (likely(task_of(se)->policy != SCHED_IDLE)) {
-		for_each_sched_entity(se)
-			cfs_rq_of(se)->next = se;
-	}
+	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+		return;
+
+	for_each_sched_entity(se)
+		cfs_rq_of(se)->next = se;
 }
 
 static void set_skip_buddy(struct sched_entity *se)
 {
-	if (likely(task_of(se)->policy != SCHED_IDLE)) {
-		for_each_sched_entity(se)
-			cfs_rq_of(se)->skip = se;
-	}
+	for_each_sched_entity(se)
+		cfs_rq_of(se)->skip = se;
 }
 
 /*
-- 
cgit v1.2.2


From 2f36825b176f67e5c5228aa33d828bc39718811f Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venki@google.com>
Date: Thu, 14 Apr 2011 10:30:53 -0700
Subject: sched: Next buddy hint on sleep and preempt path

When a task in a taskgroup sleeps, pick_next_task starts all the way back at
the root and picks the task/taskgroup with the min vruntime across all
runnable tasks.

But when there are many frequently sleeping tasks across different taskgroups,
it makes better sense to stay with same taskgroup for its slice period (or
until all tasks in the taskgroup sleeps) instead of switching cross taskgroup
on each sleep after a short runtime.

This helps specifically where taskgroups corresponds to a process with
multiple threads. The change reduces the number of CR3 switches in this case.

Example:

Two taskgroups with 2 threads each which are running for 2ms and
sleeping for 1ms. Looking at sched:sched_switch shows:

BEFORE: taskgroup_1 threads [5004, 5005], taskgroup_2 threads [5016, 5017]
      cpu-soaker-5004  [003]  3683.391089
      cpu-soaker-5016  [003]  3683.393106
      cpu-soaker-5005  [003]  3683.395119
      cpu-soaker-5017  [003]  3683.397130
      cpu-soaker-5004  [003]  3683.399143
      cpu-soaker-5016  [003]  3683.401155
      cpu-soaker-5005  [003]  3683.403168
      cpu-soaker-5017  [003]  3683.405170

AFTER: taskgroup_1 threads [21890, 21891], taskgroup_2 threads [21934, 21935]
      cpu-soaker-21890 [003]   865.895494
      cpu-soaker-21935 [003]   865.897506
      cpu-soaker-21934 [003]   865.899520
      cpu-soaker-21935 [003]   865.901532
      cpu-soaker-21934 [003]   865.903543
      cpu-soaker-21935 [003]   865.905546
      cpu-soaker-21891 [003]   865.907548
      cpu-soaker-21890 [003]   865.909560
      cpu-soaker-21891 [003]   865.911571
      cpu-soaker-21890 [003]   865.913582
      cpu-soaker-21891 [003]   865.915594
      cpu-soaker-21934 [003]   865.917606

Similar problem is there when there are multiple taskgroups and say a task A
preempts currently running task B of taskgroup_1. On schedule, pick_next_task
can pick an unrelated task on taskgroup_2. Here it would be better to give some
preference to task B on pick_next_task.

A simple (may be extreme case) benchmark I tried was tbench with 2 tbench
client processes with 2 threads each running on a single CPU. Avg throughput
across 5 50 sec runs was:

 BEFORE: 105.84 MB/sec
 AFTER:  112.42 MB/sec

Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1302802253-25760-1-git-send-email-venki@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 501ab637b94b..5280272cce3e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1344,6 +1344,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	hrtick_update(rq);
 }
 
+static void set_next_buddy(struct sched_entity *se);
+
 /*
  * The dequeue_task method is called before nr_running is
  * decreased. We remove the task from the rbtree and
@@ -1353,14 +1355,22 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
+	int task_sleep = flags & DEQUEUE_SLEEP;
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
 
 		/* Don't dequeue parent if it has other entities besides us */
-		if (cfs_rq->load.weight)
+		if (cfs_rq->load.weight) {
+			/*
+			 * Bias pick_next to pick a task from this cfs_rq, as
+			 * p is sleeping when it is within its sched_slice.
+			 */
+			if (task_sleep && parent_entity(se))
+				set_next_buddy(parent_entity(se));
 			break;
+		}
 		flags |= DEQUEUE_SLEEP;
 	}
 
@@ -1877,12 +1887,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	struct sched_entity *se = &curr->se, *pse = &p->se;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	int scale = cfs_rq->nr_running >= sched_nr_latency;
+	int next_buddy_marked = 0;
 
 	if (unlikely(se == pse))
 		return;
 
-	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
+	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
 		set_next_buddy(pse);
+		next_buddy_marked = 1;
+	}
 
 	/*
 	 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1910,8 +1923,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	update_curr(cfs_rq);
 	find_matching_se(&se, &pse);
 	BUG_ON(!pse);
-	if (wakeup_preempt_entity(se, pse) == 1)
+	if (wakeup_preempt_entity(se, pse) == 1) {
+		/*
+		 * Bias pick_next to pick the sched entity that is
+		 * triggering this preemption.
+		 */
+		if (!next_buddy_marked)
+			set_next_buddy(pse);
 		goto preempt;
+	}
 
 	return;
 
-- 
cgit v1.2.2


From 931aeeda0dca81152aec48f30be01e86a268bf89 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Tue, 3 May 2011 22:31:07 +0400
Subject: sched: Remove unused 'this_best_prio arg' from balance_tasks()

It's passed across multiple functions but is never really used, so
remove it.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1304447467-29200-1-git-send-email-vdavydov@parallels.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5280272cce3e..37f22626225e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2142,7 +2142,7 @@ static unsigned long
 balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	      unsigned long max_load_move, struct sched_domain *sd,
 	      enum cpu_idle_type idle, int *all_pinned,
-	      int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
+	      struct cfs_rq *busiest_cfs_rq)
 {
 	int loops = 0, pulled = 0;
 	long rem_load_move = max_load_move;
@@ -2180,9 +2180,6 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		 */
 		if (rem_load_move <= 0)
 			break;
-
-		if (p->prio < *this_best_prio)
-			*this_best_prio = p->prio;
 	}
 out:
 	/*
@@ -2242,7 +2239,7 @@ static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
-		  int *all_pinned, int *this_best_prio)
+		  int *all_pinned)
 {
 	long rem_load_move = max_load_move;
 	int busiest_cpu = cpu_of(busiest);
@@ -2267,7 +2264,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		rem_load = div_u64(rem_load, busiest_h_load + 1);
 
 		moved_load = balance_tasks(this_rq, this_cpu, busiest,
-				rem_load, sd, idle, all_pinned, this_best_prio,
+				rem_load, sd, idle, all_pinned,
 				busiest_cfs_rq);
 
 		if (!moved_load)
@@ -2293,11 +2290,11 @@ static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
-		  int *all_pinned, int *this_best_prio)
+		  int *all_pinned)
 {
 	return balance_tasks(this_rq, this_cpu, busiest,
 			max_load_move, sd, idle, all_pinned,
-			this_best_prio, &busiest->cfs);
+			&busiest->cfs);
 }
 #endif
 
@@ -2314,12 +2311,11 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		      int *all_pinned)
 {
 	unsigned long total_load_moved = 0, load_moved;
-	int this_best_prio = this_rq->curr->prio;
 
 	do {
 		load_moved = load_balance_fair(this_rq, this_cpu, busiest,
 				max_load_move - total_load_moved,
-				sd, idle, all_pinned, &this_best_prio);
+				sd, idle, all_pinned);
 
 		total_load_moved += load_moved;
 
-- 
cgit v1.2.2


From 1399fa7807a1a5998bbf147e80668e9950661dfa Mon Sep 17 00:00:00 2001
From: Nikhil Rao <ncrao@google.com>
Date: Wed, 18 May 2011 10:09:39 -0700
Subject: sched: Introduce SCHED_POWER_SCALE to scale cpu_power calculations

SCHED_LOAD_SCALE is used to increase nice resolution and to
scale cpu_power calculations in the scheduler. This patch
introduces SCHED_POWER_SCALE and converts all uses of
SCHED_LOAD_SCALE for scaling cpu_power to use SCHED_POWER_SCALE
instead.

This is a preparatory patch for increasing the resolution of
SCHED_LOAD_SCALE, and there is no need to increase resolution
for cpu_power calculations.

Signed-off-by: Nikhil Rao <ncrao@google.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Nikunj A. Dadhania <nikunj@linux.vnet.ibm.com>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Stephan Barwolf <stephan.baerwolf@tu-ilmenau.de>
Cc: Mike Galbraith <efault@gmx.de>
Link: http://lkml.kernel.org/r/1305738580-9924-3-git-send-email-ncrao@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 52 +++++++++++++++++++++++++++-------------------------
 1 file changed, 27 insertions(+), 25 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 37f22626225e..e32a9b70ee9c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1584,7 +1584,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 		}
 
 		/* Adjust by relative CPU power of the group */
-		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+		avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power;
 
 		if (local_group) {
 			this_load = avg_load;
@@ -1722,7 +1722,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
 				nr_running += cpu_rq(i)->cfs.nr_running;
 			}
 
-			capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
+			capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
 
 			if (tmp->flags & SD_POWERSAVINGS_BALANCE)
 				nr_running /= 2;
@@ -2570,7 +2570,7 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
 
 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
 {
-	return SCHED_LOAD_SCALE;
+	return SCHED_POWER_SCALE;
 }
 
 unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
@@ -2607,10 +2607,10 @@ unsigned long scale_rt_power(int cpu)
 		available = total - rq->rt_avg;
 	}
 
-	if (unlikely((s64)total < SCHED_LOAD_SCALE))
-		total = SCHED_LOAD_SCALE;
+	if (unlikely((s64)total < SCHED_POWER_SCALE))
+		total = SCHED_POWER_SCALE;
 
-	total >>= SCHED_LOAD_SHIFT;
+	total >>= SCHED_POWER_SHIFT;
 
 	return div_u64(available, total);
 }
@@ -2618,7 +2618,7 @@ unsigned long scale_rt_power(int cpu)
 static void update_cpu_power(struct sched_domain *sd, int cpu)
 {
 	unsigned long weight = sd->span_weight;
-	unsigned long power = SCHED_LOAD_SCALE;
+	unsigned long power = SCHED_POWER_SCALE;
 	struct sched_group *sdg = sd->groups;
 
 	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
@@ -2627,7 +2627,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
 		else
 			power *= default_scale_smt_power(sd, cpu);
 
-		power >>= SCHED_LOAD_SHIFT;
+		power >>= SCHED_POWER_SHIFT;
 	}
 
 	sdg->cpu_power_orig = power;
@@ -2637,10 +2637,10 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
 	else
 		power *= default_scale_freq_power(sd, cpu);
 
-	power >>= SCHED_LOAD_SHIFT;
+	power >>= SCHED_POWER_SHIFT;
 
 	power *= scale_rt_power(cpu);
-	power >>= SCHED_LOAD_SHIFT;
+	power >>= SCHED_POWER_SHIFT;
 
 	if (!power)
 		power = 1;
@@ -2682,7 +2682,7 @@ static inline int
 fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
 {
 	/*
-	 * Only siblings can have significantly less than SCHED_LOAD_SCALE
+	 * Only siblings can have significantly less than SCHED_POWER_SCALE
 	 */
 	if (!(sd->flags & SD_SHARE_CPUPOWER))
 		return 0;
@@ -2770,7 +2770,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	}
 
 	/* Adjust by relative CPU power of the group */
-	sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
+	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power;
 
 	/*
 	 * Consider the group unbalanced when the imbalance is larger
@@ -2787,7 +2787,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
 		sgs->group_imb = 1;
 
-	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power,
+						SCHED_POWER_SCALE);
 	if (!sgs->group_capacity)
 		sgs->group_capacity = fix_small_capacity(sd, group);
 	sgs->group_weight = group->group_weight;
@@ -2961,7 +2962,7 @@ static int check_asym_packing(struct sched_domain *sd,
 		return 0;
 
 	*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
-				       SCHED_LOAD_SCALE);
+				       SCHED_POWER_SCALE);
 	return 1;
 }
 
@@ -2990,7 +2991,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 			cpu_avg_load_per_task(this_cpu);
 
 	scaled_busy_load_per_task = sds->busiest_load_per_task
-						 * SCHED_LOAD_SCALE;
+					 * SCHED_POWER_SCALE;
 	scaled_busy_load_per_task /= sds->busiest->cpu_power;
 
 	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
@@ -3009,10 +3010,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 			min(sds->busiest_load_per_task, sds->max_load);
 	pwr_now += sds->this->cpu_power *
 			min(sds->this_load_per_task, sds->this_load);
-	pwr_now /= SCHED_LOAD_SCALE;
+	pwr_now /= SCHED_POWER_SCALE;
 
 	/* Amount of load we'd subtract */
-	tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
 		sds->busiest->cpu_power;
 	if (sds->max_load > tmp)
 		pwr_move += sds->busiest->cpu_power *
@@ -3020,15 +3021,15 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 
 	/* Amount of load we'd add */
 	if (sds->max_load * sds->busiest->cpu_power <
-		sds->busiest_load_per_task * SCHED_LOAD_SCALE)
+		sds->busiest_load_per_task * SCHED_POWER_SCALE)
 		tmp = (sds->max_load * sds->busiest->cpu_power) /
 			sds->this->cpu_power;
 	else
-		tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+		tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
 			sds->this->cpu_power;
 	pwr_move += sds->this->cpu_power *
 			min(sds->this_load_per_task, sds->this_load + tmp);
-	pwr_move /= SCHED_LOAD_SCALE;
+	pwr_move /= SCHED_POWER_SCALE;
 
 	/* Move if we gain throughput */
 	if (pwr_move > pwr_now)
@@ -3070,7 +3071,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 		load_above_capacity = (sds->busiest_nr_running -
 						sds->busiest_group_capacity);
 
-		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
+		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
 
 		load_above_capacity /= sds->busiest->cpu_power;
 	}
@@ -3090,7 +3091,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 	/* How much load to actually move to equalise the imbalance */
 	*imbalance = min(max_pull * sds->busiest->cpu_power,
 		(sds->avg_load - sds->this_load) * sds->this->cpu_power)
-			/ SCHED_LOAD_SCALE;
+			/ SCHED_POWER_SCALE;
 
 	/*
 	 * if *imbalance is less than the average load per runnable task
@@ -3159,7 +3160,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (!sds.busiest || sds.busiest_nr_running == 0)
 		goto out_balanced;
 
-	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
+	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
 
 	/*
 	 * If the busiest group is imbalanced the below checks don't
@@ -3238,7 +3239,8 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
 
 	for_each_cpu(i, sched_group_cpus(group)) {
 		unsigned long power = power_of(i);
-		unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
+		unsigned long capacity = DIV_ROUND_CLOSEST(power,
+							   SCHED_POWER_SCALE);
 		unsigned long wl;
 
 		if (!capacity)
@@ -3263,7 +3265,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
 		 * the load can be moved away from the cpu that is potentially
 		 * running at a lower capacity.
 		 */
-		wl = (wl * SCHED_LOAD_SCALE) / power;
+		wl = (wl * SCHED_POWER_SCALE) / power;
 
 		if (wl > max_load) {
 			max_load = wl;
-- 
cgit v1.2.2


From 1e876231785d82443a5ac8b6c660e9f51bc5dede Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 17 May 2011 16:21:10 -0700
Subject: sched: Fix ->min_vruntime calculation in dequeue_entity()

Dima Zavin <dima@android.com> reported:

"After pulling the thread off the run-queue during a cgroup change,
the cfs_rq.min_vruntime gets recalculated. The dequeued thread's vruntime
then gets normalized to this new value. This can then lead to the thread
getting an unfair boost in the new group if the vruntime of the next
task in the old run-queue was way further ahead."

Reported-by: Dima Zavin <dima@android.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Recalls-having-tested-once-upon-a-time-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1305674470-23727-1-git-send-email-john.stultz@linaro.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e32a9b70ee9c..433491c2dc8f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1076,8 +1076,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	se->on_rq = 0;
 	update_cfs_load(cfs_rq, 0);
 	account_entity_dequeue(cfs_rq, se);
-	update_min_vruntime(cfs_rq);
-	update_cfs_shares(cfs_rq);
 
 	/*
 	 * Normalize the entity after updating the min_vruntime because the
@@ -1086,6 +1084,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	if (!(flags & DEQUEUE_SLEEP))
 		se->vruntime -= cfs_rq->min_vruntime;
+
+	update_min_vruntime(cfs_rq);
+	update_cfs_shares(cfs_rq);
 }
 
 /*
-- 
cgit v1.2.2


From 9c3f75cbd144014bea6af866a154cc2e73ab2287 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 14 Jul 2011 13:00:06 +0200
Subject: sched: Break out cpu_power from the sched_group structure

In order to prepare for non-unique sched_groups per domain, we need to
carry the cpu_power elsewhere, so put a level of indirection in.

Reported-and-tested-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-qkho2byuhe4482fuknss40ad@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 46 +++++++++++++++++++++++-----------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 433491c2dc8f..c768588e180b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1585,7 +1585,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 		}
 
 		/* Adjust by relative CPU power of the group */
-		avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power;
+		avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
 
 		if (local_group) {
 			this_load = avg_load;
@@ -2631,7 +2631,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
 		power >>= SCHED_POWER_SHIFT;
 	}
 
-	sdg->cpu_power_orig = power;
+	sdg->sgp->power_orig = power;
 
 	if (sched_feat(ARCH_POWER))
 		power *= arch_scale_freq_power(sd, cpu);
@@ -2647,7 +2647,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
 		power = 1;
 
 	cpu_rq(cpu)->cpu_power = power;
-	sdg->cpu_power = power;
+	sdg->sgp->power = power;
 }
 
 static void update_group_power(struct sched_domain *sd, int cpu)
@@ -2665,11 +2665,11 @@ static void update_group_power(struct sched_domain *sd, int cpu)
 
 	group = child->groups;
 	do {
-		power += group->cpu_power;
+		power += group->sgp->power;
 		group = group->next;
 	} while (group != child->groups);
 
-	sdg->cpu_power = power;
+	sdg->sgp->power = power;
 }
 
 /*
@@ -2691,7 +2691,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
 	/*
 	 * If ~90% of the cpu_power is still there, we're good.
 	 */
-	if (group->cpu_power * 32 > group->cpu_power_orig * 29)
+	if (group->sgp->power * 32 > group->sgp->power_orig * 29)
 		return 1;
 
 	return 0;
@@ -2771,7 +2771,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	}
 
 	/* Adjust by relative CPU power of the group */
-	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power;
+	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
 
 	/*
 	 * Consider the group unbalanced when the imbalance is larger
@@ -2788,7 +2788,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
 		sgs->group_imb = 1;
 
-	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power,
+	sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
 						SCHED_POWER_SCALE);
 	if (!sgs->group_capacity)
 		sgs->group_capacity = fix_small_capacity(sd, group);
@@ -2877,7 +2877,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 			return;
 
 		sds->total_load += sgs.group_load;
-		sds->total_pwr += sg->cpu_power;
+		sds->total_pwr += sg->sgp->power;
 
 		/*
 		 * In case the child domain prefers tasks go to siblings
@@ -2962,7 +2962,7 @@ static int check_asym_packing(struct sched_domain *sd,
 	if (this_cpu > busiest_cpu)
 		return 0;
 
-	*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
+	*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power,
 				       SCHED_POWER_SCALE);
 	return 1;
 }
@@ -2993,7 +2993,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 
 	scaled_busy_load_per_task = sds->busiest_load_per_task
 					 * SCHED_POWER_SCALE;
-	scaled_busy_load_per_task /= sds->busiest->cpu_power;
+	scaled_busy_load_per_task /= sds->busiest->sgp->power;
 
 	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
 			(scaled_busy_load_per_task * imbn)) {
@@ -3007,28 +3007,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 	 * moving them.
 	 */
 
-	pwr_now += sds->busiest->cpu_power *
+	pwr_now += sds->busiest->sgp->power *
 			min(sds->busiest_load_per_task, sds->max_load);
-	pwr_now += sds->this->cpu_power *
+	pwr_now += sds->this->sgp->power *
 			min(sds->this_load_per_task, sds->this_load);
 	pwr_now /= SCHED_POWER_SCALE;
 
 	/* Amount of load we'd subtract */
 	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
-		sds->busiest->cpu_power;
+		sds->busiest->sgp->power;
 	if (sds->max_load > tmp)
-		pwr_move += sds->busiest->cpu_power *
+		pwr_move += sds->busiest->sgp->power *
 			min(sds->busiest_load_per_task, sds->max_load - tmp);
 
 	/* Amount of load we'd add */
-	if (sds->max_load * sds->busiest->cpu_power <
+	if (sds->max_load * sds->busiest->sgp->power <
 		sds->busiest_load_per_task * SCHED_POWER_SCALE)
-		tmp = (sds->max_load * sds->busiest->cpu_power) /
-			sds->this->cpu_power;
+		tmp = (sds->max_load * sds->busiest->sgp->power) /
+			sds->this->sgp->power;
 	else
 		tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
-			sds->this->cpu_power;
-	pwr_move += sds->this->cpu_power *
+			sds->this->sgp->power;
+	pwr_move += sds->this->sgp->power *
 			min(sds->this_load_per_task, sds->this_load + tmp);
 	pwr_move /= SCHED_POWER_SCALE;
 
@@ -3074,7 +3074,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 
 		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
 
-		load_above_capacity /= sds->busiest->cpu_power;
+		load_above_capacity /= sds->busiest->sgp->power;
 	}
 
 	/*
@@ -3090,8 +3090,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
 
 	/* How much load to actually move to equalise the imbalance */
-	*imbalance = min(max_pull * sds->busiest->cpu_power,
-		(sds->avg_load - sds->this_load) * sds->this->cpu_power)
+	*imbalance = min(max_pull * sds->busiest->sgp->power,
+		(sds->avg_load - sds->this_load) * sds->this->sgp->power)
 			/ SCHED_POWER_SCALE;
 
 	/*
-- 
cgit v1.2.2