author:    Jens Axboe <axboe@kernel.dk>  2012-07-30 03:03:10 -0400
committer: Jens Axboe <axboe@kernel.dk>  2012-07-30 03:03:10 -0400
commit:    72ea1f74fcdf874cca6d2c0962379523bbd99e2c (patch)
tree:      4c67be6c73356086ff44ef1b8b1c9479702689ca /kernel/sched
parent:    b1af9be5ef77898c05667bb9dbf3b180d91d3292 (diff)
parent:    a73ff3231df59a4b92ccd0dd4e73897c5822489b (diff)

Merge branch 'for-jens' of git://git.drbd.org/linux-drbd into for-3.6/drivers
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/core.c       | 525
-rw-r--r--  kernel/sched/fair.c       |  71
-rw-r--r--  kernel/sched/idle_task.c  |   1
-rw-r--r--  kernel/sched/rt.c         |  53
-rw-r--r--  kernel/sched/sched.h      |   4
5 files changed, 479 insertions, 175 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 39eb6011bc38..468bdd44c1ba 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -142,9 +142,8 @@ const_debug unsigned int sysctl_sched_features =
142#define SCHED_FEAT(name, enabled) \ 142#define SCHED_FEAT(name, enabled) \
143 #name , 143 #name ,
144 144
145static __read_mostly char *sched_feat_names[] = { 145static const char * const sched_feat_names[] = {
146#include "features.h" 146#include "features.h"
147 NULL
148}; 147};
149 148
150#undef SCHED_FEAT 149#undef SCHED_FEAT
@@ -2082,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev,
2082#endif 2081#endif
2083 2082
2084 /* Here we just switch the register state and the stack. */ 2083 /* Here we just switch the register state and the stack. */
2085 rcu_switch_from(prev);
2086 switch_to(prev, next, prev); 2084 switch_to(prev, next, prev);
2087 2085
2088 barrier(); 2086 barrier();
@@ -2162,11 +2160,73 @@ unsigned long this_cpu_load(void)
2162} 2160}
2163 2161
2164 2162
2163/*
2164 * Global load-average calculations
2165 *
2166 * We take a distributed and async approach to calculating the global load-avg
2167 * in order to minimize overhead.
2168 *
2169 * The global load average is an exponentially decaying average of nr_running +
2170 * nr_uninterruptible.
2171 *
2172 * Once every LOAD_FREQ:
2173 *
2174 * nr_active = 0;
2175 * for_each_possible_cpu(cpu)
2176 * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
2177 *
2178 * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
2179 *
2180 * Due to a number of reasons the above turns in the mess below:
2181 *
2182 * - for_each_possible_cpu() is prohibitively expensive on machines with
2183 * serious number of cpus, therefore we need to take a distributed approach
2184 * to calculating nr_active.
2185 *
2186 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
2187 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
2188 *
2189 * So assuming nr_active := 0 when we start out -- true per definition, we
2190 * can simply take per-cpu deltas and fold those into a global accumulate
2191 * to obtain the same result. See calc_load_fold_active().
2192 *
2193 * Furthermore, in order to avoid synchronizing all per-cpu delta folding
2194 * across the machine, we assume 10 ticks is sufficient time for every
2195 * cpu to have completed this task.
2196 *
2197 * This places an upper-bound on the IRQ-off latency of the machine. Then
2198 * again, being late doesn't loose the delta, just wrecks the sample.
2199 *
2200 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
2201 * this would add another cross-cpu cacheline miss and atomic operation
2202 * to the wakeup path. Instead we increment on whatever cpu the task ran
2203 * when it went into uninterruptible state and decrement on whatever cpu
2204 * did the wakeup. This means that only the sum of nr_uninterruptible over
2205 * all cpus yields the correct result.
2206 *
2207 * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
2208 */
2209
2165/* Variables and functions for calc_load */ 2210/* Variables and functions for calc_load */
2166static atomic_long_t calc_load_tasks; 2211static atomic_long_t calc_load_tasks;
2167static unsigned long calc_load_update; 2212static unsigned long calc_load_update;
2168unsigned long avenrun[3]; 2213unsigned long avenrun[3];
2169EXPORT_SYMBOL(avenrun); 2214EXPORT_SYMBOL(avenrun); /* should be removed */
2215
2216/**
2217 * get_avenrun - get the load average array
2218 * @loads: pointer to dest load array
2219 * @offset: offset to add
2220 * @shift: shift count to shift the result left
2221 *
2222 * These values are estimates at best, so no need for locking.
2223 */
2224void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2225{
2226 loads[0] = (avenrun[0] + offset) << shift;
2227 loads[1] = (avenrun[1] + offset) << shift;
2228 loads[2] = (avenrun[2] + offset) << shift;
2229}
2170 2230
2171static long calc_load_fold_active(struct rq *this_rq) 2231static long calc_load_fold_active(struct rq *this_rq)
2172{ 2232{
@@ -2183,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq)
2183 return delta; 2243 return delta;
2184} 2244}
2185 2245
2246/*
2247 * a1 = a0 * e + a * (1 - e)
2248 */
2186static unsigned long 2249static unsigned long
2187calc_load(unsigned long load, unsigned long exp, unsigned long active) 2250calc_load(unsigned long load, unsigned long exp, unsigned long active)
2188{ 2251{
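The decay step documented above ("a1 = a0 * e + a * (1 - e)") is easier to see outside the kernel. Below is a minimal stand-alone sketch of the fixed-point math; FSHIFT/FIXED_1/EXP_1 match the constants in include/linux/sched.h of this era, and the demo values in main() are purely illustrative.

/* Stand-alone sketch of one avenrun update, mirroring calc_load(). */
#include <stdio.h>

#define FSHIFT   11                 /* bits of fixed-point precision */
#define FIXED_1  (1UL << FSHIFT)    /* 1.0 in fixed-point */
#define EXP_1    1884               /* 1/exp(5sec/1min) in fixed-point */

/* a1 = a0 * e + a * (1 - e), everything scaled by FIXED_1 */
static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        return load >> FSHIFT;
}

int main(void)
{
        unsigned long avenrun0 = 0;
        unsigned long nr_active = 3;    /* nr_running + nr_uninterruptible */

        /* one LOAD_FREQ (5 s) sample with three active tasks */
        avenrun0 = calc_load(avenrun0, EXP_1, nr_active * FIXED_1);

        /* /proc/loadavg-style formatting: integer part plus two decimals */
        printf("load1: %lu.%02lu\n", avenrun0 >> FSHIFT,
               ((avenrun0 & (FIXED_1 - 1)) * 100) >> FSHIFT);
        return 0;
}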
@@ -2194,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
2194 2257
2195#ifdef CONFIG_NO_HZ 2258#ifdef CONFIG_NO_HZ
2196/* 2259/*
2197 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 2260 * Handle NO_HZ for the global load-average.
2261 *
2262 * Since the above described distributed algorithm to compute the global
2263 * load-average relies on per-cpu sampling from the tick, it is affected by
2264 * NO_HZ.
2265 *
2266 * The basic idea is to fold the nr_active delta into a global idle-delta upon
2267 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
2268 * when we read the global state.
2269 *
2270 * Obviously reality has to ruin such a delightfully simple scheme:
2271 *
2272 * - When we go NO_HZ idle during the window, we can negate our sample
2273 * contribution, causing under-accounting.
2274 *
2275 * We avoid this by keeping two idle-delta counters and flipping them
2276 * when the window starts, thus separating old and new NO_HZ load.
2277 *
2278 * The only trick is the slight shift in index flip for read vs write.
2279 *
2280 * 0s 5s 10s 15s
2281 * +10 +10 +10 +10
2282 * |-|-----------|-|-----------|-|-----------|-|
2283 * r:0 0 1 1 0 0 1 1 0
2284 * w:0 1 1 0 0 1 1 0 0
2285 *
2286 * This ensures we'll fold the old idle contribution in this window while
2287 * accumlating the new one.
2288 *
2289 * - When we wake up from NO_HZ idle during the window, we push up our
2290 * contribution, since we effectively move our sample point to a known
2291 * busy state.
2292 *
2293 * This is solved by pushing the window forward, and thus skipping the
2294 * sample, for this cpu (effectively using the idle-delta for this cpu which
2295 * was in effect at the time the window opened). This also solves the issue
2296 * of having to deal with a cpu having been in NOHZ idle for multiple
2297 * LOAD_FREQ intervals.
2198 * 2298 *
2199 * When making the ILB scale, we should try to pull this in as well. 2299 * When making the ILB scale, we should try to pull this in as well.
2200 */ 2300 */
2201static atomic_long_t calc_load_tasks_idle; 2301static atomic_long_t calc_load_idle[2];
2302static int calc_load_idx;
2303
2304static inline int calc_load_write_idx(void)
2305{
2306 int idx = calc_load_idx;
2307
2308 /*
2309 * See calc_global_nohz(), if we observe the new index, we also
2310 * need to observe the new update time.
2311 */
2312 smp_rmb();
2313
2314 /*
2315 * If the folding window started, make sure we start writing in the
2316 * next idle-delta.
2317 */
2318 if (!time_before(jiffies, calc_load_update))
2319 idx++;
2202 2320
2203void calc_load_account_idle(struct rq *this_rq) 2321 return idx & 1;
2322}
2323
2324static inline int calc_load_read_idx(void)
2204{ 2325{
2326 return calc_load_idx & 1;
2327}
2328
2329void calc_load_enter_idle(void)
2330{
2331 struct rq *this_rq = this_rq();
2205 long delta; 2332 long delta;
2206 2333
2334 /*
2335 * We're going into NOHZ mode, if there's any pending delta, fold it
2336 * into the pending idle delta.
2337 */
2207 delta = calc_load_fold_active(this_rq); 2338 delta = calc_load_fold_active(this_rq);
2208 if (delta) 2339 if (delta) {
2209 atomic_long_add(delta, &calc_load_tasks_idle); 2340 int idx = calc_load_write_idx();
2341 atomic_long_add(delta, &calc_load_idle[idx]);
2342 }
2210} 2343}
2211 2344
2212static long calc_load_fold_idle(void) 2345void calc_load_exit_idle(void)
2213{ 2346{
2214 long delta = 0; 2347 struct rq *this_rq = this_rq();
2215 2348
2216 /* 2349 /*
2217 * Its got a race, we don't care... 2350 * If we're still before the sample window, we're done.
2218 */ 2351 */
2219 if (atomic_long_read(&calc_load_tasks_idle)) 2352 if (time_before(jiffies, this_rq->calc_load_update))
2220 delta = atomic_long_xchg(&calc_load_tasks_idle, 0); 2353 return;
2354
2355 /*
2356 * We woke inside or after the sample window, this means we're already
2357 * accounted through the nohz accounting, so skip the entire deal and
2358 * sync up for the next window.
2359 */
2360 this_rq->calc_load_update = calc_load_update;
2361 if (time_before(jiffies, this_rq->calc_load_update + 10))
2362 this_rq->calc_load_update += LOAD_FREQ;
2363}
2364
2365static long calc_load_fold_idle(void)
2366{
2367 int idx = calc_load_read_idx();
2368 long delta = 0;
2369
2370 if (atomic_long_read(&calc_load_idle[idx]))
2371 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
2221 2372
2222 return delta; 2373 return delta;
2223} 2374}
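The write-index/read-index dance for the two idle-delta buckets is the subtle part of the hunk above. The user-space toy below (names borrowed from the kernel, bodies simplified and only intended to model the scheme described in the comment, not the real locking/barriers) shows how the "old" bucket is drained in the current window while the "new" one keeps accumulating across the flip.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_long calc_load_idle[2];
static int calc_load_idx;       /* flipped by the reader only */
static bool window_open;        /* stands in for !time_before(jiffies, calc_load_update) */

static int calc_load_write_idx(void)
{
        int idx = calc_load_idx;

        /* once the fold window has opened, new deltas go into the next slot */
        if (window_open)
                idx++;
        return idx & 1;
}

/* a cpu entering NO_HZ idle folds its pending delta away */
static void enter_idle(long delta)
{
        atomic_fetch_add(&calc_load_idle[calc_load_write_idx()], delta);
}

/* the global update drains the slot selected by the read index */
static long fold_idle(void)
{
        return atomic_exchange(&calc_load_idle[calc_load_idx & 1], 0);
}

int main(void)
{
        enter_idle(2);                  /* before the window opens */
        window_open = true;
        enter_idle(1);                  /* lands in the *other* slot */

        printf("this window: %ld\n", fold_idle());      /* 2 */
        calc_load_idx++;                /* reader flips after folding */
        window_open = false;
        printf("next window: %ld\n", fold_idle());      /* 1 */
        return 0;
}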
@@ -2303,66 +2454,39 @@ static void calc_global_nohz(void)
2303{ 2454{
2304 long delta, active, n; 2455 long delta, active, n;
2305 2456
2306 /* 2457 if (!time_before(jiffies, calc_load_update + 10)) {
2307 * If we crossed a calc_load_update boundary, make sure to fold 2458 /*
2308 * any pending idle changes, the respective CPUs might have 2459 * Catch-up, fold however many we are behind still
2309 * missed the tick driven calc_load_account_active() update 2460 */
2310 * due to NO_HZ. 2461 delta = jiffies - calc_load_update - 10;
2311 */ 2462 n = 1 + (delta / LOAD_FREQ);
2312 delta = calc_load_fold_idle();
2313 if (delta)
2314 atomic_long_add(delta, &calc_load_tasks);
2315
2316 /*
2317 * It could be the one fold was all it took, we done!
2318 */
2319 if (time_before(jiffies, calc_load_update + 10))
2320 return;
2321
2322 /*
2323 * Catch-up, fold however many we are behind still
2324 */
2325 delta = jiffies - calc_load_update - 10;
2326 n = 1 + (delta / LOAD_FREQ);
2327 2463
2328 active = atomic_long_read(&calc_load_tasks); 2464 active = atomic_long_read(&calc_load_tasks);
2329 active = active > 0 ? active * FIXED_1 : 0; 2465 active = active > 0 ? active * FIXED_1 : 0;
2330 2466
2331 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); 2467 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2332 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); 2468 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2333 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); 2469 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2334 2470
2335 calc_load_update += n * LOAD_FREQ; 2471 calc_load_update += n * LOAD_FREQ;
2336} 2472 }
2337#else
2338void calc_load_account_idle(struct rq *this_rq)
2339{
2340}
2341 2473
2342static inline long calc_load_fold_idle(void) 2474 /*
2343{ 2475 * Flip the idle index...
2344 return 0; 2476 *
2477 * Make sure we first write the new time then flip the index, so that
2478 * calc_load_write_idx() will see the new time when it reads the new
2479 * index, this avoids a double flip messing things up.
2480 */
2481 smp_wmb();
2482 calc_load_idx++;
2345} 2483}
2484#else /* !CONFIG_NO_HZ */
2346 2485
2347static void calc_global_nohz(void) 2486static inline long calc_load_fold_idle(void) { return 0; }
2348{ 2487static inline void calc_global_nohz(void) { }
2349}
2350#endif
2351 2488
2352/** 2489#endif /* CONFIG_NO_HZ */
2353 * get_avenrun - get the load average array
2354 * @loads: pointer to dest load array
2355 * @offset: offset to add
2356 * @shift: shift count to shift the result left
2357 *
2358 * These values are estimates at best, so no need for locking.
2359 */
2360void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2361{
2362 loads[0] = (avenrun[0] + offset) << shift;
2363 loads[1] = (avenrun[1] + offset) << shift;
2364 loads[2] = (avenrun[2] + offset) << shift;
2365}
2366 2490
2367/* 2491/*
2368 * calc_load - update the avenrun load estimates 10 ticks after the 2492 * calc_load - update the avenrun load estimates 10 ticks after the
@@ -2370,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2370 */ 2494 */
2371void calc_global_load(unsigned long ticks) 2495void calc_global_load(unsigned long ticks)
2372{ 2496{
2373 long active; 2497 long active, delta;
2374 2498
2375 if (time_before(jiffies, calc_load_update + 10)) 2499 if (time_before(jiffies, calc_load_update + 10))
2376 return; 2500 return;
2377 2501
2502 /*
2503 * Fold the 'old' idle-delta to include all NO_HZ cpus.
2504 */
2505 delta = calc_load_fold_idle();
2506 if (delta)
2507 atomic_long_add(delta, &calc_load_tasks);
2508
2378 active = atomic_long_read(&calc_load_tasks); 2509 active = atomic_long_read(&calc_load_tasks);
2379 active = active > 0 ? active * FIXED_1 : 0; 2510 active = active > 0 ? active * FIXED_1 : 0;
2380 2511
@@ -2385,12 +2516,7 @@ void calc_global_load(unsigned long ticks)
2385 calc_load_update += LOAD_FREQ; 2516 calc_load_update += LOAD_FREQ;
2386 2517
2387 /* 2518 /*
2388 * Account one period with whatever state we found before 2519 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
2389 * folding in the nohz state and ageing the entire idle period.
2390 *
2391 * This avoids loosing a sample when we go idle between
2392 * calc_load_account_active() (10 ticks ago) and now and thus
2393 * under-accounting.
2394 */ 2520 */
2395 calc_global_nohz(); 2521 calc_global_nohz();
2396} 2522}
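calc_global_nohz() above leans on calc_load_n(), which is not part of this hunk. The idea is that applying a1 = a0*e + a*(1-e) n times collapses into the closed form an = a0*e^n + a*(1 - e^n), with e^n computed by fixed-point exponentiation by squaring. A compressed re-derivation follows; the helper names track the kernel's, but the bodies are only a sketch.

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)

/* x^n for fixed-point x, by squaring; x^0 == 1.0 */
static unsigned long fixed_power(unsigned long x, unsigned int n)
{
        unsigned long result = FIXED_1;

        for (; n; n >>= 1) {
                if (n & 1)
                        result = (result * x) >> FSHIFT;
                x = (x * x) >> FSHIFT;
        }
        return result;
}

static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        return load >> FSHIFT;
}

/* catch up over n missed LOAD_FREQ windows in one step */
static unsigned long calc_load_n(unsigned long load, unsigned long exp,
                                 unsigned long active, unsigned int n)
{
        return calc_load(load, fixed_power(exp, n), active);
}

Up to fixed-point rounding this equals running calc_load() once per missed window, which is why a machine that idled through several windows can be folded in bulk.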
@@ -2407,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq)
2407 return; 2533 return;
2408 2534
2409 delta = calc_load_fold_active(this_rq); 2535 delta = calc_load_fold_active(this_rq);
2410 delta += calc_load_fold_idle();
2411 if (delta) 2536 if (delta)
2412 atomic_long_add(delta, &calc_load_tasks); 2537 atomic_long_add(delta, &calc_load_tasks);
2413 2538
@@ -2415,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq)
2415} 2540}
2416 2541
2417/* 2542/*
2543 * End of global load-average stuff
2544 */
2545
2546/*
2418 * The exact cpuload at various idx values, calculated at every tick would be 2547 * The exact cpuload at various idx values, calculated at every tick would be
2419 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load 2548 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
2420 * 2549 *
@@ -2517,25 +2646,32 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2517 sched_avg_update(this_rq); 2646 sched_avg_update(this_rq);
2518} 2647}
2519 2648
2649#ifdef CONFIG_NO_HZ
2650/*
2651 * There is no sane way to deal with nohz on smp when using jiffies because the
2652 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
2653 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
2654 *
2655 * Therefore we cannot use the delta approach from the regular tick since that
2656 * would seriously skew the load calculation. However we'll make do for those
2657 * updates happening while idle (nohz_idle_balance) or coming out of idle
2658 * (tick_nohz_idle_exit).
2659 *
2660 * This means we might still be one tick off for nohz periods.
2661 */
2662
2520/* 2663/*
2521 * Called from nohz_idle_balance() to update the load ratings before doing the 2664 * Called from nohz_idle_balance() to update the load ratings before doing the
2522 * idle balance. 2665 * idle balance.
2523 */ 2666 */
2524void update_idle_cpu_load(struct rq *this_rq) 2667void update_idle_cpu_load(struct rq *this_rq)
2525{ 2668{
2526 unsigned long curr_jiffies = jiffies; 2669 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2527 unsigned long load = this_rq->load.weight; 2670 unsigned long load = this_rq->load.weight;
2528 unsigned long pending_updates; 2671 unsigned long pending_updates;
2529 2672
2530 /* 2673 /*
2531 * Bloody broken means of dealing with nohz, but better than nothing.. 2674 * bail if there's load or we're actually up-to-date.
2532 * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
2533 * update and see 0 difference the one time and 2 the next, even though
2534 * we ticked at roughtly the same rate.
2535 *
2536 * Hence we only use this from nohz_idle_balance() and skip this
2537 * nonsense when called from the scheduler_tick() since that's
2538 * guaranteed a stable rate.
2539 */ 2675 */
2540 if (load || curr_jiffies == this_rq->last_load_update_tick) 2676 if (load || curr_jiffies == this_rq->last_load_update_tick)
2541 return; 2677 return;
@@ -2547,12 +2683,38 @@ void update_idle_cpu_load(struct rq *this_rq)
2547} 2683}
2548 2684
2549/* 2685/*
2686 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
2687 */
2688void update_cpu_load_nohz(void)
2689{
2690 struct rq *this_rq = this_rq();
2691 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2692 unsigned long pending_updates;
2693
2694 if (curr_jiffies == this_rq->last_load_update_tick)
2695 return;
2696
2697 raw_spin_lock(&this_rq->lock);
2698 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2699 if (pending_updates) {
2700 this_rq->last_load_update_tick = curr_jiffies;
2701 /*
2702 * We were idle, this means load 0, the current load might be
2703 * !0 due to remote wakeups and the sort.
2704 */
2705 __update_cpu_load(this_rq, 0, pending_updates);
2706 }
2707 raw_spin_unlock(&this_rq->lock);
2708}
2709#endif /* CONFIG_NO_HZ */
2710
2711/*
2550 * Called from scheduler_tick() 2712 * Called from scheduler_tick()
2551 */ 2713 */
2552static void update_cpu_load_active(struct rq *this_rq) 2714static void update_cpu_load_active(struct rq *this_rq)
2553{ 2715{
2554 /* 2716 /*
2555 * See the mess in update_idle_cpu_load(). 2717 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
2556 */ 2718 */
2557 this_rq->last_load_update_tick = jiffies; 2719 this_rq->last_load_update_tick = jiffies;
2558 __update_cpu_load(this_rq, this_rq->load.weight, 1); 2720 __update_cpu_load(this_rq, this_rq->load.weight, 1);
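The per-cpu cpu_load[] rings face the same catch-up problem in miniature: every missed nohz tick is a tick that saw zero load. Using the decay rule quoted earlier in this file, load = (2^idx - 1)/2^idx * load + 1/2^idx * cur_load, the effect of the pending_updates argument can be sketched with a naive loop; this helper is illustrative only, the real __update_cpu_load() is not shown in this hunk.

#define CPU_LOAD_IDX_MAX 5

static void sketch_update_cpu_load(unsigned long cpu_load[CPU_LOAD_IDX_MAX],
                                   unsigned long this_load,
                                   unsigned long pending_updates)
{
        int idx;

        cpu_load[0] = this_load;        /* idx 0 tracks the instantaneous load */

        for (idx = 1; idx < CPU_LOAD_IDX_MAX; idx++) {
                unsigned long old_load = cpu_load[idx];
                unsigned long tick;

                /* the missed ticks saw zero load ... */
                for (tick = 1; tick < pending_updates; tick++)
                        old_load = (old_load * ((1UL << idx) - 1)) >> idx;

                /* ... and the final tick sees the current load */
                cpu_load[idx] = (old_load * ((1UL << idx) - 1) + this_load) >> idx;
        }
}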
@@ -4982,7 +5144,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4982 p->sched_class->set_cpus_allowed(p, new_mask); 5144 p->sched_class->set_cpus_allowed(p, new_mask);
4983 5145
4984 cpumask_copy(&p->cpus_allowed, new_mask); 5146 cpumask_copy(&p->cpus_allowed, new_mask);
4985 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 5147 p->nr_cpus_allowed = cpumask_weight(new_mask);
4986} 5148}
4987 5149
4988/* 5150/*
@@ -5524,15 +5686,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
5524 5686
5525#ifdef CONFIG_SCHED_DEBUG 5687#ifdef CONFIG_SCHED_DEBUG
5526 5688
5527static __read_mostly int sched_domain_debug_enabled; 5689static __read_mostly int sched_debug_enabled;
5528 5690
5529static int __init sched_domain_debug_setup(char *str) 5691static int __init sched_debug_setup(char *str)
5530{ 5692{
5531 sched_domain_debug_enabled = 1; 5693 sched_debug_enabled = 1;
5532 5694
5533 return 0; 5695 return 0;
5534} 5696}
5535early_param("sched_debug", sched_domain_debug_setup); 5697early_param("sched_debug", sched_debug_setup);
5698
5699static inline bool sched_debug(void)
5700{
5701 return sched_debug_enabled;
5702}
5536 5703
5537static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 5704static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5538 struct cpumask *groupmask) 5705 struct cpumask *groupmask)
@@ -5572,7 +5739,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5572 break; 5739 break;
5573 } 5740 }
5574 5741
5575 if (!group->sgp->power) { 5742 /*
5743 * Even though we initialize ->power to something semi-sane,
5744 * we leave power_orig unset. This allows us to detect if
5745 * domain iteration is still funny without causing /0 traps.
5746 */
5747 if (!group->sgp->power_orig) {
5576 printk(KERN_CONT "\n"); 5748 printk(KERN_CONT "\n");
5577 printk(KERN_ERR "ERROR: domain->cpu_power not " 5749 printk(KERN_ERR "ERROR: domain->cpu_power not "
5578 "set\n"); 5750 "set\n");
@@ -5620,7 +5792,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5620{ 5792{
5621 int level = 0; 5793 int level = 0;
5622 5794
5623 if (!sched_domain_debug_enabled) 5795 if (!sched_debug_enabled)
5624 return; 5796 return;
5625 5797
5626 if (!sd) { 5798 if (!sd) {
@@ -5641,6 +5813,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5641} 5813}
5642#else /* !CONFIG_SCHED_DEBUG */ 5814#else /* !CONFIG_SCHED_DEBUG */
5643# define sched_domain_debug(sd, cpu) do { } while (0) 5815# define sched_domain_debug(sd, cpu) do { } while (0)
5816static inline bool sched_debug(void)
5817{
5818 return false;
5819}
5644#endif /* CONFIG_SCHED_DEBUG */ 5820#endif /* CONFIG_SCHED_DEBUG */
5645 5821
5646static int sd_degenerate(struct sched_domain *sd) 5822static int sd_degenerate(struct sched_domain *sd)
@@ -5962,6 +6138,44 @@ struct sched_domain_topology_level {
5962 struct sd_data data; 6138 struct sd_data data;
5963}; 6139};
5964 6140
6141/*
6142 * Build an iteration mask that can exclude certain CPUs from the upwards
6143 * domain traversal.
6144 *
6145 * Asymmetric node setups can result in situations where the domain tree is of
6146 * unequal depth, make sure to skip domains that already cover the entire
6147 * range.
6148 *
6149 * In that case build_sched_domains() will have terminated the iteration early
6150 * and our sibling sd spans will be empty. Domains should always include the
6151 * cpu they're built on, so check that.
6152 *
6153 */
6154static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
6155{
6156 const struct cpumask *span = sched_domain_span(sd);
6157 struct sd_data *sdd = sd->private;
6158 struct sched_domain *sibling;
6159 int i;
6160
6161 for_each_cpu(i, span) {
6162 sibling = *per_cpu_ptr(sdd->sd, i);
6163 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
6164 continue;
6165
6166 cpumask_set_cpu(i, sched_group_mask(sg));
6167 }
6168}
6169
6170/*
6171 * Return the canonical balance cpu for this group, this is the first cpu
6172 * of this group that's also in the iteration mask.
6173 */
6174int group_balance_cpu(struct sched_group *sg)
6175{
6176 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
6177}
6178
5965static int 6179static int
5966build_overlap_sched_groups(struct sched_domain *sd, int cpu) 6180build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5967{ 6181{
@@ -5980,6 +6194,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5980 if (cpumask_test_cpu(i, covered)) 6194 if (cpumask_test_cpu(i, covered))
5981 continue; 6195 continue;
5982 6196
6197 child = *per_cpu_ptr(sdd->sd, i);
6198
6199 /* See the comment near build_group_mask(). */
6200 if (!cpumask_test_cpu(i, sched_domain_span(child)))
6201 continue;
6202
5983 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6203 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
5984 GFP_KERNEL, cpu_to_node(cpu)); 6204 GFP_KERNEL, cpu_to_node(cpu));
5985 6205
@@ -5987,8 +6207,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5987 goto fail; 6207 goto fail;
5988 6208
5989 sg_span = sched_group_cpus(sg); 6209 sg_span = sched_group_cpus(sg);
5990
5991 child = *per_cpu_ptr(sdd->sd, i);
5992 if (child->child) { 6210 if (child->child) {
5993 child = child->child; 6211 child = child->child;
5994 cpumask_copy(sg_span, sched_domain_span(child)); 6212 cpumask_copy(sg_span, sched_domain_span(child));
@@ -5997,10 +6215,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5997 6215
5998 cpumask_or(covered, covered, sg_span); 6216 cpumask_or(covered, covered, sg_span);
5999 6217
6000 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); 6218 sg->sgp = *per_cpu_ptr(sdd->sgp, i);
6001 atomic_inc(&sg->sgp->ref); 6219 if (atomic_inc_return(&sg->sgp->ref) == 1)
6220 build_group_mask(sd, sg);
6002 6221
6003 if (cpumask_test_cpu(cpu, sg_span)) 6222 /*
6223 * Initialize sgp->power such that even if we mess up the
6224 * domains and no possible iteration will get us here, we won't
6225 * die on a /0 trap.
6226 */
6227 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
6228
6229 /*
6230 * Make sure the first group of this domain contains the
6231 * canonical balance cpu. Otherwise the sched_domain iteration
6232 * breaks. See update_sg_lb_stats().
6233 */
6234 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
6235 group_balance_cpu(sg) == cpu)
6004 groups = sg; 6236 groups = sg;
6005 6237
6006 if (!first) 6238 if (!first)
@@ -6074,6 +6306,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
6074 6306
6075 cpumask_clear(sched_group_cpus(sg)); 6307 cpumask_clear(sched_group_cpus(sg));
6076 sg->sgp->power = 0; 6308 sg->sgp->power = 0;
6309 cpumask_setall(sched_group_mask(sg));
6077 6310
6078 for_each_cpu(j, span) { 6311 for_each_cpu(j, span) {
6079 if (get_group(j, sdd, NULL) != group) 6312 if (get_group(j, sdd, NULL) != group)
@@ -6115,7 +6348,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6115 sg = sg->next; 6348 sg = sg->next;
6116 } while (sg != sd->groups); 6349 } while (sg != sd->groups);
6117 6350
6118 if (cpu != group_first_cpu(sg)) 6351 if (cpu != group_balance_cpu(sg))
6119 return; 6352 return;
6120 6353
6121 update_group_power(sd, cpu); 6354 update_group_power(sd, cpu);
@@ -6165,11 +6398,8 @@ int sched_domain_level_max;
6165 6398
6166static int __init setup_relax_domain_level(char *str) 6399static int __init setup_relax_domain_level(char *str)
6167{ 6400{
6168 unsigned long val; 6401 if (kstrtoint(str, 0, &default_relax_domain_level))
6169 6402 pr_warn("Unable to set relax_domain_level\n");
6170 val = simple_strtoul(str, NULL, 0);
6171 if (val < sched_domain_level_max)
6172 default_relax_domain_level = val;
6173 6403
6174 return 1; 6404 return 1;
6175} 6405}
@@ -6279,14 +6509,13 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol
6279#ifdef CONFIG_NUMA 6509#ifdef CONFIG_NUMA
6280 6510
6281static int sched_domains_numa_levels; 6511static int sched_domains_numa_levels;
6282static int sched_domains_numa_scale;
6283static int *sched_domains_numa_distance; 6512static int *sched_domains_numa_distance;
6284static struct cpumask ***sched_domains_numa_masks; 6513static struct cpumask ***sched_domains_numa_masks;
6285static int sched_domains_curr_level; 6514static int sched_domains_curr_level;
6286 6515
6287static inline int sd_local_flags(int level) 6516static inline int sd_local_flags(int level)
6288{ 6517{
6289 if (sched_domains_numa_distance[level] > REMOTE_DISTANCE) 6518 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
6290 return 0; 6519 return 0;
6291 6520
6292 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; 6521 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
@@ -6344,6 +6573,42 @@ static const struct cpumask *sd_numa_mask(int cpu)
6344 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 6573 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6345} 6574}
6346 6575
6576static void sched_numa_warn(const char *str)
6577{
6578 static int done = false;
6579 int i,j;
6580
6581 if (done)
6582 return;
6583
6584 done = true;
6585
6586 printk(KERN_WARNING "ERROR: %s\n\n", str);
6587
6588 for (i = 0; i < nr_node_ids; i++) {
6589 printk(KERN_WARNING " ");
6590 for (j = 0; j < nr_node_ids; j++)
6591 printk(KERN_CONT "%02d ", node_distance(i,j));
6592 printk(KERN_CONT "\n");
6593 }
6594 printk(KERN_WARNING "\n");
6595}
6596
6597static bool find_numa_distance(int distance)
6598{
6599 int i;
6600
6601 if (distance == node_distance(0, 0))
6602 return true;
6603
6604 for (i = 0; i < sched_domains_numa_levels; i++) {
6605 if (sched_domains_numa_distance[i] == distance)
6606 return true;
6607 }
6608
6609 return false;
6610}
6611
6347static void sched_init_numa(void) 6612static void sched_init_numa(void)
6348{ 6613{
6349 int next_distance, curr_distance = node_distance(0, 0); 6614 int next_distance, curr_distance = node_distance(0, 0);
@@ -6351,7 +6616,6 @@ static void sched_init_numa(void)
6351 int level = 0; 6616 int level = 0;
6352 int i, j, k; 6617 int i, j, k;
6353 6618
6354 sched_domains_numa_scale = curr_distance;
6355 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); 6619 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6356 if (!sched_domains_numa_distance) 6620 if (!sched_domains_numa_distance)
6357 return; 6621 return;
@@ -6362,23 +6626,41 @@ static void sched_init_numa(void)
6362 * 6626 *
6363 * Assumes node_distance(0,j) includes all distances in 6627 * Assumes node_distance(0,j) includes all distances in
6364 * node_distance(i,j) in order to avoid cubic time. 6628 * node_distance(i,j) in order to avoid cubic time.
6365 *
6366 * XXX: could be optimized to O(n log n) by using sort()
6367 */ 6629 */
6368 next_distance = curr_distance; 6630 next_distance = curr_distance;
6369 for (i = 0; i < nr_node_ids; i++) { 6631 for (i = 0; i < nr_node_ids; i++) {
6370 for (j = 0; j < nr_node_ids; j++) { 6632 for (j = 0; j < nr_node_ids; j++) {
6371 int distance = node_distance(0, j); 6633 for (k = 0; k < nr_node_ids; k++) {
6372 if (distance > curr_distance && 6634 int distance = node_distance(i, k);
6373 (distance < next_distance || 6635
6374 next_distance == curr_distance)) 6636 if (distance > curr_distance &&
6375 next_distance = distance; 6637 (distance < next_distance ||
6638 next_distance == curr_distance))
6639 next_distance = distance;
6640
6641 /*
6642 * While not a strong assumption it would be nice to know
6643 * about cases where if node A is connected to B, B is not
6644 * equally connected to A.
6645 */
6646 if (sched_debug() && node_distance(k, i) != distance)
6647 sched_numa_warn("Node-distance not symmetric");
6648
6649 if (sched_debug() && i && !find_numa_distance(distance))
6650 sched_numa_warn("Node-0 not representative");
6651 }
6652 if (next_distance != curr_distance) {
6653 sched_domains_numa_distance[level++] = next_distance;
6654 sched_domains_numa_levels = level;
6655 curr_distance = next_distance;
6656 } else break;
6376 } 6657 }
6377 if (next_distance != curr_distance) { 6658
6378 sched_domains_numa_distance[level++] = next_distance; 6659 /*
6379 sched_domains_numa_levels = level; 6660 * In case of sched_debug() we verify the above assumption.
6380 curr_distance = next_distance; 6661 */
6381 } else break; 6662 if (!sched_debug())
6663 break;
6382 } 6664 }
6383 /* 6665 /*
6384 * 'level' contains the number of unique distances, excluding the 6666 * 'level' contains the number of unique distances, excluding the
@@ -6403,7 +6685,7 @@ static void sched_init_numa(void)
6403 return; 6685 return;
6404 6686
6405 for (j = 0; j < nr_node_ids; j++) { 6687 for (j = 0; j < nr_node_ids; j++) {
6406 struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j); 6688 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6407 if (!mask) 6689 if (!mask)
6408 return; 6690 return;
6409 6691
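The nested loops in sched_init_numa() above boil down to: collect the distinct node_distance() values in increasing order, each one becoming a NUMA scheduling level. Stripped of the debug checks, and with a hypothetical 4-node distance table standing in for the firmware-provided values:

#include <stdio.h>

#define NR_NODES 4

/* hypothetical symmetric distance table; the kernel asks node_distance() */
static const int dist[NR_NODES][NR_NODES] = {
        { 10, 20, 20, 30 },
        { 20, 10, 30, 20 },
        { 20, 30, 10, 20 },
        { 30, 20, 20, 10 },
};

int main(void)
{
        int levels[NR_NODES * NR_NODES];
        int nr_levels = 0;
        int curr = dist[0][0];          /* the local distance */

        for (;;) {
                int next = curr;
                int i, j;

                /* smallest distance strictly greater than 'curr' */
                for (i = 0; i < NR_NODES; i++) {
                        for (j = 0; j < NR_NODES; j++) {
                                if (dist[i][j] > curr &&
                                    (dist[i][j] < next || next == curr))
                                        next = dist[i][j];
                        }
                }
                if (next == curr)       /* nothing larger left */
                        break;

                levels[nr_levels++] = next;
                curr = next;
        }

        /* prints "20 30": two NUMA levels beyond the local distance */
        for (int i = 0; i < nr_levels; i++)
                printf("%d ", levels[i]);
        printf("\n");
        return 0;
}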
@@ -6490,7 +6772,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6490 6772
6491 *per_cpu_ptr(sdd->sg, j) = sg; 6773 *per_cpu_ptr(sdd->sg, j) = sg;
6492 6774
6493 sgp = kzalloc_node(sizeof(struct sched_group_power), 6775 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
6494 GFP_KERNEL, cpu_to_node(j)); 6776 GFP_KERNEL, cpu_to_node(j));
6495 if (!sgp) 6777 if (!sgp)
6496 return -ENOMEM; 6778 return -ENOMEM;
@@ -6543,7 +6825,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6543 if (!sd) 6825 if (!sd)
6544 return child; 6826 return child;
6545 6827
6546 set_domain_attribute(sd, attr);
6547 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 6828 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6548 if (child) { 6829 if (child) {
6549 sd->level = child->level + 1; 6830 sd->level = child->level + 1;
@@ -6551,6 +6832,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6551 child->parent = sd; 6832 child->parent = sd;
6552 } 6833 }
6553 sd->child = child; 6834 sd->child = child;
6835 set_domain_attribute(sd, attr);
6554 6836
6555 return sd; 6837 return sd;
6556} 6838}
@@ -6691,7 +6973,6 @@ static int init_sched_domains(const struct cpumask *cpu_map)
6691 if (!doms_cur) 6973 if (!doms_cur)
6692 doms_cur = &fallback_doms; 6974 doms_cur = &fallback_doms;
6693 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 6975 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6694 dattr_cur = NULL;
6695 err = build_sched_domains(doms_cur[0], NULL); 6976 err = build_sched_domains(doms_cur[0], NULL);
6696 register_sched_domain_sysctl(); 6977 register_sched_domain_sysctl();
6697 6978
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 940e6d17cf96..c099cc6eebe3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2703 int want_sd = 1; 2703 int want_sd = 1;
2704 int sync = wake_flags & WF_SYNC; 2704 int sync = wake_flags & WF_SYNC;
2705 2705
2706 if (p->rt.nr_cpus_allowed == 1) 2706 if (p->nr_cpus_allowed == 1)
2707 return prev_cpu; 2707 return prev_cpu;
2708 2708
2709 if (sd_flag & SD_BALANCE_WAKE) { 2709 if (sd_flag & SD_BALANCE_WAKE) {
@@ -3503,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3503unsigned long scale_rt_power(int cpu) 3503unsigned long scale_rt_power(int cpu)
3504{ 3504{
3505 struct rq *rq = cpu_rq(cpu); 3505 struct rq *rq = cpu_rq(cpu);
3506 u64 total, available; 3506 u64 total, available, age_stamp, avg;
3507 3507
3508 total = sched_avg_period() + (rq->clock - rq->age_stamp); 3508 /*
3509 * Since we're reading these variables without serialization make sure
3510 * we read them once before doing sanity checks on them.
3511 */
3512 age_stamp = ACCESS_ONCE(rq->age_stamp);
3513 avg = ACCESS_ONCE(rq->rt_avg);
3514
3515 total = sched_avg_period() + (rq->clock - age_stamp);
3509 3516
3510 if (unlikely(total < rq->rt_avg)) { 3517 if (unlikely(total < avg)) {
3511 /* Ensures that power won't end up being negative */ 3518 /* Ensures that power won't end up being negative */
3512 available = 0; 3519 available = 0;
3513 } else { 3520 } else {
3514 available = total - rq->rt_avg; 3521 available = total - avg;
3515 } 3522 }
3516 3523
3517 if (unlikely((s64)total < SCHED_POWER_SCALE)) 3524 if (unlikely((s64)total < SCHED_POWER_SCALE))
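The ACCESS_ONCE() snapshots added to scale_rt_power() exist so that the range check and the subtraction see the same value; re-reading rq->rt_avg in between would let the check pass and the subtraction underflow anyway. In miniature (macro sketched with volatile purely for illustration):

#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static unsigned long long rt_avg;       /* concurrently updated elsewhere */

static unsigned long long rt_free(unsigned long long total)
{
        unsigned long long avg = ACCESS_ONCE(rt_avg);   /* one snapshot ... */

        if (total < avg)                /* ... used for the check ... */
                return 0;

        return total - avg;             /* ... and for the arithmetic */
}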
@@ -3574,13 +3581,28 @@ void update_group_power(struct sched_domain *sd, int cpu)
3574 3581
3575 power = 0; 3582 power = 0;
3576 3583
3577 group = child->groups; 3584 if (child->flags & SD_OVERLAP) {
3578 do { 3585 /*
3579 power += group->sgp->power; 3586 * SD_OVERLAP domains cannot assume that child groups
3580 group = group->next; 3587 * span the current group.
3581 } while (group != child->groups); 3588 */
3582 3589
3583 sdg->sgp->power = power; 3590 for_each_cpu(cpu, sched_group_cpus(sdg))
3591 power += power_of(cpu);
3592 } else {
3593 /*
3594 * !SD_OVERLAP domains can assume that child groups
3595 * span the current group.
3596 */
3597
3598 group = child->groups;
3599 do {
3600 power += group->sgp->power;
3601 group = group->next;
3602 } while (group != child->groups);
3603 }
3604
3605 sdg->sgp->power_orig = sdg->sgp->power = power;
3584} 3606}
3585 3607
3586/* 3608/*
@@ -3610,7 +3632,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
3610 3632
3611/** 3633/**
3612 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3634 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3613 * @sd: The sched_domain whose statistics are to be updated. 3635 * @env: The load balancing environment.
3614 * @group: sched_group whose statistics are to be updated. 3636 * @group: sched_group whose statistics are to be updated.
3615 * @load_idx: Load index of sched_domain of this_cpu for load calc. 3637 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3616 * @local_group: Does group contain this_cpu. 3638 * @local_group: Does group contain this_cpu.
@@ -3630,7 +3652,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
3630 int i; 3652 int i;
3631 3653
3632 if (local_group) 3654 if (local_group)
3633 balance_cpu = group_first_cpu(group); 3655 balance_cpu = group_balance_cpu(group);
3634 3656
3635 /* Tally up the load of all CPUs in the group */ 3657 /* Tally up the load of all CPUs in the group */
3636 max_cpu_load = 0; 3658 max_cpu_load = 0;
@@ -3645,7 +3667,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
3645 3667
3646 /* Bias balancing toward cpus of our domain */ 3668 /* Bias balancing toward cpus of our domain */
3647 if (local_group) { 3669 if (local_group) {
3648 if (idle_cpu(i) && !first_idle_cpu) { 3670 if (idle_cpu(i) && !first_idle_cpu &&
3671 cpumask_test_cpu(i, sched_group_mask(group))) {
3649 first_idle_cpu = 1; 3672 first_idle_cpu = 1;
3650 balance_cpu = i; 3673 balance_cpu = i;
3651 } 3674 }
@@ -3719,11 +3742,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
3719 3742
3720/** 3743/**
3721 * update_sd_pick_busiest - return 1 on busiest group 3744 * update_sd_pick_busiest - return 1 on busiest group
3722 * @sd: sched_domain whose statistics are to be checked 3745 * @env: The load balancing environment.
3723 * @sds: sched_domain statistics 3746 * @sds: sched_domain statistics
3724 * @sg: sched_group candidate to be checked for being the busiest 3747 * @sg: sched_group candidate to be checked for being the busiest
3725 * @sgs: sched_group statistics 3748 * @sgs: sched_group statistics
3726 * @this_cpu: the current cpu
3727 * 3749 *
3728 * Determine if @sg is a busier group than the previously selected 3750 * Determine if @sg is a busier group than the previously selected
3729 * busiest group. 3751 * busiest group.
@@ -3761,9 +3783,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
3761 3783
3762/** 3784/**
3763 * update_sd_lb_stats - Update sched_domain's statistics for load balancing. 3785 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
3764 * @sd: sched_domain whose statistics are to be updated. 3786 * @env: The load balancing environment.
3765 * @this_cpu: Cpu for which load balance is currently performed.
3766 * @idle: Idle status of this_cpu
3767 * @cpus: Set of cpus considered for load balancing. 3787 * @cpus: Set of cpus considered for load balancing.
3768 * @balance: Should we balance. 3788 * @balance: Should we balance.
3769 * @sds: variable to hold the statistics for this sched_domain. 3789 * @sds: variable to hold the statistics for this sched_domain.
@@ -3852,10 +3872,8 @@ static inline void update_sd_lb_stats(struct lb_env *env,
3852 * Returns 1 when packing is required and a task should be moved to 3872 * Returns 1 when packing is required and a task should be moved to
3853 * this CPU. The amount of the imbalance is returned in *imbalance. 3873 * this CPU. The amount of the imbalance is returned in *imbalance.
3854 * 3874 *
3855 * @sd: The sched_domain whose packing is to be checked. 3875 * @env: The load balancing environment.
3856 * @sds: Statistics of the sched_domain which is to be packed 3876 * @sds: Statistics of the sched_domain which is to be packed
3857 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3858 * @imbalance: returns amount of imbalanced due to packing.
3859 */ 3877 */
3860static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) 3878static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
3861{ 3879{
@@ -3881,9 +3899,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
3881 * fix_small_imbalance - Calculate the minor imbalance that exists 3899 * fix_small_imbalance - Calculate the minor imbalance that exists
3882 * amongst the groups of a sched_domain, during 3900 * amongst the groups of a sched_domain, during
3883 * load balancing. 3901 * load balancing.
3902 * @env: The load balancing environment.
3884 * @sds: Statistics of the sched_domain whose imbalance is to be calculated. 3903 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3885 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3886 * @imbalance: Variable to store the imbalance.
3887 */ 3904 */
3888static inline 3905static inline
3889void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) 3906void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
@@ -4026,11 +4043,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4026 * Also calculates the amount of weighted load which should be moved 4043 * Also calculates the amount of weighted load which should be moved
4027 * to restore balance. 4044 * to restore balance.
4028 * 4045 *
4029 * @sd: The sched_domain whose busiest group is to be returned. 4046 * @env: The load balancing environment.
4030 * @this_cpu: The cpu for which load balancing is currently being performed.
4031 * @imbalance: Variable which stores amount of weighted load which should
4032 * be moved to restore balance/put a group to idle.
4033 * @idle: The idle status of this_cpu.
4034 * @cpus: The set of CPUs under consideration for load-balancing. 4047 * @cpus: The set of CPUs under consideration for load-balancing.
4035 * @balance: Pointer to a variable indicating if this_cpu 4048 * @balance: Pointer to a variable indicating if this_cpu
4036 * is the appropriate cpu to perform load balancing at this_level. 4049 * is the appropriate cpu to perform load balancing at this_level.
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b44d604b35d1..b6baf370cae9 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
25static struct task_struct *pick_next_task_idle(struct rq *rq) 25static struct task_struct *pick_next_task_idle(struct rq *rq)
26{ 26{
27 schedstat_inc(rq, sched_goidle); 27 schedstat_inc(rq, sched_goidle);
28 calc_load_account_idle(rq);
29 return rq->idle; 28 return rq->idle;
30} 29}
31 30
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c5565c3c515f..573e1ca01102 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq)
274 274
275static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 275static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
276{ 276{
277 struct task_struct *p;
278
277 if (!rt_entity_is_task(rt_se)) 279 if (!rt_entity_is_task(rt_se))
278 return; 280 return;
279 281
282 p = rt_task_of(rt_se);
280 rt_rq = &rq_of_rt_rq(rt_rq)->rt; 283 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
281 284
282 rt_rq->rt_nr_total++; 285 rt_rq->rt_nr_total++;
283 if (rt_se->nr_cpus_allowed > 1) 286 if (p->nr_cpus_allowed > 1)
284 rt_rq->rt_nr_migratory++; 287 rt_rq->rt_nr_migratory++;
285 288
286 update_rt_migration(rt_rq); 289 update_rt_migration(rt_rq);
@@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
288 291
289static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 292static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
290{ 293{
294 struct task_struct *p;
295
291 if (!rt_entity_is_task(rt_se)) 296 if (!rt_entity_is_task(rt_se))
292 return; 297 return;
293 298
299 p = rt_task_of(rt_se);
294 rt_rq = &rq_of_rt_rq(rt_rq)->rt; 300 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
295 301
296 rt_rq->rt_nr_total--; 302 rt_rq->rt_nr_total--;
297 if (rt_se->nr_cpus_allowed > 1) 303 if (p->nr_cpus_allowed > 1)
298 rt_rq->rt_nr_migratory--; 304 rt_rq->rt_nr_migratory--;
299 305
300 update_rt_migration(rt_rq); 306 update_rt_migration(rt_rq);
@@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1161 1167
1162 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); 1168 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
1163 1169
1164 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 1170 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1165 enqueue_pushable_task(rq, p); 1171 enqueue_pushable_task(rq, p);
1166 1172
1167 inc_nr_running(rq); 1173 inc_nr_running(rq);
@@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1225 1231
1226 cpu = task_cpu(p); 1232 cpu = task_cpu(p);
1227 1233
1228 if (p->rt.nr_cpus_allowed == 1) 1234 if (p->nr_cpus_allowed == 1)
1229 goto out; 1235 goto out;
1230 1236
1231 /* For anything but wake ups, just return the task_cpu */ 1237 /* For anything but wake ups, just return the task_cpu */
@@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1260 * will have to sort it out. 1266 * will have to sort it out.
1261 */ 1267 */
1262 if (curr && unlikely(rt_task(curr)) && 1268 if (curr && unlikely(rt_task(curr)) &&
1263 (curr->rt.nr_cpus_allowed < 2 || 1269 (curr->nr_cpus_allowed < 2 ||
1264 curr->prio <= p->prio) && 1270 curr->prio <= p->prio) &&
1265 (p->rt.nr_cpus_allowed > 1)) { 1271 (p->nr_cpus_allowed > 1)) {
1266 int target = find_lowest_rq(p); 1272 int target = find_lowest_rq(p);
1267 1273
1268 if (target != -1) 1274 if (target != -1)
@@ -1276,10 +1282,10 @@ out:
1276 1282
1277static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1283static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1278{ 1284{
1279 if (rq->curr->rt.nr_cpus_allowed == 1) 1285 if (rq->curr->nr_cpus_allowed == 1)
1280 return; 1286 return;
1281 1287
1282 if (p->rt.nr_cpus_allowed != 1 1288 if (p->nr_cpus_allowed != 1
1283 && cpupri_find(&rq->rd->cpupri, p, NULL)) 1289 && cpupri_find(&rq->rd->cpupri, p, NULL))
1284 return; 1290 return;
1285 1291
@@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1395 * The previous task needs to be made eligible for pushing 1401 * The previous task needs to be made eligible for pushing
1396 * if it is still active 1402 * if it is still active
1397 */ 1403 */
1398 if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) 1404 if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1399 enqueue_pushable_task(rq, p); 1405 enqueue_pushable_task(rq, p);
1400} 1406}
1401 1407
@@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1408{ 1414{
1409 if (!task_running(rq, p) && 1415 if (!task_running(rq, p) &&
1410 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && 1416 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
1411 (p->rt.nr_cpus_allowed > 1)) 1417 (p->nr_cpus_allowed > 1))
1412 return 1; 1418 return 1;
1413 return 0; 1419 return 0;
1414} 1420}
@@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task)
1464 if (unlikely(!lowest_mask)) 1470 if (unlikely(!lowest_mask))
1465 return -1; 1471 return -1;
1466 1472
1467 if (task->rt.nr_cpus_allowed == 1) 1473 if (task->nr_cpus_allowed == 1)
1468 return -1; /* No other targets possible */ 1474 return -1; /* No other targets possible */
1469 1475
1470 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) 1476 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
@@ -1556,7 +1562,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1556 task_running(rq, task) || 1562 task_running(rq, task) ||
1557 !task->on_rq)) { 1563 !task->on_rq)) {
1558 1564
1559 raw_spin_unlock(&lowest_rq->lock); 1565 double_unlock_balance(rq, lowest_rq);
1560 lowest_rq = NULL; 1566 lowest_rq = NULL;
1561 break; 1567 break;
1562 } 1568 }
@@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
1586 1592
1587 BUG_ON(rq->cpu != task_cpu(p)); 1593 BUG_ON(rq->cpu != task_cpu(p));
1588 BUG_ON(task_current(rq, p)); 1594 BUG_ON(task_current(rq, p));
1589 BUG_ON(p->rt.nr_cpus_allowed <= 1); 1595 BUG_ON(p->nr_cpus_allowed <= 1);
1590 1596
1591 BUG_ON(!p->on_rq); 1597 BUG_ON(!p->on_rq);
1592 BUG_ON(!rt_task(p)); 1598 BUG_ON(!rt_task(p));
@@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1793 if (!task_running(rq, p) && 1799 if (!task_running(rq, p) &&
1794 !test_tsk_need_resched(rq->curr) && 1800 !test_tsk_need_resched(rq->curr) &&
1795 has_pushable_tasks(rq) && 1801 has_pushable_tasks(rq) &&
1796 p->rt.nr_cpus_allowed > 1 && 1802 p->nr_cpus_allowed > 1 &&
1797 rt_task(rq->curr) && 1803 rt_task(rq->curr) &&
1798 (rq->curr->rt.nr_cpus_allowed < 2 || 1804 (rq->curr->nr_cpus_allowed < 2 ||
1799 rq->curr->prio <= p->prio)) 1805 rq->curr->prio <= p->prio))
1800 push_rt_tasks(rq); 1806 push_rt_tasks(rq);
1801} 1807}
@@ -1817,7 +1823,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1817 * Only update if the process changes its state from whether it 1823 * Only update if the process changes its state from whether it
1818 * can migrate or not. 1824 * can migrate or not.
1819 */ 1825 */
1820 if ((p->rt.nr_cpus_allowed > 1) == (weight > 1)) 1826 if ((p->nr_cpus_allowed > 1) == (weight > 1))
1821 return; 1827 return;
1822 1828
1823 rq = task_rq(p); 1829 rq = task_rq(p);
@@ -1979,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1979 1985
1980static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) 1986static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1981{ 1987{
1988 struct sched_rt_entity *rt_se = &p->rt;
1989
1982 update_curr_rt(rq); 1990 update_curr_rt(rq);
1983 1991
1984 watchdog(rq, p); 1992 watchdog(rq, p);
@@ -1996,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1996 p->rt.time_slice = RR_TIMESLICE; 2004 p->rt.time_slice = RR_TIMESLICE;
1997 2005
1998 /* 2006 /*
1999 * Requeue to the end of queue if we are not the only element 2007 * Requeue to the end of queue if we (and all of our ancestors) are the
2000 * on the queue: 2008 * only element on the queue
2001 */ 2009 */
2002 if (p->rt.run_list.prev != p->rt.run_list.next) { 2010 for_each_sched_rt_entity(rt_se) {
2003 requeue_task_rt(rq, p, 0); 2011 if (rt_se->run_list.prev != rt_se->run_list.next) {
2004 set_tsk_need_resched(p); 2012 requeue_task_rt(rq, p, 0);
2013 set_tsk_need_resched(p);
2014 return;
2015 }
2005 } 2016 }
2006} 2017}
2007 2018
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ba9dccfd24ce..55844f24435a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
526DECLARE_PER_CPU(struct sched_domain *, sd_llc); 526DECLARE_PER_CPU(struct sched_domain *, sd_llc);
527DECLARE_PER_CPU(int, sd_llc_id); 527DECLARE_PER_CPU(int, sd_llc_id);
528 528
529extern int group_balance_cpu(struct sched_group *sg);
530
529#endif /* CONFIG_SMP */ 531#endif /* CONFIG_SMP */
530 532
531#include "stats.h" 533#include "stats.h"
@@ -940,8 +942,6 @@ static inline u64 sched_avg_period(void)
940 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; 942 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
941} 943}
942 944
943void calc_load_account_idle(struct rq *this_rq);
944
945#ifdef CONFIG_SCHED_HRTICK 945#ifdef CONFIG_SCHED_HRTICK
946 946
947/* 947/*