author    Frederic Weisbecker <fweisbec@gmail.com>   2013-05-02 11:37:49 -0400
committer Frederic Weisbecker <fweisbec@gmail.com>   2013-05-02 11:54:19 -0400
commit    c032862fba51a3ca504752d3a25186b324c5ce83 (patch)
tree      955dc2ba4ab3df76ecc2bb780ee84aca04967e8d /kernel/sched
parent    fda76e074c7737fc57855dd17c762e50ed526052 (diff)
parent    8700c95adb033843fc163d112b9d21d4fda78018 (diff)
Merge commit '8700c95adb03' into timers/nohz
The full dynticks tree needs the latest RCU and sched
upstream updates in order to fix some dependencies.
Merge a common upstream merge point that has these
updates.
Conflicts:
include/linux/perf_event.h
kernel/rcutree.h
kernel/rcutree_plugin.h
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/Makefile    |   1
-rw-r--r--  kernel/sched/clock.c     |  26
-rw-r--r--  kernel/sched/core.c      | 299
-rw-r--r--  kernel/sched/cpuacct.c   | 296
-rw-r--r--  kernel/sched/cpuacct.h   |  17
-rw-r--r--  kernel/sched/cputime.c   |  20
-rw-r--r--  kernel/sched/fair.c      | 131
-rw-r--r--  kernel/sched/features.h  |   7
-rw-r--r--  kernel/sched/idle_task.c |  16
-rw-r--r--  kernel/sched/sched.h     |  61
10 files changed, 464 insertions, 410 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index f06d249e103b..deaf90e4a1de 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_SMP) += cpupri.o | |||
16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
17 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 17 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o | 18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o |
19 | obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o | ||
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c685e31492df..c3ae1446461c 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd) | |||
176 | u64 this_clock, remote_clock; | 176 | u64 this_clock, remote_clock; |
177 | u64 *ptr, old_val, val; | 177 | u64 *ptr, old_val, val; |
178 | 178 | ||
179 | #if BITS_PER_LONG != 64 | ||
180 | again: | ||
181 | /* | ||
182 | * Careful here: The local and the remote clock values need to | ||
183 | * be read out atomic as we need to compare the values and | ||
184 | * then update either the local or the remote side. So the | ||
185 | * cmpxchg64 below only protects one readout. | ||
186 | * | ||
187 | * We must reread via sched_clock_local() in the retry case on | ||
188 | * 32bit as an NMI could use sched_clock_local() via the | ||
189 | * tracer and hit between the readout of | ||
190 | * the low32bit and the high 32bit portion. | ||
191 | */ | ||
192 | this_clock = sched_clock_local(my_scd); | ||
193 | /* | ||
194 | * We must enforce atomic readout on 32bit, otherwise the | ||
195 | * update on the remote cpu can hit inbetween the readout of | ||
196 | * the low32bit and the high 32bit portion. | ||
197 | */ | ||
198 | remote_clock = cmpxchg64(&scd->clock, 0, 0); | ||
199 | #else | ||
200 | /* | ||
201 | * On 64bit the read of [my]scd->clock is atomic versus the | ||
202 | * update, so we can avoid the above 32bit dance. | ||
203 | */ | ||
179 | sched_clock_local(my_scd); | 204 | sched_clock_local(my_scd); |
180 | again: | 205 | again: |
181 | this_clock = my_scd->clock; | 206 | this_clock = my_scd->clock; |
182 | remote_clock = scd->clock; | 207 | remote_clock = scd->clock; |
208 | #endif | ||
183 | 209 | ||
184 | /* | 210 | /* |
185 | * Use the opportunity that we have both locks | 211 | * Use the opportunity that we have both locks |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dd09def88567..e94842d4400c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -512,11 +512,6 @@ static inline void init_hrtick(void) | |||
512 | * the target CPU. | 512 | * the target CPU. |
513 | */ | 513 | */ |
514 | #ifdef CONFIG_SMP | 514 | #ifdef CONFIG_SMP |
515 | |||
516 | #ifndef tsk_is_polling | ||
517 | #define tsk_is_polling(t) 0 | ||
518 | #endif | ||
519 | |||
520 | void resched_task(struct task_struct *p) | 515 | void resched_task(struct task_struct *p) |
521 | { | 516 | { |
522 | int cpu; | 517 | int cpu; |
@@ -1536,8 +1531,10 @@ static void try_to_wake_up_local(struct task_struct *p) | |||
1536 | { | 1531 | { |
1537 | struct rq *rq = task_rq(p); | 1532 | struct rq *rq = task_rq(p); |
1538 | 1533 | ||
1539 | BUG_ON(rq != this_rq()); | 1534 | if (WARN_ON_ONCE(rq != this_rq()) || |
1540 | BUG_ON(p == current); | 1535 | WARN_ON_ONCE(p == current)) |
1536 | return; | ||
1537 | |||
1541 | lockdep_assert_held(&rq->lock); | 1538 | lockdep_assert_held(&rq->lock); |
1542 | 1539 | ||
1543 | if (!raw_spin_trylock(&p->pi_lock)) { | 1540 | if (!raw_spin_trylock(&p->pi_lock)) { |
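The BUG_ON() to WARN_ON_ONCE() conversion above works because WARN_ON_ONCE() evaluates to the tested condition, so it can gate an early return instead of panicking the machine. A rough user-space stand-in for that pattern (the macro below is a simplification for illustration, not the kernel's implementation):

#include <stdio.h>
#include <stdbool.h>

/* crude stand-in for the kernel's WARN_ON_ONCE(): warn once, return cond */
#define WARN_ON_ONCE(cond) ({						\
	static bool __warned;						\
	bool __c = (cond);						\
	if (__c && !__warned) {						\
		__warned = true;					\
		fprintf(stderr, "warning: %s\n", #cond);		\
	}								\
	__c;								\
})

static void wake_local(int rq_cpu, int this_cpu)
{
	/* refuse to continue on an unexpected runqueue, but keep running */
	if (WARN_ON_ONCE(rq_cpu != this_cpu))
		return;
	printf("waking task on cpu %d\n", this_cpu);
}

int main(void)
{
	wake_local(0, 0);	/* normal path */
	wake_local(1, 0);	/* warns once and bails out */
	wake_local(1, 0);	/* silent, still bails out */
	return 0;
}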
@@ -3037,51 +3034,6 @@ void __sched schedule_preempt_disabled(void) | |||
3037 | preempt_disable(); | 3034 | preempt_disable(); |
3038 | } | 3035 | } |
3039 | 3036 | ||
3040 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | ||
3041 | |||
3042 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) | ||
3043 | { | ||
3044 | if (lock->owner != owner) | ||
3045 | return false; | ||
3046 | |||
3047 | /* | ||
3048 | * Ensure we emit the owner->on_cpu, dereference _after_ checking | ||
3049 | * lock->owner still matches owner, if that fails, owner might | ||
3050 | * point to free()d memory, if it still matches, the rcu_read_lock() | ||
3051 | * ensures the memory stays valid. | ||
3052 | */ | ||
3053 | barrier(); | ||
3054 | |||
3055 | return owner->on_cpu; | ||
3056 | } | ||
3057 | |||
3058 | /* | ||
3059 | * Look out! "owner" is an entirely speculative pointer | ||
3060 | * access and not reliable. | ||
3061 | */ | ||
3062 | int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) | ||
3063 | { | ||
3064 | if (!sched_feat(OWNER_SPIN)) | ||
3065 | return 0; | ||
3066 | |||
3067 | rcu_read_lock(); | ||
3068 | while (owner_running(lock, owner)) { | ||
3069 | if (need_resched()) | ||
3070 | break; | ||
3071 | |||
3072 | arch_mutex_cpu_relax(); | ||
3073 | } | ||
3074 | rcu_read_unlock(); | ||
3075 | |||
3076 | /* | ||
3077 | * We break out the loop above on need_resched() and when the | ||
3078 | * owner changed, which is a sign for heavy contention. Return | ||
3079 | * success only when lock->owner is NULL. | ||
3080 | */ | ||
3081 | return lock->owner == NULL; | ||
3082 | } | ||
3083 | #endif | ||
3084 | |||
3085 | #ifdef CONFIG_PREEMPT | 3037 | #ifdef CONFIG_PREEMPT |
3086 | /* | 3038 | /* |
3087 | * this is the entry point to schedule() from in-kernel preemption | 3039 | * this is the entry point to schedule() from in-kernel preemption |
@@ -4170,6 +4122,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
4170 | get_task_struct(p); | 4122 | get_task_struct(p); |
4171 | rcu_read_unlock(); | 4123 | rcu_read_unlock(); |
4172 | 4124 | ||
4125 | if (p->flags & PF_NO_SETAFFINITY) { | ||
4126 | retval = -EINVAL; | ||
4127 | goto out_put_task; | ||
4128 | } | ||
4173 | if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { | 4129 | if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { |
4174 | retval = -ENOMEM; | 4130 | retval = -ENOMEM; |
4175 | goto out_put_task; | 4131 | goto out_put_task; |
@@ -4817,11 +4773,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
4817 | goto out; | 4773 | goto out; |
4818 | } | 4774 | } |
4819 | 4775 | ||
4820 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { | ||
4821 | ret = -EINVAL; | ||
4822 | goto out; | ||
4823 | } | ||
4824 | |||
4825 | do_set_cpus_allowed(p, new_mask); | 4776 | do_set_cpus_allowed(p, new_mask); |
4826 | 4777 | ||
4827 | /* Can the task run on the task's current CPU? If so, we're done */ | 4778 | /* Can the task run on the task's current CPU? If so, we're done */ |
@@ -5043,7 +4994,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) | |||
5043 | } | 4994 | } |
5044 | 4995 | ||
5045 | static int min_load_idx = 0; | 4996 | static int min_load_idx = 0; |
5046 | static int max_load_idx = CPU_LOAD_IDX_MAX; | 4997 | static int max_load_idx = CPU_LOAD_IDX_MAX-1; |
5047 | 4998 | ||
5048 | static void | 4999 | static void |
5049 | set_table_entry(struct ctl_table *entry, | 5000 | set_table_entry(struct ctl_table *entry, |
@@ -6292,7 +6243,7 @@ static void sched_init_numa(void) | |||
6292 | * 'level' contains the number of unique distances, excluding the | 6243 | * 'level' contains the number of unique distances, excluding the |
6293 | * identity distance node_distance(i,i). | 6244 | * identity distance node_distance(i,i). |
6294 | * | 6245 | * |
6295 | * The sched_domains_nume_distance[] array includes the actual distance | 6246 | * The sched_domains_numa_distance[] array includes the actual distance |
6296 | * numbers. | 6247 | * numbers. |
6297 | */ | 6248 | */ |
6298 | 6249 | ||
@@ -6913,7 +6864,7 @@ struct task_group root_task_group; | |||
6913 | LIST_HEAD(task_groups); | 6864 | LIST_HEAD(task_groups); |
6914 | #endif | 6865 | #endif |
6915 | 6866 | ||
6916 | DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 6867 | DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); |
6917 | 6868 | ||
6918 | void __init sched_init(void) | 6869 | void __init sched_init(void) |
6919 | { | 6870 | { |
@@ -6950,7 +6901,7 @@ void __init sched_init(void) | |||
6950 | #endif /* CONFIG_RT_GROUP_SCHED */ | 6901 | #endif /* CONFIG_RT_GROUP_SCHED */ |
6951 | #ifdef CONFIG_CPUMASK_OFFSTACK | 6902 | #ifdef CONFIG_CPUMASK_OFFSTACK |
6952 | for_each_possible_cpu(i) { | 6903 | for_each_possible_cpu(i) { |
6953 | per_cpu(load_balance_tmpmask, i) = (void *)ptr; | 6904 | per_cpu(load_balance_mask, i) = (void *)ptr; |
6954 | ptr += cpumask_size(); | 6905 | ptr += cpumask_size(); |
6955 | } | 6906 | } |
6956 | #endif /* CONFIG_CPUMASK_OFFSTACK */ | 6907 | #endif /* CONFIG_CPUMASK_OFFSTACK */ |
@@ -6976,12 +6927,6 @@ void __init sched_init(void) | |||
6976 | 6927 | ||
6977 | #endif /* CONFIG_CGROUP_SCHED */ | 6928 | #endif /* CONFIG_CGROUP_SCHED */ |
6978 | 6929 | ||
6979 | #ifdef CONFIG_CGROUP_CPUACCT | ||
6980 | root_cpuacct.cpustat = &kernel_cpustat; | ||
6981 | root_cpuacct.cpuusage = alloc_percpu(u64); | ||
6982 | /* Too early, not expected to fail */ | ||
6983 | BUG_ON(!root_cpuacct.cpuusage); | ||
6984 | #endif | ||
6985 | for_each_possible_cpu(i) { | 6930 | for_each_possible_cpu(i) { |
6986 | struct rq *rq; | 6931 | struct rq *rq; |
6987 | 6932 | ||
@@ -8083,226 +8028,6 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
8083 | 8028 | ||
8084 | #endif /* CONFIG_CGROUP_SCHED */ | 8029 | #endif /* CONFIG_CGROUP_SCHED */ |
8085 | 8030 | ||
8086 | #ifdef CONFIG_CGROUP_CPUACCT | ||
8087 | |||
8088 | /* | ||
8089 | * CPU accounting code for task groups. | ||
8090 | * | ||
8091 | * Based on the work by Paul Menage (menage@google.com) and Balbir Singh | ||
8092 | * (balbir@in.ibm.com). | ||
8093 | */ | ||
8094 | |||
8095 | struct cpuacct root_cpuacct; | ||
8096 | |||
8097 | /* create a new cpu accounting group */ | ||
8098 | static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) | ||
8099 | { | ||
8100 | struct cpuacct *ca; | ||
8101 | |||
8102 | if (!cgrp->parent) | ||
8103 | return &root_cpuacct.css; | ||
8104 | |||
8105 | ca = kzalloc(sizeof(*ca), GFP_KERNEL); | ||
8106 | if (!ca) | ||
8107 | goto out; | ||
8108 | |||
8109 | ca->cpuusage = alloc_percpu(u64); | ||
8110 | if (!ca->cpuusage) | ||
8111 | goto out_free_ca; | ||
8112 | |||
8113 | ca->cpustat = alloc_percpu(struct kernel_cpustat); | ||
8114 | if (!ca->cpustat) | ||
8115 | goto out_free_cpuusage; | ||
8116 | |||
8117 | return &ca->css; | ||
8118 | |||
8119 | out_free_cpuusage: | ||
8120 | free_percpu(ca->cpuusage); | ||
8121 | out_free_ca: | ||
8122 | kfree(ca); | ||
8123 | out: | ||
8124 | return ERR_PTR(-ENOMEM); | ||
8125 | } | ||
8126 | |||
8127 | /* destroy an existing cpu accounting group */ | ||
8128 | static void cpuacct_css_free(struct cgroup *cgrp) | ||
8129 | { | ||
8130 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
8131 | |||
8132 | free_percpu(ca->cpustat); | ||
8133 | free_percpu(ca->cpuusage); | ||
8134 | kfree(ca); | ||
8135 | } | ||
8136 | |||
8137 | static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) | ||
8138 | { | ||
8139 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
8140 | u64 data; | ||
8141 | |||
8142 | #ifndef CONFIG_64BIT | ||
8143 | /* | ||
8144 | * Take rq->lock to make 64-bit read safe on 32-bit platforms. | ||
8145 | */ | ||
8146 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | ||
8147 | data = *cpuusage; | ||
8148 | raw_spin_unlock_irq(&cpu_rq(cpu)->lock); | ||
8149 | #else | ||
8150 | data = *cpuusage; | ||
8151 | #endif | ||
8152 | |||
8153 | return data; | ||
8154 | } | ||
8155 | |||
8156 | static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) | ||
8157 | { | ||
8158 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
8159 | |||
8160 | #ifndef CONFIG_64BIT | ||
8161 | /* | ||
8162 | * Take rq->lock to make 64-bit write safe on 32-bit platforms. | ||
8163 | */ | ||
8164 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | ||
8165 | *cpuusage = val; | ||
8166 | raw_spin_unlock_irq(&cpu_rq(cpu)->lock); | ||
8167 | #else | ||
8168 | *cpuusage = val; | ||
8169 | #endif | ||
8170 | } | ||
8171 | |||
8172 | /* return total cpu usage (in nanoseconds) of a group */ | ||
8173 | static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) | ||
8174 | { | ||
8175 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
8176 | u64 totalcpuusage = 0; | ||
8177 | int i; | ||
8178 | |||
8179 | for_each_present_cpu(i) | ||
8180 | totalcpuusage += cpuacct_cpuusage_read(ca, i); | ||
8181 | |||
8182 | return totalcpuusage; | ||
8183 | } | ||
8184 | |||
8185 | static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, | ||
8186 | u64 reset) | ||
8187 | { | ||
8188 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
8189 | int err = 0; | ||
8190 | int i; | ||
8191 | |||
8192 | if (reset) { | ||
8193 | err = -EINVAL; | ||
8194 | goto out; | ||
8195 | } | ||
8196 | |||
8197 | for_each_present_cpu(i) | ||
8198 | cpuacct_cpuusage_write(ca, i, 0); | ||
8199 | |||
8200 | out: | ||
8201 | return err; | ||
8202 | } | ||
8203 | |||
8204 | static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, | ||
8205 | struct seq_file *m) | ||
8206 | { | ||
8207 | struct cpuacct *ca = cgroup_ca(cgroup); | ||
8208 | u64 percpu; | ||
8209 | int i; | ||
8210 | |||
8211 | for_each_present_cpu(i) { | ||
8212 | percpu = cpuacct_cpuusage_read(ca, i); | ||
8213 | seq_printf(m, "%llu ", (unsigned long long) percpu); | ||
8214 | } | ||
8215 | seq_printf(m, "\n"); | ||
8216 | return 0; | ||
8217 | } | ||
8218 | |||
8219 | static const char *cpuacct_stat_desc[] = { | ||
8220 | [CPUACCT_STAT_USER] = "user", | ||
8221 | [CPUACCT_STAT_SYSTEM] = "system", | ||
8222 | }; | ||
8223 | |||
8224 | static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, | ||
8225 | struct cgroup_map_cb *cb) | ||
8226 | { | ||
8227 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
8228 | int cpu; | ||
8229 | s64 val = 0; | ||
8230 | |||
8231 | for_each_online_cpu(cpu) { | ||
8232 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | ||
8233 | val += kcpustat->cpustat[CPUTIME_USER]; | ||
8234 | val += kcpustat->cpustat[CPUTIME_NICE]; | ||
8235 | } | ||
8236 | val = cputime64_to_clock_t(val); | ||
8237 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); | ||
8238 | |||
8239 | val = 0; | ||
8240 | for_each_online_cpu(cpu) { | ||
8241 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | ||
8242 | val += kcpustat->cpustat[CPUTIME_SYSTEM]; | ||
8243 | val += kcpustat->cpustat[CPUTIME_IRQ]; | ||
8244 | val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; | ||
8245 | } | ||
8246 | |||
8247 | val = cputime64_to_clock_t(val); | ||
8248 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); | ||
8249 | |||
8250 | return 0; | ||
8251 | } | ||
8252 | |||
8253 | static struct cftype files[] = { | ||
8254 | { | ||
8255 | .name = "usage", | ||
8256 | .read_u64 = cpuusage_read, | ||
8257 | .write_u64 = cpuusage_write, | ||
8258 | }, | ||
8259 | { | ||
8260 | .name = "usage_percpu", | ||
8261 | .read_seq_string = cpuacct_percpu_seq_read, | ||
8262 | }, | ||
8263 | { | ||
8264 | .name = "stat", | ||
8265 | .read_map = cpuacct_stats_show, | ||
8266 | }, | ||
8267 | { } /* terminate */ | ||
8268 | }; | ||
8269 | |||
8270 | /* | ||
8271 | * charge this task's execution time to its accounting group. | ||
8272 | * | ||
8273 | * called with rq->lock held. | ||
8274 | */ | ||
8275 | void cpuacct_charge(struct task_struct *tsk, u64 cputime) | ||
8276 | { | ||
8277 | struct cpuacct *ca; | ||
8278 | int cpu; | ||
8279 | |||
8280 | if (unlikely(!cpuacct_subsys.active)) | ||
8281 | return; | ||
8282 | |||
8283 | cpu = task_cpu(tsk); | ||
8284 | |||
8285 | rcu_read_lock(); | ||
8286 | |||
8287 | ca = task_ca(tsk); | ||
8288 | |||
8289 | for (; ca; ca = parent_ca(ca)) { | ||
8290 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
8291 | *cpuusage += cputime; | ||
8292 | } | ||
8293 | |||
8294 | rcu_read_unlock(); | ||
8295 | } | ||
8296 | |||
8297 | struct cgroup_subsys cpuacct_subsys = { | ||
8298 | .name = "cpuacct", | ||
8299 | .css_alloc = cpuacct_css_alloc, | ||
8300 | .css_free = cpuacct_css_free, | ||
8301 | .subsys_id = cpuacct_subsys_id, | ||
8302 | .base_cftypes = files, | ||
8303 | }; | ||
8304 | #endif /* CONFIG_CGROUP_CPUACCT */ | ||
8305 | |||
8306 | void dump_cpu_task(int cpu) | 8031 | void dump_cpu_task(int cpu) |
8307 | { | 8032 | { |
8308 | pr_info("Task dump for CPU %d:\n", cpu); | 8033 | pr_info("Task dump for CPU %d:\n", cpu); |
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
new file mode 100644
index 000000000000..dbb7e2cd95eb
--- /dev/null
+++ b/kernel/sched/cpuacct.c
@@ -0,0 +1,296 @@ | |||
1 | #include <linux/cgroup.h> | ||
2 | #include <linux/slab.h> | ||
3 | #include <linux/percpu.h> | ||
4 | #include <linux/spinlock.h> | ||
5 | #include <linux/cpumask.h> | ||
6 | #include <linux/seq_file.h> | ||
7 | #include <linux/rcupdate.h> | ||
8 | #include <linux/kernel_stat.h> | ||
9 | #include <linux/err.h> | ||
10 | |||
11 | #include "sched.h" | ||
12 | |||
13 | /* | ||
14 | * CPU accounting code for task groups. | ||
15 | * | ||
16 | * Based on the work by Paul Menage (menage@google.com) and Balbir Singh | ||
17 | * (balbir@in.ibm.com). | ||
18 | */ | ||
19 | |||
20 | /* Time spent by the tasks of the cpu accounting group executing in ... */ | ||
21 | enum cpuacct_stat_index { | ||
22 | CPUACCT_STAT_USER, /* ... user mode */ | ||
23 | CPUACCT_STAT_SYSTEM, /* ... kernel mode */ | ||
24 | |||
25 | CPUACCT_STAT_NSTATS, | ||
26 | }; | ||
27 | |||
28 | /* track cpu usage of a group of tasks and its child groups */ | ||
29 | struct cpuacct { | ||
30 | struct cgroup_subsys_state css; | ||
31 | /* cpuusage holds pointer to a u64-type object on every cpu */ | ||
32 | u64 __percpu *cpuusage; | ||
33 | struct kernel_cpustat __percpu *cpustat; | ||
34 | }; | ||
35 | |||
36 | /* return cpu accounting group corresponding to this container */ | ||
37 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) | ||
38 | { | ||
39 | return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), | ||
40 | struct cpuacct, css); | ||
41 | } | ||
42 | |||
43 | /* return cpu accounting group to which this task belongs */ | ||
44 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | ||
45 | { | ||
46 | return container_of(task_subsys_state(tsk, cpuacct_subsys_id), | ||
47 | struct cpuacct, css); | ||
48 | } | ||
49 | |||
50 | static inline struct cpuacct *__parent_ca(struct cpuacct *ca) | ||
51 | { | ||
52 | return cgroup_ca(ca->css.cgroup->parent); | ||
53 | } | ||
54 | |||
55 | static inline struct cpuacct *parent_ca(struct cpuacct *ca) | ||
56 | { | ||
57 | if (!ca->css.cgroup->parent) | ||
58 | return NULL; | ||
59 | return cgroup_ca(ca->css.cgroup->parent); | ||
60 | } | ||
61 | |||
62 | static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); | ||
63 | static struct cpuacct root_cpuacct = { | ||
64 | .cpustat = &kernel_cpustat, | ||
65 | .cpuusage = &root_cpuacct_cpuusage, | ||
66 | }; | ||
67 | |||
68 | /* create a new cpu accounting group */ | ||
69 | static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) | ||
70 | { | ||
71 | struct cpuacct *ca; | ||
72 | |||
73 | if (!cgrp->parent) | ||
74 | return &root_cpuacct.css; | ||
75 | |||
76 | ca = kzalloc(sizeof(*ca), GFP_KERNEL); | ||
77 | if (!ca) | ||
78 | goto out; | ||
79 | |||
80 | ca->cpuusage = alloc_percpu(u64); | ||
81 | if (!ca->cpuusage) | ||
82 | goto out_free_ca; | ||
83 | |||
84 | ca->cpustat = alloc_percpu(struct kernel_cpustat); | ||
85 | if (!ca->cpustat) | ||
86 | goto out_free_cpuusage; | ||
87 | |||
88 | return &ca->css; | ||
89 | |||
90 | out_free_cpuusage: | ||
91 | free_percpu(ca->cpuusage); | ||
92 | out_free_ca: | ||
93 | kfree(ca); | ||
94 | out: | ||
95 | return ERR_PTR(-ENOMEM); | ||
96 | } | ||
97 | |||
98 | /* destroy an existing cpu accounting group */ | ||
99 | static void cpuacct_css_free(struct cgroup *cgrp) | ||
100 | { | ||
101 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
102 | |||
103 | free_percpu(ca->cpustat); | ||
104 | free_percpu(ca->cpuusage); | ||
105 | kfree(ca); | ||
106 | } | ||
107 | |||
108 | static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) | ||
109 | { | ||
110 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
111 | u64 data; | ||
112 | |||
113 | #ifndef CONFIG_64BIT | ||
114 | /* | ||
115 | * Take rq->lock to make 64-bit read safe on 32-bit platforms. | ||
116 | */ | ||
117 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | ||
118 | data = *cpuusage; | ||
119 | raw_spin_unlock_irq(&cpu_rq(cpu)->lock); | ||
120 | #else | ||
121 | data = *cpuusage; | ||
122 | #endif | ||
123 | |||
124 | return data; | ||
125 | } | ||
126 | |||
127 | static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) | ||
128 | { | ||
129 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
130 | |||
131 | #ifndef CONFIG_64BIT | ||
132 | /* | ||
133 | * Take rq->lock to make 64-bit write safe on 32-bit platforms. | ||
134 | */ | ||
135 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | ||
136 | *cpuusage = val; | ||
137 | raw_spin_unlock_irq(&cpu_rq(cpu)->lock); | ||
138 | #else | ||
139 | *cpuusage = val; | ||
140 | #endif | ||
141 | } | ||
142 | |||
143 | /* return total cpu usage (in nanoseconds) of a group */ | ||
144 | static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) | ||
145 | { | ||
146 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
147 | u64 totalcpuusage = 0; | ||
148 | int i; | ||
149 | |||
150 | for_each_present_cpu(i) | ||
151 | totalcpuusage += cpuacct_cpuusage_read(ca, i); | ||
152 | |||
153 | return totalcpuusage; | ||
154 | } | ||
155 | |||
156 | static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, | ||
157 | u64 reset) | ||
158 | { | ||
159 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
160 | int err = 0; | ||
161 | int i; | ||
162 | |||
163 | if (reset) { | ||
164 | err = -EINVAL; | ||
165 | goto out; | ||
166 | } | ||
167 | |||
168 | for_each_present_cpu(i) | ||
169 | cpuacct_cpuusage_write(ca, i, 0); | ||
170 | |||
171 | out: | ||
172 | return err; | ||
173 | } | ||
174 | |||
175 | static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, | ||
176 | struct seq_file *m) | ||
177 | { | ||
178 | struct cpuacct *ca = cgroup_ca(cgroup); | ||
179 | u64 percpu; | ||
180 | int i; | ||
181 | |||
182 | for_each_present_cpu(i) { | ||
183 | percpu = cpuacct_cpuusage_read(ca, i); | ||
184 | seq_printf(m, "%llu ", (unsigned long long) percpu); | ||
185 | } | ||
186 | seq_printf(m, "\n"); | ||
187 | return 0; | ||
188 | } | ||
189 | |||
190 | static const char * const cpuacct_stat_desc[] = { | ||
191 | [CPUACCT_STAT_USER] = "user", | ||
192 | [CPUACCT_STAT_SYSTEM] = "system", | ||
193 | }; | ||
194 | |||
195 | static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, | ||
196 | struct cgroup_map_cb *cb) | ||
197 | { | ||
198 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
199 | int cpu; | ||
200 | s64 val = 0; | ||
201 | |||
202 | for_each_online_cpu(cpu) { | ||
203 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | ||
204 | val += kcpustat->cpustat[CPUTIME_USER]; | ||
205 | val += kcpustat->cpustat[CPUTIME_NICE]; | ||
206 | } | ||
207 | val = cputime64_to_clock_t(val); | ||
208 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); | ||
209 | |||
210 | val = 0; | ||
211 | for_each_online_cpu(cpu) { | ||
212 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | ||
213 | val += kcpustat->cpustat[CPUTIME_SYSTEM]; | ||
214 | val += kcpustat->cpustat[CPUTIME_IRQ]; | ||
215 | val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; | ||
216 | } | ||
217 | |||
218 | val = cputime64_to_clock_t(val); | ||
219 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); | ||
220 | |||
221 | return 0; | ||
222 | } | ||
223 | |||
224 | static struct cftype files[] = { | ||
225 | { | ||
226 | .name = "usage", | ||
227 | .read_u64 = cpuusage_read, | ||
228 | .write_u64 = cpuusage_write, | ||
229 | }, | ||
230 | { | ||
231 | .name = "usage_percpu", | ||
232 | .read_seq_string = cpuacct_percpu_seq_read, | ||
233 | }, | ||
234 | { | ||
235 | .name = "stat", | ||
236 | .read_map = cpuacct_stats_show, | ||
237 | }, | ||
238 | { } /* terminate */ | ||
239 | }; | ||
240 | |||
241 | /* | ||
242 | * charge this task's execution time to its accounting group. | ||
243 | * | ||
244 | * called with rq->lock held. | ||
245 | */ | ||
246 | void cpuacct_charge(struct task_struct *tsk, u64 cputime) | ||
247 | { | ||
248 | struct cpuacct *ca; | ||
249 | int cpu; | ||
250 | |||
251 | cpu = task_cpu(tsk); | ||
252 | |||
253 | rcu_read_lock(); | ||
254 | |||
255 | ca = task_ca(tsk); | ||
256 | |||
257 | while (true) { | ||
258 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
259 | *cpuusage += cputime; | ||
260 | |||
261 | ca = parent_ca(ca); | ||
262 | if (!ca) | ||
263 | break; | ||
264 | } | ||
265 | |||
266 | rcu_read_unlock(); | ||
267 | } | ||
268 | |||
269 | /* | ||
270 | * Add user/system time to cpuacct. | ||
271 | * | ||
272 | * Note: it's the caller that updates the account of the root cgroup. | ||
273 | */ | ||
274 | void cpuacct_account_field(struct task_struct *p, int index, u64 val) | ||
275 | { | ||
276 | struct kernel_cpustat *kcpustat; | ||
277 | struct cpuacct *ca; | ||
278 | |||
279 | rcu_read_lock(); | ||
280 | ca = task_ca(p); | ||
281 | while (ca != &root_cpuacct) { | ||
282 | kcpustat = this_cpu_ptr(ca->cpustat); | ||
283 | kcpustat->cpustat[index] += val; | ||
284 | ca = __parent_ca(ca); | ||
285 | } | ||
286 | rcu_read_unlock(); | ||
287 | } | ||
288 | |||
289 | struct cgroup_subsys cpuacct_subsys = { | ||
290 | .name = "cpuacct", | ||
291 | .css_alloc = cpuacct_css_alloc, | ||
292 | .css_free = cpuacct_css_free, | ||
293 | .subsys_id = cpuacct_subsys_id, | ||
294 | .base_cftypes = files, | ||
295 | .early_init = 1, | ||
296 | }; | ||
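The cftype table in the new file keeps exporting the same cgroup control files as before the code move: cpuacct.usage (total accumulated CPU time in nanoseconds), cpuacct.usage_percpu, and cpuacct.stat (user/system time in USER_HZ ticks, via cputime64_to_clock_t). A small sketch of a consumer, assuming the legacy cpuacct hierarchy is mounted at /sys/fs/cgroup/cpuacct and reading the root group; the mount point and the chosen group are assumptions of the sketch, not part of the patch:

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f;

	/* total accumulated CPU time of the group, in nanoseconds */
	f = fopen("/sys/fs/cgroup/cpuacct/cpuacct.usage", "r");
	if (f) {
		unsigned long long ns;
		if (fscanf(f, "%llu", &ns) == 1)
			printf("total usage: %llu ns\n", ns);
		fclose(f);
	}

	/* two lines: "user <ticks>" and "system <ticks>", in USER_HZ */
	f = fopen("/sys/fs/cgroup/cpuacct/cpuacct.stat", "r");
	if (f) {
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
	}
	return 0;
}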
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
new file mode 100644
index 000000000000..ed605624a5e7
--- /dev/null
+++ b/kernel/sched/cpuacct.h
@@ -0,0 +1,17 @@ | |||
1 | #ifdef CONFIG_CGROUP_CPUACCT | ||
2 | |||
3 | extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); | ||
4 | extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); | ||
5 | |||
6 | #else | ||
7 | |||
8 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) | ||
9 | { | ||
10 | } | ||
11 | |||
12 | static inline void | ||
13 | cpuacct_account_field(struct task_struct *p, int index, u64 val) | ||
14 | { | ||
15 | } | ||
16 | |||
17 | #endif | ||
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 699d59756ece..ea32f02bf2c3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -115,10 +115,6 @@ static int irqtime_account_si_update(void) | |||
115 | static inline void task_group_account_field(struct task_struct *p, int index, | 115 | static inline void task_group_account_field(struct task_struct *p, int index, |
116 | u64 tmp) | 116 | u64 tmp) |
117 | { | 117 | { |
118 | #ifdef CONFIG_CGROUP_CPUACCT | ||
119 | struct kernel_cpustat *kcpustat; | ||
120 | struct cpuacct *ca; | ||
121 | #endif | ||
122 | /* | 118 | /* |
123 | * Since all updates are sure to touch the root cgroup, we | 119 | * Since all updates are sure to touch the root cgroup, we |
124 | * get ourselves ahead and touch it first. If the root cgroup | 120 | * get ourselves ahead and touch it first. If the root cgroup |
@@ -127,19 +123,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, | |||
127 | */ | 123 | */ |
128 | __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; | 124 | __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; |
129 | 125 | ||
130 | #ifdef CONFIG_CGROUP_CPUACCT | 126 | cpuacct_account_field(p, index, tmp); |
131 | if (unlikely(!cpuacct_subsys.active)) | ||
132 | return; | ||
133 | |||
134 | rcu_read_lock(); | ||
135 | ca = task_ca(p); | ||
136 | while (ca && (ca != &root_cpuacct)) { | ||
137 | kcpustat = this_cpu_ptr(ca->cpustat); | ||
138 | kcpustat->cpustat[index] += tmp; | ||
139 | ca = parent_ca(ca); | ||
140 | } | ||
141 | rcu_read_unlock(); | ||
142 | #endif | ||
143 | } | 127 | } |
144 | 128 | ||
145 | /* | 129 | /* |
@@ -310,7 +294,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
310 | 294 | ||
311 | t = tsk; | 295 | t = tsk; |
312 | do { | 296 | do { |
313 | task_cputime(tsk, &utime, &stime); | 297 | task_cputime(t, &utime, &stime); |
314 | times->utime += utime; | 298 | times->utime += utime; |
315 | times->stime += stime; | 299 | times->stime += stime; |
316 | times->sum_exec_runtime += task_sched_runtime(t); | 300 | times->sum_exec_runtime += task_sched_runtime(t); |
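The one-character fix in the last hunk (task_cputime(t, ...) instead of task_cputime(tsk, ...)) matters because the do/while loop walks every thread in the group; before the fix it kept re-reading the group leader's times, so the leader's time was counted once per thread. A toy sketch of the intended accumulation, with a plain struct standing in for task_struct (purely illustrative):

#include <stdio.h>

struct thread_times {
	unsigned long long utime, stime;
};

/*
 * Sum per-thread user/system time across a thread group. Reading
 * times[0] (the group leader) on every iteration, the pre-fix
 * behaviour, would multiply the leader's time by the thread count.
 */
static void group_cputime(const struct thread_times *times, int nr,
			  unsigned long long *utime,
			  unsigned long long *stime)
{
	int i;

	*utime = *stime = 0;
	for (i = 0; i < nr; i++) {
		*utime += times[i].utime;	/* times[i], not times[0] */
		*stime += times[i].stime;
	}
}

int main(void)
{
	struct thread_times t[3] = { {10, 1}, {20, 2}, {30, 3} };
	unsigned long long u, s;

	group_cputime(t, 3, &u, &s);
	printf("utime=%llu stime=%llu\n", u, s);	/* 60 and 6 */
	return 0;
}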
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5c97fca091a7..c61a614465c8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1563,6 +1563,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
1563 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | 1563 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); |
1564 | } /* migrations, e.g. sleep=0 leave decay_count == 0 */ | 1564 | } /* migrations, e.g. sleep=0 leave decay_count == 0 */ |
1565 | } | 1565 | } |
1566 | |||
1567 | /* | ||
1568 | * Update the rq's load with the elapsed running time before entering | ||
1569 | * idle. if the last scheduled task is not a CFS task, idle_enter will | ||
1570 | * be the only way to update the runnable statistic. | ||
1571 | */ | ||
1572 | void idle_enter_fair(struct rq *this_rq) | ||
1573 | { | ||
1574 | update_rq_runnable_avg(this_rq, 1); | ||
1575 | } | ||
1576 | |||
1577 | /* | ||
1578 | * Update the rq's load with the elapsed idle time before a task is | ||
1579 | * scheduled. if the newly scheduled task is not a CFS task, idle_exit will | ||
1580 | * be the only way to update the runnable statistic. | ||
1581 | */ | ||
1582 | void idle_exit_fair(struct rq *this_rq) | ||
1583 | { | ||
1584 | update_rq_runnable_avg(this_rq, 0); | ||
1585 | } | ||
1586 | |||
1566 | #else | 1587 | #else |
1567 | static inline void update_entity_load_avg(struct sched_entity *se, | 1588 | static inline void update_entity_load_avg(struct sched_entity *se, |
1568 | int update_cfs_rq) {} | 1589 | int update_cfs_rq) {} |
@@ -3875,12 +3896,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
3875 | int tsk_cache_hot = 0; | 3896 | int tsk_cache_hot = 0; |
3876 | /* | 3897 | /* |
3877 | * We do not migrate tasks that are: | 3898 | * We do not migrate tasks that are: |
3878 | * 1) running (obviously), or | 3899 | * 1) throttled_lb_pair, or |
3879 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | 3900 | * 2) cannot be migrated to this CPU due to cpus_allowed, or |
3880 | * 3) are cache-hot on their current CPU. | 3901 | * 3) running (obviously), or |
3902 | * 4) are cache-hot on their current CPU. | ||
3881 | */ | 3903 | */ |
3904 | if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) | ||
3905 | return 0; | ||
3906 | |||
3882 | if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { | 3907 | if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { |
3883 | int new_dst_cpu; | 3908 | int cpu; |
3884 | 3909 | ||
3885 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); | 3910 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); |
3886 | 3911 | ||
@@ -3895,12 +3920,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
3895 | if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) | 3920 | if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) |
3896 | return 0; | 3921 | return 0; |
3897 | 3922 | ||
3898 | new_dst_cpu = cpumask_first_and(env->dst_grpmask, | 3923 | /* Prevent to re-select dst_cpu via env's cpus */ |
3899 | tsk_cpus_allowed(p)); | 3924 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { |
3900 | if (new_dst_cpu < nr_cpu_ids) { | 3925 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { |
3901 | env->flags |= LBF_SOME_PINNED; | 3926 | env->flags |= LBF_SOME_PINNED; |
3902 | env->new_dst_cpu = new_dst_cpu; | 3927 | env->new_dst_cpu = cpu; |
3928 | break; | ||
3929 | } | ||
3903 | } | 3930 | } |
3931 | |||
3904 | return 0; | 3932 | return 0; |
3905 | } | 3933 | } |
3906 | 3934 | ||
@@ -3921,20 +3949,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
3921 | tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); | 3949 | tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); |
3922 | if (!tsk_cache_hot || | 3950 | if (!tsk_cache_hot || |
3923 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { | 3951 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { |
3924 | #ifdef CONFIG_SCHEDSTATS | 3952 | |
3925 | if (tsk_cache_hot) { | 3953 | if (tsk_cache_hot) { |
3926 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); | 3954 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); |
3927 | schedstat_inc(p, se.statistics.nr_forced_migrations); | 3955 | schedstat_inc(p, se.statistics.nr_forced_migrations); |
3928 | } | 3956 | } |
3929 | #endif | 3957 | |
3930 | return 1; | 3958 | return 1; |
3931 | } | 3959 | } |
3932 | 3960 | ||
3933 | if (tsk_cache_hot) { | 3961 | schedstat_inc(p, se.statistics.nr_failed_migrations_hot); |
3934 | schedstat_inc(p, se.statistics.nr_failed_migrations_hot); | 3962 | return 0; |
3935 | return 0; | ||
3936 | } | ||
3937 | return 1; | ||
3938 | } | 3963 | } |
3939 | 3964 | ||
3940 | /* | 3965 | /* |
@@ -3949,9 +3974,6 @@ static int move_one_task(struct lb_env *env) | |||
3949 | struct task_struct *p, *n; | 3974 | struct task_struct *p, *n; |
3950 | 3975 | ||
3951 | list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { | 3976 | list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { |
3952 | if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu)) | ||
3953 | continue; | ||
3954 | |||
3955 | if (!can_migrate_task(p, env)) | 3977 | if (!can_migrate_task(p, env)) |
3956 | continue; | 3978 | continue; |
3957 | 3979 | ||
@@ -4003,7 +4025,7 @@ static int move_tasks(struct lb_env *env) | |||
4003 | break; | 4025 | break; |
4004 | } | 4026 | } |
4005 | 4027 | ||
4006 | if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) | 4028 | if (!can_migrate_task(p, env)) |
4007 | goto next; | 4029 | goto next; |
4008 | 4030 | ||
4009 | load = task_h_load(p); | 4031 | load = task_h_load(p); |
@@ -4014,9 +4036,6 @@ static int move_tasks(struct lb_env *env) | |||
4014 | if ((load / 2) > env->imbalance) | 4036 | if ((load / 2) > env->imbalance) |
4015 | goto next; | 4037 | goto next; |
4016 | 4038 | ||
4017 | if (!can_migrate_task(p, env)) | ||
4018 | goto next; | ||
4019 | |||
4020 | move_task(p, env); | 4039 | move_task(p, env); |
4021 | pulled++; | 4040 | pulled++; |
4022 | env->imbalance -= load; | 4041 | env->imbalance -= load; |
@@ -4961,7 +4980,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
4961 | #define MAX_PINNED_INTERVAL 512 | 4980 | #define MAX_PINNED_INTERVAL 512 |
4962 | 4981 | ||
4963 | /* Working cpumask for load_balance and load_balance_newidle. */ | 4982 | /* Working cpumask for load_balance and load_balance_newidle. */ |
4964 | DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 4983 | DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); |
4965 | 4984 | ||
4966 | static int need_active_balance(struct lb_env *env) | 4985 | static int need_active_balance(struct lb_env *env) |
4967 | { | 4986 | { |
@@ -4992,11 +5011,10 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4992 | int *balance) | 5011 | int *balance) |
4993 | { | 5012 | { |
4994 | int ld_moved, cur_ld_moved, active_balance = 0; | 5013 | int ld_moved, cur_ld_moved, active_balance = 0; |
4995 | int lb_iterations, max_lb_iterations; | ||
4996 | struct sched_group *group; | 5014 | struct sched_group *group; |
4997 | struct rq *busiest; | 5015 | struct rq *busiest; |
4998 | unsigned long flags; | 5016 | unsigned long flags; |
4999 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | 5017 | struct cpumask *cpus = __get_cpu_var(load_balance_mask); |
5000 | 5018 | ||
5001 | struct lb_env env = { | 5019 | struct lb_env env = { |
5002 | .sd = sd, | 5020 | .sd = sd, |
@@ -5008,8 +5026,14 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
5008 | .cpus = cpus, | 5026 | .cpus = cpus, |
5009 | }; | 5027 | }; |
5010 | 5028 | ||
5029 | /* | ||
5030 | * For NEWLY_IDLE load_balancing, we don't need to consider | ||
5031 | * other cpus in our group | ||
5032 | */ | ||
5033 | if (idle == CPU_NEWLY_IDLE) | ||
5034 | env.dst_grpmask = NULL; | ||
5035 | |||
5011 | cpumask_copy(cpus, cpu_active_mask); | 5036 | cpumask_copy(cpus, cpu_active_mask); |
5012 | max_lb_iterations = cpumask_weight(env.dst_grpmask); | ||
5013 | 5037 | ||
5014 | schedstat_inc(sd, lb_count[idle]); | 5038 | schedstat_inc(sd, lb_count[idle]); |
5015 | 5039 | ||
@@ -5035,7 +5059,6 @@ redo: | |||
5035 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); | 5059 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); |
5036 | 5060 | ||
5037 | ld_moved = 0; | 5061 | ld_moved = 0; |
5038 | lb_iterations = 1; | ||
5039 | if (busiest->nr_running > 1) { | 5062 | if (busiest->nr_running > 1) { |
5040 | /* | 5063 | /* |
5041 | * Attempt to move tasks. If find_busiest_group has found | 5064 | * Attempt to move tasks. If find_busiest_group has found |
@@ -5062,17 +5085,17 @@ more_balance: | |||
5062 | double_rq_unlock(env.dst_rq, busiest); | 5085 | double_rq_unlock(env.dst_rq, busiest); |
5063 | local_irq_restore(flags); | 5086 | local_irq_restore(flags); |
5064 | 5087 | ||
5065 | if (env.flags & LBF_NEED_BREAK) { | ||
5066 | env.flags &= ~LBF_NEED_BREAK; | ||
5067 | goto more_balance; | ||
5068 | } | ||
5069 | |||
5070 | /* | 5088 | /* |
5071 | * some other cpu did the load balance for us. | 5089 | * some other cpu did the load balance for us. |
5072 | */ | 5090 | */ |
5073 | if (cur_ld_moved && env.dst_cpu != smp_processor_id()) | 5091 | if (cur_ld_moved && env.dst_cpu != smp_processor_id()) |
5074 | resched_cpu(env.dst_cpu); | 5092 | resched_cpu(env.dst_cpu); |
5075 | 5093 | ||
5094 | if (env.flags & LBF_NEED_BREAK) { | ||
5095 | env.flags &= ~LBF_NEED_BREAK; | ||
5096 | goto more_balance; | ||
5097 | } | ||
5098 | |||
5076 | /* | 5099 | /* |
5077 | * Revisit (affine) tasks on src_cpu that couldn't be moved to | 5100 | * Revisit (affine) tasks on src_cpu that couldn't be moved to |
5078 | * us and move them to an alternate dst_cpu in our sched_group | 5101 | * us and move them to an alternate dst_cpu in our sched_group |
@@ -5092,14 +5115,17 @@ more_balance: | |||
5092 | * moreover subsequent load balance cycles should correct the | 5115 | * moreover subsequent load balance cycles should correct the |
5093 | * excess load moved. | 5116 | * excess load moved. |
5094 | */ | 5117 | */ |
5095 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && | 5118 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { |
5096 | lb_iterations++ < max_lb_iterations) { | ||
5097 | 5119 | ||
5098 | env.dst_rq = cpu_rq(env.new_dst_cpu); | 5120 | env.dst_rq = cpu_rq(env.new_dst_cpu); |
5099 | env.dst_cpu = env.new_dst_cpu; | 5121 | env.dst_cpu = env.new_dst_cpu; |
5100 | env.flags &= ~LBF_SOME_PINNED; | 5122 | env.flags &= ~LBF_SOME_PINNED; |
5101 | env.loop = 0; | 5123 | env.loop = 0; |
5102 | env.loop_break = sched_nr_migrate_break; | 5124 | env.loop_break = sched_nr_migrate_break; |
5125 | |||
5126 | /* Prevent to re-select dst_cpu via env's cpus */ | ||
5127 | cpumask_clear_cpu(env.dst_cpu, env.cpus); | ||
5128 | |||
5103 | /* | 5129 | /* |
5104 | * Go back to "more_balance" rather than "redo" since we | 5130 | * Go back to "more_balance" rather than "redo" since we |
5105 | * need to continue with same src_cpu. | 5131 | * need to continue with same src_cpu. |
@@ -5220,8 +5246,6 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
5220 | if (this_rq->avg_idle < sysctl_sched_migration_cost) | 5246 | if (this_rq->avg_idle < sysctl_sched_migration_cost) |
5221 | return; | 5247 | return; |
5222 | 5248 | ||
5223 | update_rq_runnable_avg(this_rq, 1); | ||
5224 | |||
5225 | /* | 5249 | /* |
5226 | * Drop the rq->lock, but keep IRQ/preempt disabled. | 5250 | * Drop the rq->lock, but keep IRQ/preempt disabled. |
5227 | */ | 5251 | */ |
@@ -5396,13 +5420,16 @@ static inline void set_cpu_sd_state_busy(void) | |||
5396 | struct sched_domain *sd; | 5420 | struct sched_domain *sd; |
5397 | int cpu = smp_processor_id(); | 5421 | int cpu = smp_processor_id(); |
5398 | 5422 | ||
5399 | if (!test_bit(NOHZ_IDLE, nohz_flags(cpu))) | ||
5400 | return; | ||
5401 | clear_bit(NOHZ_IDLE, nohz_flags(cpu)); | ||
5402 | |||
5403 | rcu_read_lock(); | 5423 | rcu_read_lock(); |
5404 | for_each_domain(cpu, sd) | 5424 | sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); |
5425 | |||
5426 | if (!sd || !sd->nohz_idle) | ||
5427 | goto unlock; | ||
5428 | sd->nohz_idle = 0; | ||
5429 | |||
5430 | for (; sd; sd = sd->parent) | ||
5405 | atomic_inc(&sd->groups->sgp->nr_busy_cpus); | 5431 | atomic_inc(&sd->groups->sgp->nr_busy_cpus); |
5432 | unlock: | ||
5406 | rcu_read_unlock(); | 5433 | rcu_read_unlock(); |
5407 | } | 5434 | } |
5408 | 5435 | ||
@@ -5411,13 +5438,16 @@ void set_cpu_sd_state_idle(void) | |||
5411 | struct sched_domain *sd; | 5438 | struct sched_domain *sd; |
5412 | int cpu = smp_processor_id(); | 5439 | int cpu = smp_processor_id(); |
5413 | 5440 | ||
5414 | if (test_bit(NOHZ_IDLE, nohz_flags(cpu))) | ||
5415 | return; | ||
5416 | set_bit(NOHZ_IDLE, nohz_flags(cpu)); | ||
5417 | |||
5418 | rcu_read_lock(); | 5441 | rcu_read_lock(); |
5419 | for_each_domain(cpu, sd) | 5442 | sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); |
5443 | |||
5444 | if (!sd || sd->nohz_idle) | ||
5445 | goto unlock; | ||
5446 | sd->nohz_idle = 1; | ||
5447 | |||
5448 | for (; sd; sd = sd->parent) | ||
5420 | atomic_dec(&sd->groups->sgp->nr_busy_cpus); | 5449 | atomic_dec(&sd->groups->sgp->nr_busy_cpus); |
5450 | unlock: | ||
5421 | rcu_read_unlock(); | 5451 | rcu_read_unlock(); |
5422 | } | 5452 | } |
5423 | 5453 | ||
@@ -5469,7 +5499,7 @@ void update_max_interval(void) | |||
5469 | * It checks each scheduling domain to see if it is due to be balanced, | 5499 | * It checks each scheduling domain to see if it is due to be balanced, |
5470 | * and initiates a balancing operation if so. | 5500 | * and initiates a balancing operation if so. |
5471 | * | 5501 | * |
5472 | * Balancing parameters are set up in arch_init_sched_domains. | 5502 | * Balancing parameters are set up in init_sched_domains. |
5473 | */ | 5503 | */ |
5474 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) | 5504 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) |
5475 | { | 5505 | { |
@@ -5507,10 +5537,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
5507 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | 5537 | if (time_after_eq(jiffies, sd->last_balance + interval)) { |
5508 | if (load_balance(cpu, rq, sd, idle, &balance)) { | 5538 | if (load_balance(cpu, rq, sd, idle, &balance)) { |
5509 | /* | 5539 | /* |
5510 | * We've pulled tasks over so either we're no | 5540 | * The LBF_SOME_PINNED logic could have changed |
5511 | * longer idle. | 5541 | * env->dst_cpu, so we can't know our idle |
5542 | * state even if we migrated tasks. Update it. | ||
5512 | */ | 5543 | */ |
5513 | idle = CPU_NOT_IDLE; | 5544 | idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; |
5514 | } | 5545 | } |
5515 | sd->last_balance = jiffies; | 5546 | sd->last_balance = jiffies; |
5516 | } | 5547 | } |
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1ad1d2b5395f..99399f8e4799 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -46,13 +46,6 @@ SCHED_FEAT(DOUBLE_TICK, false) | |||
46 | SCHED_FEAT(LB_BIAS, true) | 46 | SCHED_FEAT(LB_BIAS, true) |
47 | 47 | ||
48 | /* | 48 | /* |
49 | * Spin-wait on mutex acquisition when the mutex owner is running on | ||
50 | * another cpu -- assumes that when the owner is running, it will soon | ||
51 | * release the lock. Decreases scheduling overhead. | ||
52 | */ | ||
53 | SCHED_FEAT(OWNER_SPIN, true) | ||
54 | |||
55 | /* | ||
56 | * Decrement CPU power based on time not spent running tasks | 49 | * Decrement CPU power based on time not spent running tasks |
57 | */ | 50 | */ |
58 | SCHED_FEAT(NONTASK_POWER, true) | 51 | SCHED_FEAT(NONTASK_POWER, true) |
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b6baf370cae9..b8ce77328341 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,6 +13,16 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) | |||
13 | { | 13 | { |
14 | return task_cpu(p); /* IDLE tasks as never migrated */ | 14 | return task_cpu(p); /* IDLE tasks as never migrated */ |
15 | } | 15 | } |
16 | |||
17 | static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) | ||
18 | { | ||
19 | idle_exit_fair(rq); | ||
20 | } | ||
21 | |||
22 | static void post_schedule_idle(struct rq *rq) | ||
23 | { | ||
24 | idle_enter_fair(rq); | ||
25 | } | ||
16 | #endif /* CONFIG_SMP */ | 26 | #endif /* CONFIG_SMP */ |
17 | /* | 27 | /* |
18 | * Idle tasks are unconditionally rescheduled: | 28 | * Idle tasks are unconditionally rescheduled: |
@@ -25,6 +35,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl | |||
25 | static struct task_struct *pick_next_task_idle(struct rq *rq) | 35 | static struct task_struct *pick_next_task_idle(struct rq *rq) |
26 | { | 36 | { |
27 | schedstat_inc(rq, sched_goidle); | 37 | schedstat_inc(rq, sched_goidle); |
38 | #ifdef CONFIG_SMP | ||
39 | /* Trigger the post schedule to do an idle_enter for CFS */ | ||
40 | rq->post_schedule = 1; | ||
41 | #endif | ||
28 | return rq->idle; | 42 | return rq->idle; |
29 | } | 43 | } |
30 | 44 | ||
@@ -86,6 +100,8 @@ const struct sched_class idle_sched_class = { | |||
86 | 100 | ||
87 | #ifdef CONFIG_SMP | 101 | #ifdef CONFIG_SMP |
88 | .select_task_rq = select_task_rq_idle, | 102 | .select_task_rq = select_task_rq_idle, |
103 | .pre_schedule = pre_schedule_idle, | ||
104 | .post_schedule = post_schedule_idle, | ||
89 | #endif | 105 | #endif |
90 | 106 | ||
91 | .set_curr_task = set_curr_task_idle, | 107 | .set_curr_task = set_curr_task_idle, |
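The idle class above reuses the existing pre_schedule/post_schedule callbacks so that idle_exit_fair() runs when the CPU leaves the idle task and idle_enter_fair() runs once it has switched to it. A compressed user-space sketch of how such optional per-class hooks can be dispatched; the dispatch function and names below are illustrative only, not the scheduler's actual call sites:

#include <stdio.h>

struct rq;

struct sched_class {
	const char *name;
	void (*pre_schedule)(struct rq *rq);	/* optional, may be NULL */
	void (*post_schedule)(struct rq *rq);	/* optional, may be NULL */
};

struct rq {
	const struct sched_class *curr_class;
	int post_schedule;			/* request flag, as in the patch */
};

static void idle_pre(struct rq *rq)  { (void)rq; printf("idle_exit_fair()\n"); }
static void idle_post(struct rq *rq) { (void)rq; printf("idle_enter_fair()\n"); }

static const struct sched_class fair_class = { .name = "fair" };
static const struct sched_class idle_class = {
	.name = "idle", .pre_schedule = idle_pre, .post_schedule = idle_post,
};

/* illustrative: prev class hook before picking, next class hook afterwards */
static void schedule_step(struct rq *rq, const struct sched_class *next)
{
	if (rq->curr_class->pre_schedule)
		rq->curr_class->pre_schedule(rq);

	rq->curr_class = next;			/* "pick_next_task" */
	if (next == &idle_class)
		rq->post_schedule = 1;		/* as pick_next_task_idle() now does */

	if (rq->post_schedule) {
		rq->post_schedule = 0;
		if (rq->curr_class->post_schedule)
			rq->curr_class->post_schedule(rq);
	}
}

int main(void)
{
	struct rq rq = { .curr_class = &fair_class };

	schedule_step(&rq, &idle_class);	/* going idle   -> idle_enter_fair */
	schedule_step(&rq, &fair_class);	/* leaving idle -> idle_exit_fair  */
	return 0;
}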
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eb363aa5d83c..24dc29897749 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -8,6 +8,7 @@ | |||
8 | #include <linux/tick.h> | 8 | #include <linux/tick.h> |
9 | 9 | ||
10 | #include "cpupri.h" | 10 | #include "cpupri.h" |
11 | #include "cpuacct.h" | ||
11 | 12 | ||
12 | extern __read_mostly int scheduler_running; | 13 | extern __read_mostly int scheduler_running; |
13 | 14 | ||
@@ -951,14 +952,6 @@ static const u32 prio_to_wmult[40] = { | |||
951 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | 952 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, |
952 | }; | 953 | }; |
953 | 954 | ||
954 | /* Time spent by the tasks of the cpu accounting group executing in ... */ | ||
955 | enum cpuacct_stat_index { | ||
956 | CPUACCT_STAT_USER, /* ... user mode */ | ||
957 | CPUACCT_STAT_SYSTEM, /* ... kernel mode */ | ||
958 | |||
959 | CPUACCT_STAT_NSTATS, | ||
960 | }; | ||
961 | |||
962 | #define ENQUEUE_WAKEUP 1 | 955 | #define ENQUEUE_WAKEUP 1 |
963 | #define ENQUEUE_HEAD 2 | 956 | #define ENQUEUE_HEAD 2 |
964 | #ifdef CONFIG_SMP | 957 | #ifdef CONFIG_SMP |
@@ -1032,6 +1025,18 @@ extern void update_group_power(struct sched_domain *sd, int cpu); | |||
1032 | extern void trigger_load_balance(struct rq *rq, int cpu); | 1025 | extern void trigger_load_balance(struct rq *rq, int cpu); |
1033 | extern void idle_balance(int this_cpu, struct rq *this_rq); | 1026 | extern void idle_balance(int this_cpu, struct rq *this_rq); |
1034 | 1027 | ||
1028 | /* | ||
1029 | * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg | ||
1030 | * becomes useful in lb | ||
1031 | */ | ||
1032 | #if defined(CONFIG_FAIR_GROUP_SCHED) | ||
1033 | extern void idle_enter_fair(struct rq *this_rq); | ||
1034 | extern void idle_exit_fair(struct rq *this_rq); | ||
1035 | #else | ||
1036 | static inline void idle_enter_fair(struct rq *this_rq) {} | ||
1037 | static inline void idle_exit_fair(struct rq *this_rq) {} | ||
1038 | #endif | ||
1039 | |||
1035 | #else /* CONFIG_SMP */ | 1040 | #else /* CONFIG_SMP */ |
1036 | 1041 | ||
1037 | static inline void idle_balance(int cpu, struct rq *rq) | 1042 | static inline void idle_balance(int cpu, struct rq *rq) |
@@ -1055,45 +1060,6 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime | |||
1055 | 1060 | ||
1056 | extern void update_idle_cpu_load(struct rq *this_rq); | 1061 | extern void update_idle_cpu_load(struct rq *this_rq); |
1057 | 1062 | ||
1058 | #ifdef CONFIG_CGROUP_CPUACCT | ||
1059 | #include <linux/cgroup.h> | ||
1060 | /* track cpu usage of a group of tasks and its child groups */ | ||
1061 | struct cpuacct { | ||
1062 | struct cgroup_subsys_state css; | ||
1063 | /* cpuusage holds pointer to a u64-type object on every cpu */ | ||
1064 | u64 __percpu *cpuusage; | ||
1065 | struct kernel_cpustat __percpu *cpustat; | ||
1066 | }; | ||
1067 | |||
1068 | extern struct cgroup_subsys cpuacct_subsys; | ||
1069 | extern struct cpuacct root_cpuacct; | ||
1070 | |||
1071 | /* return cpu accounting group corresponding to this container */ | ||
1072 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) | ||
1073 | { | ||
1074 | return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), | ||
1075 | struct cpuacct, css); | ||
1076 | } | ||
1077 | |||
1078 | /* return cpu accounting group to which this task belongs */ | ||
1079 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | ||
1080 | { | ||
1081 | return container_of(task_subsys_state(tsk, cpuacct_subsys_id), | ||
1082 | struct cpuacct, css); | ||
1083 | } | ||
1084 | |||
1085 | static inline struct cpuacct *parent_ca(struct cpuacct *ca) | ||
1086 | { | ||
1087 | if (!ca || !ca->css.cgroup->parent) | ||
1088 | return NULL; | ||
1089 | return cgroup_ca(ca->css.cgroup->parent); | ||
1090 | } | ||
1091 | |||
1092 | extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); | ||
1093 | #else | ||
1094 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | ||
1095 | #endif | ||
1096 | |||
1097 | #ifdef CONFIG_PARAVIRT | 1063 | #ifdef CONFIG_PARAVIRT |
1098 | static inline u64 steal_ticks(u64 steal) | 1064 | static inline u64 steal_ticks(u64 steal) |
1099 | { | 1065 | { |
@@ -1348,7 +1314,6 @@ extern void account_cfs_bandwidth_used(int enabled, int was_enabled); | |||
1348 | enum rq_nohz_flag_bits { | 1314 | enum rq_nohz_flag_bits { |
1349 | NOHZ_TICK_STOPPED, | 1315 | NOHZ_TICK_STOPPED, |
1350 | NOHZ_BALANCE_KICK, | 1316 | NOHZ_BALANCE_KICK, |
1351 | NOHZ_IDLE, | ||
1352 | }; | 1317 | }; |
1353 | 1318 | ||
1354 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) | 1319 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) |