Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--  kernel/sched/core.c | 414
1 file changed, 109 insertions(+), 305 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7f12624a393c..58453b8272fd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -512,11 +512,6 @@ static inline void init_hrtick(void)
  * the target CPU.
  */
 #ifdef CONFIG_SMP
-
-#ifndef tsk_is_polling
-#define tsk_is_polling(t) 0
-#endif
-
 void resched_task(struct task_struct *p)
 {
 	int cpu;
@@ -549,7 +544,7 @@ void resched_cpu(int cpu)
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
 /*
  * In the semi idle case, use the nearest busy cpu for migrating timers
  * from an idle cpu. This is good for power-savings.
@@ -587,7 +582,7 @@ unlock:
  * account when the CPU goes back to idle and evaluates the timer
  * wheel for the next timer event.
  */
-void wake_up_idle_cpu(int cpu)
+static void wake_up_idle_cpu(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 
@@ -617,20 +612,56 @@ void wake_up_idle_cpu(int cpu)
 	smp_send_reschedule(cpu);
 }
 
+static bool wake_up_full_nohz_cpu(int cpu)
+{
+	if (tick_nohz_full_cpu(cpu)) {
+		if (cpu != smp_processor_id() ||
+		    tick_nohz_tick_stopped())
+			smp_send_reschedule(cpu);
+		return true;
+	}
+
+	return false;
+}
+
+void wake_up_nohz_cpu(int cpu)
+{
+	if (!wake_up_full_nohz_cpu(cpu))
+		wake_up_idle_cpu(cpu);
+}
+
 static inline bool got_nohz_idle_kick(void)
 {
 	int cpu = smp_processor_id();
 	return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
 }
 
-#else /* CONFIG_NO_HZ */
+#else /* CONFIG_NO_HZ_COMMON */
 
 static inline bool got_nohz_idle_kick(void)
 {
 	return false;
 }
 
-#endif /* CONFIG_NO_HZ */
+#endif /* CONFIG_NO_HZ_COMMON */
+
+#ifdef CONFIG_NO_HZ_FULL
+bool sched_can_stop_tick(void)
+{
+	struct rq *rq;
+
+	rq = this_rq();
+
+	/* Make sure rq->nr_running update is visible after the IPI */
+	smp_rmb();
+
+	/* More than one running task need preemption */
+	if (rq->nr_running > 1)
+		return false;
+
+	return true;
+}
+#endif /* CONFIG_NO_HZ_FULL */
 
 void sched_avg_update(struct rq *rq)
 {
@@ -1288,8 +1319,8 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 static void
 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 {
-	trace_sched_wakeup(p, true);
 	check_preempt_curr(rq, p, wake_flags);
+	trace_sched_wakeup(p, true);
 
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
@@ -1362,7 +1393,8 @@ static void sched_ttwu_pending(void)
 
 void scheduler_ipi(void)
 {
-	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
+	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()
+			&& !tick_nohz_full_cpu(smp_processor_id()))
 		return;
 
 	/*
@@ -1379,6 +1411,7 @@ void scheduler_ipi(void)
 	 * somewhat pessimize the simple resched case.
 	 */
 	irq_enter();
+	tick_nohz_full_check();
 	sched_ttwu_pending();
 
 	/*
@@ -1498,8 +1531,10 @@ static void try_to_wake_up_local(struct task_struct *p)
 {
 	struct rq *rq = task_rq(p);
 
-	BUG_ON(rq != this_rq());
-	BUG_ON(p == current);
+	if (WARN_ON_ONCE(rq != this_rq()) ||
+	    WARN_ON_ONCE(p == current))
+		return;
+
 	lockdep_assert_held(&rq->lock);
 
 	if (!raw_spin_trylock(&p->pi_lock)) {
@@ -1858,6 +1893,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 		kprobe_flush_task(prev);
 		put_task_struct(prev);
 	}
+
+	tick_nohz_task_switch(current);
 }
 
 #ifdef CONFIG_SMP
@@ -2121,7 +2158,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
 	return load >> FSHIFT;
 }
 
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
 /*
  * Handle NO_HZ for the global load-average.
  *
@@ -2347,12 +2384,12 @@ static void calc_global_nohz(void)
 	smp_wmb();
 	calc_load_idx++;
 }
-#else /* !CONFIG_NO_HZ */
+#else /* !CONFIG_NO_HZ_COMMON */
 
 static inline long calc_load_fold_idle(void) { return 0; }
 static inline void calc_global_nohz(void) { }
 
-#endif /* CONFIG_NO_HZ */
+#endif /* CONFIG_NO_HZ_COMMON */
 
 /*
  * calc_load - update the avenrun load estimates 10 ticks after the
@@ -2512,7 +2549,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
 	sched_avg_update(this_rq);
 }
 
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
 /*
  * There is no sane way to deal with nohz on smp when using jiffies because the
  * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
@@ -2572,7 +2609,7 @@ void update_cpu_load_nohz(void)
 	}
 	raw_spin_unlock(&this_rq->lock);
 }
-#endif /* CONFIG_NO_HZ */
+#endif /* CONFIG_NO_HZ_COMMON */
 
 /*
  * Called from scheduler_tick()
@@ -2699,8 +2736,35 @@ void scheduler_tick(void)
 	rq->idle_balance = idle_cpu(cpu);
 	trigger_load_balance(rq, cpu);
 #endif
+	rq_last_tick_reset(rq);
 }
 
+#ifdef CONFIG_NO_HZ_FULL
+/**
+ * scheduler_tick_max_deferment
+ *
+ * Keep at least one tick per second when a single
+ * active task is running because the scheduler doesn't
+ * yet completely support full dynticks environment.
+ *
+ * This makes sure that uptime, CFS vruntime, load
+ * balancing, etc... continue to move forward, even
+ * with a very low granularity.
+ */
+u64 scheduler_tick_max_deferment(void)
+{
+	struct rq *rq = this_rq();
+	unsigned long next, now = ACCESS_ONCE(jiffies);
+
+	next = rq->last_sched_tick + HZ;
+
+	if (time_before_eq(next, now))
+		return 0;
+
+	return jiffies_to_usecs(next - now) * NSEC_PER_USEC;
+}
+#endif
+
 notrace unsigned long get_parent_ip(unsigned long addr)
 {
 	if (in_lock_functions(addr)) {
@@ -2997,51 +3061,6 @@ void __sched schedule_preempt_disabled(void)
 	preempt_disable();
 }
 
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-
-static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
-{
-	if (lock->owner != owner)
-		return false;
-
-	/*
-	 * Ensure we emit the owner->on_cpu, dereference _after_ checking
-	 * lock->owner still matches owner, if that fails, owner might
-	 * point to free()d memory, if it still matches, the rcu_read_lock()
-	 * ensures the memory stays valid.
-	 */
-	barrier();
-
-	return owner->on_cpu;
-}
-
-/*
- * Look out! "owner" is an entirely speculative pointer
- * access and not reliable.
- */
-int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
-{
-	if (!sched_feat(OWNER_SPIN))
-		return 0;
-
-	rcu_read_lock();
-	while (owner_running(lock, owner)) {
-		if (need_resched())
-			break;
-
-		arch_mutex_cpu_relax();
-	}
-	rcu_read_unlock();
-
-	/*
-	 * We break out the loop above on need_resched() and when the
-	 * owner changed, which is a sign for heavy contention. Return
-	 * success only when lock->owner is NULL.
-	 */
-	return lock->owner == NULL;
-}
-#endif
-
 #ifdef CONFIG_PREEMPT
 /*
  * this is the entry point to schedule() from in-kernel preemption
@@ -3082,11 +3101,13 @@ EXPORT_SYMBOL(preempt_schedule);
 asmlinkage void __sched preempt_schedule_irq(void)
 {
 	struct thread_info *ti = current_thread_info();
+	enum ctx_state prev_state;
 
 	/* Catch callers which need to be fixed */
 	BUG_ON(ti->preempt_count || !irqs_disabled());
 
-	user_exit();
+	prev_state = exception_enter();
+
 	do {
 		add_preempt_count(PREEMPT_ACTIVE);
 		local_irq_enable();
@@ -3100,6 +3121,8 @@ asmlinkage void __sched preempt_schedule_irq(void)
 		 */
 		barrier();
 	} while (need_resched());
+
+	exception_exit(prev_state);
 }
 
 #endif /* CONFIG_PREEMPT */
@@ -4126,6 +4149,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	get_task_struct(p);
 	rcu_read_unlock();
 
+	if (p->flags & PF_NO_SETAFFINITY) {
+		retval = -EINVAL;
+		goto out_put_task;
+	}
 	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
 		retval = -ENOMEM;
 		goto out_put_task;
@@ -4626,6 +4653,7 @@ void sched_show_task(struct task_struct *p)
 		task_pid_nr(p), ppid,
 		(unsigned long)task_thread_info(p)->flags);
 
+	print_worker_info(KERN_INFO, p);
 	show_stack(p, NULL);
 }
 
@@ -4773,11 +4801,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 		goto out;
 	}
 
-	if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
 	do_set_cpus_allowed(p, new_mask);
 
 	/* Can the task run on the task's current CPU? If so, we're done */
@@ -4999,7 +5022,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
 }
 
 static int min_load_idx = 0;
-static int max_load_idx = CPU_LOAD_IDX_MAX;
+static int max_load_idx = CPU_LOAD_IDX_MAX-1;
 
 static void
 set_table_entry(struct ctl_table *entry,
@@ -6248,7 +6271,7 @@ static void sched_init_numa(void)
 	 * 'level' contains the number of unique distances, excluding the
 	 * identity distance node_distance(i,i).
 	 *
-	 * The sched_domains_nume_distance[] array includes the actual distance
+	 * The sched_domains_numa_distance[] array includes the actual distance
 	 * numbers.
 	 */
 
@@ -6861,11 +6884,15 @@ int in_sched_functions(unsigned long addr)
 }
 
 #ifdef CONFIG_CGROUP_SCHED
+/*
+ * Default task group.
+ * Every task in system belongs to this group at bootup.
+ */
 struct task_group root_task_group;
 LIST_HEAD(task_groups);
 #endif
 
-DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
 
 void __init sched_init(void)
 {
@@ -6902,7 +6929,7 @@ void __init sched_init(void)
 #endif /* CONFIG_RT_GROUP_SCHED */
 #ifdef CONFIG_CPUMASK_OFFSTACK
 	for_each_possible_cpu(i) {
-		per_cpu(load_balance_tmpmask, i) = (void *)ptr;
+		per_cpu(load_balance_mask, i) = (void *)ptr;
 		ptr += cpumask_size();
 	}
 #endif /* CONFIG_CPUMASK_OFFSTACK */
@@ -6928,12 +6955,6 @@ void __init sched_init(void)
 
 #endif /* CONFIG_CGROUP_SCHED */
 
-#ifdef CONFIG_CGROUP_CPUACCT
-	root_cpuacct.cpustat = &kernel_cpustat;
-	root_cpuacct.cpuusage = alloc_percpu(u64);
-	/* Too early, not expected to fail */
-	BUG_ON(!root_cpuacct.cpuusage);
-#endif
 	for_each_possible_cpu(i) {
 		struct rq *rq;
 
@@ -6997,9 +7018,12 @@ void __init sched_init(void)
 		INIT_LIST_HEAD(&rq->cfs_tasks);
 
 		rq_attach_root(rq, &def_root_domain);
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
 		rq->nohz_flags = 0;
 #endif
+#ifdef CONFIG_NO_HZ_FULL
+		rq->last_sched_tick = 0;
+#endif
 #endif
 		init_rq_hrtick(rq);
 		atomic_set(&rq->nr_iowait, 0);
@@ -7455,7 +7479,7 @@ unlock:
 	return err;
 }
 
-int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
+static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 {
 	u64 rt_runtime, rt_period;
 
@@ -7467,7 +7491,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
 
-long sched_group_rt_runtime(struct task_group *tg)
+static long sched_group_rt_runtime(struct task_group *tg)
 {
 	u64 rt_runtime_us;
 
@@ -7479,7 +7503,7 @@ long sched_group_rt_runtime(struct task_group *tg)
 	return rt_runtime_us;
 }
 
-int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
 {
 	u64 rt_runtime, rt_period;
 
@@ -7492,7 +7516,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
 	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
 
-long sched_group_rt_period(struct task_group *tg)
+static long sched_group_rt_period(struct task_group *tg)
 {
 	u64 rt_period_us;
 
@@ -7527,7 +7551,7 @@ static int sched_rt_global_constraints(void)
 	return ret;
 }
 
-int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
 {
 	/* Don't accept realtime tasks when there is no way for them to run */
 	if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
@@ -8035,226 +8059,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {
 
 #endif /* CONFIG_CGROUP_SCHED */
 
-#ifdef CONFIG_CGROUP_CPUACCT
-
-/*
- * CPU accounting code for task groups.
- *
- * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
- * (balbir@in.ibm.com).
- */
-
-struct cpuacct root_cpuacct;
-
-/* create a new cpu accounting group */
-static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
-{
-	struct cpuacct *ca;
-
-	if (!cgrp->parent)
-		return &root_cpuacct.css;
-
-	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
-	if (!ca)
-		goto out;
-
-	ca->cpuusage = alloc_percpu(u64);
-	if (!ca->cpuusage)
-		goto out_free_ca;
-
-	ca->cpustat = alloc_percpu(struct kernel_cpustat);
-	if (!ca->cpustat)
-		goto out_free_cpuusage;
-
-	return &ca->css;
-
-out_free_cpuusage:
-	free_percpu(ca->cpuusage);
-out_free_ca:
-	kfree(ca);
-out:
-	return ERR_PTR(-ENOMEM);
-}
-
-/* destroy an existing cpu accounting group */
-static void cpuacct_css_free(struct cgroup *cgrp)
-{
-	struct cpuacct *ca = cgroup_ca(cgrp);
-
-	free_percpu(ca->cpustat);
-	free_percpu(ca->cpuusage);
-	kfree(ca);
-}
-
-static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
-{
-	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-	u64 data;
-
-#ifndef CONFIG_64BIT
-	/*
-	 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
-	 */
-	raw_spin_lock_irq(&cpu_rq(cpu)->lock);
-	data = *cpuusage;
-	raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
-	data = *cpuusage;
-#endif
-
-	return data;
-}
-
-static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
-{
-	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-
-#ifndef CONFIG_64BIT
-	/*
-	 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
-	 */
-	raw_spin_lock_irq(&cpu_rq(cpu)->lock);
-	*cpuusage = val;
-	raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
-	*cpuusage = val;
-#endif
-}
-
-/* return total cpu usage (in nanoseconds) of a group */
-static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
-{
-	struct cpuacct *ca = cgroup_ca(cgrp);
-	u64 totalcpuusage = 0;
-	int i;
-
-	for_each_present_cpu(i)
-		totalcpuusage += cpuacct_cpuusage_read(ca, i);
-
-	return totalcpuusage;
-}
-
-static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
-			  u64 reset)
-{
-	struct cpuacct *ca = cgroup_ca(cgrp);
-	int err = 0;
-	int i;
-
-	if (reset) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	for_each_present_cpu(i)
-		cpuacct_cpuusage_write(ca, i, 0);
-
-out:
-	return err;
-}
-
-static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
-				   struct seq_file *m)
-{
-	struct cpuacct *ca = cgroup_ca(cgroup);
-	u64 percpu;
-	int i;
-
-	for_each_present_cpu(i) {
-		percpu = cpuacct_cpuusage_read(ca, i);
-		seq_printf(m, "%llu ", (unsigned long long) percpu);
-	}
-	seq_printf(m, "\n");
-	return 0;
-}
-
-static const char *cpuacct_stat_desc[] = {
-	[CPUACCT_STAT_USER] = "user",
-	[CPUACCT_STAT_SYSTEM] = "system",
-};
-
-static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
-			      struct cgroup_map_cb *cb)
-{
-	struct cpuacct *ca = cgroup_ca(cgrp);
-	int cpu;
-	s64 val = 0;
-
-	for_each_online_cpu(cpu) {
-		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
-		val += kcpustat->cpustat[CPUTIME_USER];
-		val += kcpustat->cpustat[CPUTIME_NICE];
-	}
-	val = cputime64_to_clock_t(val);
-	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
-
-	val = 0;
-	for_each_online_cpu(cpu) {
-		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
-		val += kcpustat->cpustat[CPUTIME_SYSTEM];
-		val += kcpustat->cpustat[CPUTIME_IRQ];
-		val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
-	}
-
-	val = cputime64_to_clock_t(val);
-	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
-
-	return 0;
-}
-
-static struct cftype files[] = {
-	{
-		.name = "usage",
-		.read_u64 = cpuusage_read,
-		.write_u64 = cpuusage_write,
-	},
-	{
-		.name = "usage_percpu",
-		.read_seq_string = cpuacct_percpu_seq_read,
-	},
-	{
-		.name = "stat",
-		.read_map = cpuacct_stats_show,
-	},
-	{ }	/* terminate */
-};
-
-/*
- * charge this task's execution time to its accounting group.
- *
- * called with rq->lock held.
- */
-void cpuacct_charge(struct task_struct *tsk, u64 cputime)
-{
-	struct cpuacct *ca;
-	int cpu;
-
-	if (unlikely(!cpuacct_subsys.active))
-		return;
-
-	cpu = task_cpu(tsk);
-
-	rcu_read_lock();
-
-	ca = task_ca(tsk);
-
-	for (; ca; ca = parent_ca(ca)) {
-		u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-		*cpuusage += cputime;
-	}
-
-	rcu_read_unlock();
-}
-
-struct cgroup_subsys cpuacct_subsys = {
-	.name = "cpuacct",
-	.css_alloc = cpuacct_css_alloc,
-	.css_free = cpuacct_css_free,
-	.subsys_id = cpuacct_subsys_id,
-	.base_cftypes = files,
-};
-#endif /* CONFIG_CGROUP_CPUACCT */
-
 void dump_cpu_task(int cpu)
 {
 	pr_info("Task dump for CPU %d:\n", cpu);