Diffstat (limited to 'kernel/sched/core.c')
 kernel/sched/core.c | 231
 1 file changed, 142 insertions(+), 89 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6edbef296ece..3c4d096544ce 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -555,12 +555,15 @@ void resched_cpu(int cpu)
  * selecting an idle cpu will add more delays to the timers than intended
  * (as that cpu's timer base may not be uptodate wrt jiffies etc).
  */
-int get_nohz_timer_target(void)
+int get_nohz_timer_target(int pinned)
 {
 	int cpu = smp_processor_id();
 	int i;
 	struct sched_domain *sd;
 
+	if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
+		return cpu;
+
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
 		for_each_cpu(i, sched_domain_span(sd)) {
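
The new pinned argument lets a caller that must keep its timer on the current CPU skip the idle-CPU search entirely; the search is also skipped when timer migration is disabled via sysctl or the local CPU is busy anyway. Below is a minimal userspace sketch of that decision order; the stubbed predicates and pick_timer_cpu() are illustrative stand-ins, not kernel code.

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the kernel predicates consulted by get_nohz_timer_target(). */
static bool timer_migration_enabled = true;	/* get_sysctl_timer_migration() */
static bool this_cpu_is_idle = true;		/* idle_cpu(cpu) */

/* Mirrors the early-return order added above: pinned timers, disabled
 * migration, or a busy local CPU all keep the timer where it is. */
static int pick_timer_cpu(int this_cpu, int pinned)
{
	if (pinned || !timer_migration_enabled || !this_cpu_is_idle)
		return this_cpu;
	/* The kernel would now walk the sched domains for a busy CPU;
	 * a fixed neighbour stands in for that search here. */
	return this_cpu + 1;
}

int main(void)
{
	printf("pinned timer   -> CPU %d\n", pick_timer_cpu(0, 1));
	printf("unpinned timer -> CPU %d\n", pick_timer_cpu(0, 0));
	return 0;
}
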
@@ -823,19 +826,13 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 #endif
 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
 	if (static_key_false((&paravirt_steal_rq_enabled))) {
-		u64 st;
-
 		steal = paravirt_steal_clock(cpu_of(rq));
 		steal -= rq->prev_steal_time_rq;
 
 		if (unlikely(steal > delta))
 			steal = delta;
 
-		st = steal_ticks(steal);
-		steal = st * TICK_NSEC;
-
 		rq->prev_steal_time_rq += steal;
-
 		delta -= steal;
 	}
 #endif
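
The dropped steal_ticks() step used to round the accumulated steal time down to whole scheduler ticks before charging it, so the rq clock now absorbs steal time at nanosecond resolution. As a rough worked example (assuming HZ=1000, so TICK_NSEC is 1,000,000): a 1,500,000 ns steal sample was previously charged as 1,000,000 ns, with the remaining 500,000 ns only picked up once later samples pushed it past the next tick boundary; after this change the full 1,500,000 ns is subtracted from delta immediately.
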
@@ -1745,8 +1742,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
 	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
 	p->numa_work.next = &p->numa_work;
-	p->numa_faults = NULL;
-	p->numa_faults_buffer = NULL;
+	p->numa_faults_memory = NULL;
+	p->numa_faults_buffer_memory = NULL;
+	p->last_task_numa_placement = 0;
+	p->last_sum_exec_runtime = 0;
 
 	INIT_LIST_HEAD(&p->numa_entry);
 	p->numa_group = NULL;
@@ -2149,8 +2148,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	if (mm)
 		mmdrop(mm);
 	if (unlikely(prev_state == TASK_DEAD)) {
-		task_numa_free(prev);
-
 		if (prev->sched_class->task_dead)
 			prev->sched_class->task_dead(prev);
 
@@ -2167,13 +2164,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 
 #ifdef CONFIG_SMP
 
-/* assumes rq->lock is held */
-static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
-{
-	if (prev->sched_class->pre_schedule)
-		prev->sched_class->pre_schedule(rq, prev);
-}
-
 /* rq->lock is NOT held, but preemption is disabled */
 static inline void post_schedule(struct rq *rq)
 {
@@ -2191,10 +2181,6 @@ static inline void post_schedule(struct rq *rq)
 
 #else
 
-static inline void pre_schedule(struct rq *rq, struct task_struct *p)
-{
-}
-
 static inline void post_schedule(struct rq *rq)
 {
 }
@@ -2510,8 +2496,13 @@ void __kprobes preempt_count_add(int val)
 	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
 				PREEMPT_MASK - 10);
 #endif
-	if (preempt_count() == val)
-		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+	if (preempt_count() == val) {
+		unsigned long ip = get_parent_ip(CALLER_ADDR1);
+#ifdef CONFIG_DEBUG_PREEMPT
+		current->preempt_disable_ip = ip;
+#endif
+		trace_preempt_off(CALLER_ADDR0, ip);
+	}
 }
 EXPORT_SYMBOL(preempt_count_add);
 
@@ -2554,6 +2545,13 @@ static noinline void __schedule_bug(struct task_struct *prev)
 	print_modules();
 	if (irqs_disabled())
 		print_irqtrace_events(prev);
+#ifdef CONFIG_DEBUG_PREEMPT
+	if (in_atomic_preempt_off()) {
+		pr_err("Preemption disabled at:");
+		print_ip_sym(current->preempt_disable_ip);
+		pr_cont("\n");
+	}
+#endif
 	dump_stack();
 	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 }
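
These two hunks cooperate: with CONFIG_DEBUG_PREEMPT, preempt_count_add() now records the instruction pointer of the outermost preempt-disable site, and __schedule_bug() prints it when a task schedules while atomic. A toy userspace model of that record-then-report pattern; my_preempt_disable(), report_if_atomic() and the use of __builtin_return_address() are invented for illustration and are not the kernel helpers.

#include <stdio.h>

/* Toy model: remember the call site that first took the "preempt" count
 * from zero, and report it if a later check finds the count still held. */
static int preempt_count;
static void *preempt_disable_ip;

#define my_preempt_disable() do {					\
	if (preempt_count++ == 0)					\
		preempt_disable_ip = __builtin_return_address(0);	\
} while (0)

static void report_if_atomic(void)
{
	if (preempt_count)
		fprintf(stderr, "Preemption disabled at: %p\n",
			preempt_disable_ip);
}

static void outer(void)
{
	my_preempt_disable();	/* records the return address into main() */
	report_if_atomic();
}

int main(void)
{
	outer();
	return 0;
}
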
@@ -2577,36 +2575,34 @@ static inline void schedule_debug(struct task_struct *prev)
 	schedstat_inc(this_rq(), sched_count);
 }
 
-static void put_prev_task(struct rq *rq, struct task_struct *prev)
-{
-	if (prev->on_rq || rq->skip_clock_update < 0)
-		update_rq_clock(rq);
-	prev->sched_class->put_prev_task(rq, prev);
-}
-
 /*
  * Pick up the highest-prio task:
  */
 static inline struct task_struct *
-pick_next_task(struct rq *rq)
+pick_next_task(struct rq *rq, struct task_struct *prev)
 {
-	const struct sched_class *class;
+	const struct sched_class *class = &fair_sched_class;
 	struct task_struct *p;
 
 	/*
 	 * Optimization: we know that if all tasks are in
 	 * the fair class we can call that function directly:
 	 */
-	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
-		p = fair_sched_class.pick_next_task(rq);
-		if (likely(p))
+	if (likely(prev->sched_class == class &&
+		   rq->nr_running == rq->cfs.h_nr_running)) {
+		p = fair_sched_class.pick_next_task(rq, prev);
+		if (likely(p && p != RETRY_TASK))
 			return p;
 	}
 
+again:
 	for_each_class(class) {
-		p = class->pick_next_task(rq);
-		if (p)
+		p = class->pick_next_task(rq, prev);
+		if (p) {
+			if (unlikely(p == RETRY_TASK))
+				goto again;
 			return p;
+		}
 	}
 
 	BUG(); /* the idle class will always have a runnable task */
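
pick_next_task() now receives prev and restarts the class walk whenever a class hands back the RETRY_TASK sentinel, which a class may do after dropping rq->lock, since a higher-priority class could have gained runnable tasks in the meantime. A compact stand-alone sketch of that retry protocol, with the scheduling classes reduced to plain function pointers and all names invented for illustration:

#include <stddef.h>
#include <stdio.h>

struct task { const char *name; };

/* Sentinel meaning "state changed under me, restart from the top class". */
#define RETRY_TASK ((struct task *)-1)

typedef struct task *(*pick_fn)(void);

static int rt_retry_once = 1;

static struct task *pick_rt(void)
{
	static struct task rt_task = { "rt" };

	/* Model a class that dropped the lock once and asks for a re-walk. */
	if (rt_retry_once--)
		return RETRY_TASK;
	return &rt_task;
}

static struct task *pick_idle(void)
{
	static struct task idle_task = { "idle" };
	return &idle_task;
}

static pick_fn classes[] = { pick_rt, pick_idle };

static struct task *pick_next_task(void)
{
	size_t i;
again:
	for (i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
		struct task *p = classes[i]();
		if (p) {
			if (p == RETRY_TASK)
				goto again;
			return p;
		}
	}
	return NULL;	/* the kernel BUG()s here: idle always has a task */
}

int main(void)
{
	printf("picked: %s\n", pick_next_task()->name);
	return 0;
}
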
@@ -2700,13 +2696,10 @@ need_resched:
 		switch_count = &prev->nvcsw;
 	}
 
-	pre_schedule(rq, prev);
-
-	if (unlikely(!rq->nr_running))
-		idle_balance(cpu, rq);
+	if (prev->on_rq || rq->skip_clock_update < 0)
+		update_rq_clock(rq);
 
-	put_prev_task(rq, prev);
-	next = pick_next_task(rq);
+	next = pick_next_task(rq, prev);
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
 	rq->skip_clock_update = 0;
@@ -2908,7 +2901,8 @@ EXPORT_SYMBOL(sleep_on_timeout);
  * This function changes the 'effective' priority of a task. It does
  * not touch ->normal_prio like __setscheduler().
  *
- * Used by the rt_mutex code to implement priority inheritance logic.
+ * Used by the rt_mutex code to implement priority inheritance
+ * logic. Call site only calls if the priority of the task changed.
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
@@ -2998,7 +2992,7 @@ void set_user_nice(struct task_struct *p, long nice)
 	unsigned long flags;
 	struct rq *rq;
 
-	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
+	if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
 		return;
 	/*
 	 * We have to be careful, if called from sys_setpriority(),
@@ -3076,11 +3070,11 @@ SYSCALL_DEFINE1(nice, int, increment)
 	if (increment > 40)
 		increment = 40;
 
-	nice = TASK_NICE(current) + increment;
-	if (nice < -20)
-		nice = -20;
-	if (nice > 19)
-		nice = 19;
+	nice = task_nice(current) + increment;
+	if (nice < MIN_NICE)
+		nice = MIN_NICE;
+	if (nice > MAX_NICE)
+		nice = MAX_NICE;
 
 	if (increment < 0 && !can_nice(current, nice))
 		return -EPERM;
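
MIN_NICE and MAX_NICE are the named equivalents of the old -20 and 19 literals, so the arithmetic is unchanged. For example, a task at nice 10 calling nice(-70) first has the increment clamped (the symmetric -40 clamp sits just above this hunk), giving 10 - 40 = -30, which is then clamped to MIN_NICE (-20); because the increment is negative, can_nice() must still approve the result before set_user_nice() is called.
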
@@ -3109,18 +3103,6 @@ int task_prio(const struct task_struct *p)
 }
 
 /**
- * task_nice - return the nice value of a given task.
- * @p: the task in question.
- *
- * Return: The nice value [ -20 ... 0 ... 19 ].
- */
-int task_nice(const struct task_struct *p)
-{
-	return TASK_NICE(p);
-}
-EXPORT_SYMBOL(task_nice);
-
-/**
  * idle_cpu - is a given cpu idle currently?
  * @cpu: the processor in question.
  *
@@ -3189,9 +3171,8 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
 	dl_se->dl_new = 1;
 }
 
-/* Actually do priority change: must hold pi & rq lock. */
-static void __setscheduler(struct rq *rq, struct task_struct *p,
-			   const struct sched_attr *attr)
+static void __setscheduler_params(struct task_struct *p,
+		const struct sched_attr *attr)
 {
 	int policy = attr->sched_policy;
 
@@ -3211,9 +3192,21 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
 	 * getparam()/getattr() don't report silly values for !rt tasks.
 	 */
 	p->rt_priority = attr->sched_priority;
-
 	p->normal_prio = normal_prio(p);
-	p->prio = rt_mutex_getprio(p);
+	set_load_weight(p);
+}
+
+/* Actually do priority change: must hold pi & rq lock. */
+static void __setscheduler(struct rq *rq, struct task_struct *p,
+			   const struct sched_attr *attr)
+{
+	__setscheduler_params(p, attr);
+
+	/*
+	 * If we get here, there was no pi waiters boosting the
+	 * task. It is safe to use the normal prio.
+	 */
+	p->prio = normal_prio(p);
 
 	if (dl_prio(p->prio))
 		p->sched_class = &dl_sched_class;
@@ -3221,8 +3214,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
 		p->sched_class = &rt_sched_class;
 	else
 		p->sched_class = &fair_sched_class;
-
-	set_load_weight(p);
 }
 
 static void
@@ -3275,6 +3266,8 @@ static int __sched_setscheduler(struct task_struct *p,
 				const struct sched_attr *attr,
 				bool user)
 {
+	int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
+		      MAX_RT_PRIO - 1 - attr->sched_priority;
 	int retval, oldprio, oldpolicy = -1, on_rq, running;
 	int policy = attr->sched_policy;
 	unsigned long flags;
@@ -3319,7 +3312,7 @@ recheck:
 	 */
 	if (user && !capable(CAP_SYS_NICE)) {
 		if (fair_policy(policy)) {
-			if (attr->sched_nice < TASK_NICE(p) &&
+			if (attr->sched_nice < task_nice(p) &&
 			    !can_nice(p, attr->sched_nice))
 				return -EPERM;
 		}
@@ -3338,12 +3331,21 @@ recheck:
 			return -EPERM;
 		}
 
+		/*
+		 * Can't set/change SCHED_DEADLINE policy at all for now
+		 * (safest behavior); in the future we would like to allow
+		 * unprivileged DL tasks to increase their relative deadline
+		 * or reduce their runtime (both ways reducing utilization)
+		 */
+		if (dl_policy(policy))
+			return -EPERM;
+
 		/*
 		 * Treat SCHED_IDLE as nice 20. Only allow a switch to
 		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
 		 */
 		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
-			if (!can_nice(p, TASK_NICE(p)))
+			if (!can_nice(p, task_nice(p)))
 				return -EPERM;
 		}
 
@@ -3380,16 +3382,18 @@ recheck:
 	}
 
 	/*
-	 * If not changing anything there's no need to proceed further:
+	 * If not changing anything there's no need to proceed further,
+	 * but store a possible modification of reset_on_fork.
 	 */
 	if (unlikely(policy == p->policy)) {
-		if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p))
+		if (fair_policy(policy) && attr->sched_nice != task_nice(p))
 			goto change;
 		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
 			goto change;
 		if (dl_policy(policy))
 			goto change;
 
+		p->sched_reset_on_fork = reset_on_fork;
 		task_rq_unlock(rq, p, &flags);
 		return 0;
 	}
@@ -3443,6 +3447,24 @@ change:
 		return -EBUSY;
 	}
 
+	p->sched_reset_on_fork = reset_on_fork;
+	oldprio = p->prio;
+
+	/*
+	 * Special case for priority boosted tasks.
+	 *
+	 * If the new priority is lower or equal (user space view)
+	 * than the current (boosted) priority, we just store the new
+	 * normal parameters and do not touch the scheduler class and
+	 * the runqueue. This will be done when the task deboost
+	 * itself.
+	 */
+	if (rt_mutex_check_prio(p, newprio)) {
+		__setscheduler_params(p, attr);
+		task_rq_unlock(rq, p, &flags);
+		return 0;
+	}
+
 	on_rq = p->on_rq;
 	running = task_current(rq, p);
 	if (on_rq)
@@ -3450,16 +3472,18 @@ change:
 	if (running)
 		p->sched_class->put_prev_task(rq, p);
 
-	p->sched_reset_on_fork = reset_on_fork;
-
-	oldprio = p->prio;
 	prev_class = p->sched_class;
 	__setscheduler(rq, p, attr);
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (on_rq)
-		enqueue_task(rq, p, 0);
+	if (on_rq) {
+		/*
+		 * We enqueue to tail when the priority of a task is
+		 * increased (user space view).
+		 */
+		enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+	}
 
 	check_class_changed(rq, p, prev_class, oldprio);
 	task_rq_unlock(rq, p, &flags);
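
One detail in the enqueue flag above is easy to miss: kernel prio values are inverted relative to the user-space view (a lower number means higher priority), so oldprio <= p->prio means the task did not gain priority and it is requeued at the head of its (same or lower) priority list, while a task whose priority rose is queued at the tail and waits behind existing peers. As a rough example, an RT task moved from rt_priority 10 to 50 goes from prio 89 to prio 49; oldprio > p->prio, so the flag is 0 and it is enqueued at the tail.
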
@@ -3615,7 +3639,7 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
 	 * XXX: do we want to be lenient like existing syscalls; or do we want
 	 * to be strict and return an error on out-of-bounds values?
 	 */
-	attr->sched_nice = clamp(attr->sched_nice, -20, 19);
+	attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
 
 out:
 	return ret;
@@ -3836,7 +3860,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 	else if (task_has_rt_policy(p))
 		attr.sched_priority = p->rt_priority;
 	else
-		attr.sched_nice = TASK_NICE(p);
+		attr.sched_nice = task_nice(p);
 
 	rcu_read_unlock();
 
@@ -4474,6 +4498,7 @@ void init_idle(struct task_struct *idle, int cpu)
 	rcu_read_unlock();
 
 	rq->curr = rq->idle = idle;
+	idle->on_rq = 1;
 #if defined(CONFIG_SMP)
 	idle->on_cpu = 1;
 #endif
@@ -4693,8 +4718,10 @@ void idle_task_exit(void)
 
 	BUG_ON(cpu_online(smp_processor_id()));
 
-	if (mm != &init_mm)
+	if (mm != &init_mm) {
 		switch_mm(mm, &init_mm, current);
+		finish_arch_post_lock_switch();
+	}
 	mmdrop(mm);
 }
 
@@ -4712,6 +4739,22 @@ static void calc_load_migrate(struct rq *rq)
 	atomic_long_add(delta, &calc_load_tasks);
 }
 
+static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+{
+}
+
+static const struct sched_class fake_sched_class = {
+	.put_prev_task = put_prev_task_fake,
+};
+
+static struct task_struct fake_task = {
+	/*
+	 * Avoid pull_{rt,dl}_task()
+	 */
+	.prio = MAX_PRIO + 1,
+	.sched_class = &fake_sched_class,
+};
+
 /*
  * Migrate all tasks from the rq, sleeping tasks will be migrated by
  * try_to_wake_up()->select_task_rq().
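
migrate_tasks() has no meaningful prev to hand to the new pick_next_task(rq, prev) signature, so it passes this fake task, whose class supplies a callable but empty put_prev_task() and whose prio is set past MAX_PRIO so that, as the comment notes, the rt/dl pull logic leaves it alone. That is the classic null-object pattern; a stand-alone sketch with invented names, not kernel code:

#include <stdio.h>

struct rq;
struct task;

/* The callback a "scheduling class" must provide to the core picker. */
struct sched_class_ops {
	void (*put_prev_task)(struct rq *rq, struct task *prev);
};

struct task {
	int prio;
	const struct sched_class_ops *ops;
};

/* Null object: callable, but deliberately does nothing. */
static void put_prev_task_fake(struct rq *rq, struct task *prev)
{
	(void)rq;
	(void)prev;
}

static const struct sched_class_ops fake_ops = {
	.put_prev_task = put_prev_task_fake,
};

/* Numerically high prio, i.e. lower than any real task's priority. */
static struct task fake_task = {
	.prio = 141,		/* stands in for MAX_PRIO + 1 */
	.ops = &fake_ops,
};

static void core_picker(struct rq *rq, struct task *prev)
{
	/* The core can call back unconditionally, no NULL checks needed. */
	prev->ops->put_prev_task(rq, prev);
	printf("picked next after prev with prio %d\n", prev->prio);
}

int main(void)
{
	core_picker(NULL, &fake_task);
	return 0;
}
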
@@ -4752,7 +4795,7 @@ static void migrate_tasks(unsigned int dead_cpu)
 		if (rq->nr_running == 1)
 			break;
 
-		next = pick_next_task(rq);
+		next = pick_next_task(rq, &fake_task);
 		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
 
@@ -4842,7 +4885,7 @@ set_table_entry(struct ctl_table *entry,
 static struct ctl_table *
 sd_alloc_ctl_domain_table(struct sched_domain *sd)
 {
-	struct ctl_table *table = sd_alloc_ctl_entry(13);
+	struct ctl_table *table = sd_alloc_ctl_entry(14);
 
 	if (table == NULL)
 		return NULL;
@@ -4870,9 +4913,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
 		sizeof(int), 0644, proc_dointvec_minmax, false);
 	set_table_entry(&table[10], "flags", &sd->flags,
 		sizeof(int), 0644, proc_dointvec_minmax, false);
-	set_table_entry(&table[11], "name", sd->name,
+	set_table_entry(&table[11], "max_newidle_lb_cost",
+		&sd->max_newidle_lb_cost,
+		sizeof(long), 0644, proc_doulongvec_minmax, false);
+	set_table_entry(&table[12], "name", sd->name,
 		CORENAME_MAX_SIZE, 0444, proc_dostring, false);
-	/* &table[12] is terminator */
+	/* &table[13] is terminator */
 
 	return table;
 }
@@ -6849,7 +6895,6 @@ void __init sched_init(void)
 
 		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
 #ifdef CONFIG_RT_GROUP_SCHED
-		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
 		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
 #endif
 
@@ -6938,7 +6983,8 @@ void __might_sleep(const char *file, int line, int preempt_offset)
 	static unsigned long prev_jiffy; /* ratelimiting */
 
 	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
-	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
+	if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
+	     !is_idle_task(current)) ||
 	    system_state != SYSTEM_RUNNING || oops_in_progress)
 		return;
 	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
@@ -6956,6 +7002,13 @@ void __might_sleep(const char *file, int line, int preempt_offset)
 	debug_show_held_locks(current);
 	if (irqs_disabled())
 		print_irqtrace_events(current);
+#ifdef CONFIG_DEBUG_PREEMPT
+	if (!preempt_count_equals(preempt_offset)) {
+		pr_err("Preemption disabled at:");
+		print_ip_sym(current->preempt_disable_ip);
+		pr_cont("\n");
+	}
+#endif
 	dump_stack();
 }
 EXPORT_SYMBOL(__might_sleep);
@@ -7009,7 +7062,7 @@ void normalize_rt_tasks(void)
 			 * Renice negative nice level userspace
 			 * tasks back to 0:
 			 */
-			if (TASK_NICE(p) < 0 && p->mm)
+			if (task_nice(p) < 0 && p->mm)
 				set_user_nice(p, 0);
 			continue;
 		}