author     Linus Torvalds <torvalds@linux-foundation.org>  2014-03-31 14:21:19 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-03-31 14:21:19 -0400
commit     971eae7c99212dd67b425a603f1fe3b763359907 (patch)
tree       2ff002ecc759275cbecee123a230f90ea7452b18 /kernel
parent     8c292f11744297dfb3a69f4a0bccbe4a6417b50d (diff)
parent     6037dd1a49f95092824fa8ba75c717ff7805e317 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
 "Bigger changes:

   - sched/idle restructuring: they are WIP preparation for deeper
     integration between the scheduler and idle state selection, by
     Nicolas Pitre.

   - add NUMA scheduling pseudo-interleaving, by Rik van Riel.

   - optimize cgroup context switches, by Peter Zijlstra.

   - RT scheduling enhancements, by Thomas Gleixner.

  The rest is smaller changes, non-urgent fixes and cleanups"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (68 commits)
  sched: Clean up the task_hot() function
  sched: Remove double calculation in fix_small_imbalance()
  sched: Fix broken setscheduler()
  sparc64, sched: Remove unused sparc64_multi_core
  sched: Remove unused mc_capable() and smt_capable()
  sched/numa: Move task_numa_free() to __put_task_struct()
  sched/fair: Fix endless loop in idle_balance()
  sched/core: Fix endless loop in pick_next_task()
  sched/fair: Push down check for high priority class task into idle_balance()
  sched/rt: Fix picking RT and DL tasks from empty queue
  trace: Replace hardcoding of 19 with MAX_NICE
  sched: Guarantee task priority in pick_next_task()
  sched/idle: Remove stale old file
  sched: Put rq's sched_avg under CONFIG_FAIR_GROUP_SCHED
  cpuidle/arm64: Remove redundant cpuidle_idle_call()
  cpuidle/powernv: Remove redundant cpuidle_idle_call()
  sched, nohz: Exclude isolated cores from load balancing
  sched: Fix select_task_rq_fair() description comments
  workqueue: Replace hardcoding of -20 and 19 with MIN_NICE and MAX_NICE
  sys: Replace hardcoding of -20 and 19 with MIN_NICE and MAX_NICE
  ...
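Several commits in this series ("sched/core: Fix endless loop in pick_next_task()", "sched: Guarantee task priority in pick_next_task()") rework the core pick-next path so that a scheduling class may return RETRY_TASK, asking the core to restart class iteration from the highest class; the kernel/sched/core.c hunk below shows the real code. What follows is only a minimal user-space sketch of that retry protocol, with illustrative names (toy_class, classes[], pick_rt) that are not kernel APIs:

#include <stddef.h>
#include <stdio.h>

struct task { const char *name; };

/* Sentinel a class returns to request a restart of class iteration. */
#define RETRY_TASK ((struct task *)-1UL)

struct toy_class {
	/* Returns a task, NULL (nothing runnable), or RETRY_TASK. */
	struct task *(*pick_next_task)(void);
};

static struct task rt_task   = { "rt" };
static struct task idle_task = { "idle" };

static int rt_calls;

static struct task *pick_rt(void)
{
	/* First call pretends a balance pass just pulled in new work. */
	return rt_calls++ ? &rt_task : RETRY_TASK;
}

static struct task *pick_idle(void)
{
	return &idle_task;
}

static struct toy_class classes[] = {
	{ .pick_next_task = pick_rt },   /* higher-priority class */
	{ .pick_next_task = pick_idle }, /* idle class never returns NULL */
};

static struct task *pick_next_task(void)
{
	struct task *p;
	size_t i;

again:
	for (i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
		p = classes[i].pick_next_task();
		if (p) {
			if (p == RETRY_TASK)
				goto again; /* restart from the highest class */
			return p;
		}
	}
	return NULL; /* not reached: the idle class always has a task */
}

int main(void)
{
	printf("picked: %s\n", pick_next_task()->name); /* prints "picked: rt" */
	return 0;
}

The restart matters because a lower class's pick (or the balancing it triggers) can make a higher-priority task runnable, so iteration has to begin again at the top to preserve strict class priority.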
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile                                           1
-rw-r--r--  kernel/cpu/Makefile                                       1
-rw-r--r--  kernel/fork.c                                             1
-rw-r--r--  kernel/locking/rtmutex.c                                 12
-rw-r--r--  kernel/rcu/rcutorture.c                                   8
-rw-r--r--  kernel/sched/Makefile                                     2
-rw-r--r--  kernel/sched/auto_group.c                                 2
-rw-r--r--  kernel/sched/core.c                                     207
-rw-r--r--  kernel/sched/cputime.c                                    4
-rw-r--r--  kernel/sched/deadline.c                                  56
-rw-r--r--  kernel/sched/debug.c                                      7
-rw-r--r--  kernel/sched/fair.c                                     600
-rw-r--r--  kernel/sched/idle.c (renamed from kernel/cpu/idle.c)      7
-rw-r--r--  kernel/sched/idle_task.c                                 25
-rw-r--r--  kernel/sched/rt.c                                       102
-rw-r--r--  kernel/sched/sched.h                                     65
-rw-r--r--  kernel/sched/stop_task.c                                 15
-rw-r--r--  kernel/sys.c                                              8
-rw-r--r--  kernel/sysctl.c                                           7
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c                      6
-rw-r--r--  kernel/workqueue.c                                        2
21 files changed, 788 insertions, 350 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 5c0e7666811d..4fd847488b76 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -22,7 +22,6 @@ obj-y += sched/
22obj-y += locking/ 22obj-y += locking/
23obj-y += power/ 23obj-y += power/
24obj-y += printk/ 24obj-y += printk/
25obj-y += cpu/
26obj-y += irq/ 25obj-y += irq/
27obj-y += rcu/ 26obj-y += rcu/
28 27
diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile
deleted file mode 100644
index 59ab052ef7a0..000000000000
--- a/kernel/cpu/Makefile
+++ /dev/null
@@ -1 +0,0 @@
1obj-y = idle.o
diff --git a/kernel/fork.c b/kernel/fork.c
index a17621c6cd42..332688e5e7b4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -237,6 +237,7 @@ void __put_task_struct(struct task_struct *tsk)
237 WARN_ON(atomic_read(&tsk->usage)); 237 WARN_ON(atomic_read(&tsk->usage));
238 WARN_ON(tsk == current); 238 WARN_ON(tsk == current);
239 239
240 task_numa_free(tsk);
240 security_task_free(tsk); 241 security_task_free(tsk);
241 exit_creds(tsk); 242 exit_creds(tsk);
242 delayacct_tsk_free(tsk); 243 delayacct_tsk_free(tsk);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 2e960a2bab81..aa4dff04b594 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -213,6 +213,18 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
213} 213}
214 214
215/* 215/*
216 * Called by sched_setscheduler() to check whether the priority change
217 * is overruled by a possible priority boosting.
218 */
219int rt_mutex_check_prio(struct task_struct *task, int newprio)
220{
221 if (!task_has_pi_waiters(task))
222 return 0;
223
224 return task_top_pi_waiter(task)->task->prio <= newprio;
225}
226
227/*
216 * Adjust the priority of a task, after its pi_waiters got modified. 228 * Adjust the priority of a task, after its pi_waiters got modified.
217 * 229 *
218 * This can be both boosting and unboosting. task->pi_lock must be held. 230 * This can be both boosting and unboosting. task->pi_lock must be held.
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index f59d48597dde..bd30bc61bc05 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -696,7 +696,7 @@ rcu_torture_writer(void *arg)
696 static DEFINE_TORTURE_RANDOM(rand); 696 static DEFINE_TORTURE_RANDOM(rand);
697 697
698 VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); 698 VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
699 set_user_nice(current, 19); 699 set_user_nice(current, MAX_NICE);
700 700
701 do { 701 do {
702 schedule_timeout_uninterruptible(1); 702 schedule_timeout_uninterruptible(1);
@@ -759,7 +759,7 @@ rcu_torture_fakewriter(void *arg)
759 DEFINE_TORTURE_RANDOM(rand); 759 DEFINE_TORTURE_RANDOM(rand);
760 760
761 VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started"); 761 VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started");
762 set_user_nice(current, 19); 762 set_user_nice(current, MAX_NICE);
763 763
764 do { 764 do {
765 schedule_timeout_uninterruptible(1 + torture_random(&rand)%10); 765 schedule_timeout_uninterruptible(1 + torture_random(&rand)%10);
@@ -872,7 +872,7 @@ rcu_torture_reader(void *arg)
872 unsigned long long ts; 872 unsigned long long ts;
873 873
874 VERBOSE_TOROUT_STRING("rcu_torture_reader task started"); 874 VERBOSE_TOROUT_STRING("rcu_torture_reader task started");
875 set_user_nice(current, 19); 875 set_user_nice(current, MAX_NICE);
876 if (irqreader && cur_ops->irq_capable) 876 if (irqreader && cur_ops->irq_capable)
877 setup_timer_on_stack(&t, rcu_torture_timer, 0); 877 setup_timer_on_stack(&t, rcu_torture_timer, 0);
878 878
@@ -1161,7 +1161,7 @@ static int rcu_torture_barrier_cbs(void *arg)
1161 1161
1162 init_rcu_head_on_stack(&rcu); 1162 init_rcu_head_on_stack(&rcu);
1163 VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task started"); 1163 VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task started");
1164 set_user_nice(current, 19); 1164 set_user_nice(current, MAX_NICE);
1165 do { 1165 do {
1166 wait_event(barrier_cbs_wq[myid], 1166 wait_event(barrier_cbs_wq[myid],
1167 (newphase = 1167 (newphase =
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a95c8c2af2a..ab32b7b0db5c 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -13,7 +13,7 @@ endif
13 13
14obj-y += core.o proc.o clock.o cputime.o 14obj-y += core.o proc.o clock.o cputime.o
15obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o 15obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
16obj-y += wait.o completion.o 16obj-y += wait.o completion.o idle.o
17obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o 17obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
18obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 18obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
19obj-$(CONFIG_SCHEDSTATS) += stats.o 19obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 4a073539c58e..e73efba98301 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -203,7 +203,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
203 struct autogroup *ag; 203 struct autogroup *ag;
204 int err; 204 int err;
205 205
206 if (nice < -20 || nice > 19) 206 if (nice < MIN_NICE || nice > MAX_NICE)
207 return -EINVAL; 207 return -EINVAL;
208 208
209 err = security_task_setnice(current, nice); 209 err = security_task_setnice(current, nice);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f5c6635b806c..ae365aaa8181 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1745,8 +1745,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1745 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1745 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1746 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1746 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1747 p->numa_work.next = &p->numa_work; 1747 p->numa_work.next = &p->numa_work;
1748 p->numa_faults = NULL; 1748 p->numa_faults_memory = NULL;
1749 p->numa_faults_buffer = NULL; 1749 p->numa_faults_buffer_memory = NULL;
1750 p->last_task_numa_placement = 0;
1751 p->last_sum_exec_runtime = 0;
1750 1752
1751 INIT_LIST_HEAD(&p->numa_entry); 1753 INIT_LIST_HEAD(&p->numa_entry);
1752 p->numa_group = NULL; 1754 p->numa_group = NULL;
@@ -2149,8 +2151,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2149 if (mm) 2151 if (mm)
2150 mmdrop(mm); 2152 mmdrop(mm);
2151 if (unlikely(prev_state == TASK_DEAD)) { 2153 if (unlikely(prev_state == TASK_DEAD)) {
2152 task_numa_free(prev);
2153
2154 if (prev->sched_class->task_dead) 2154 if (prev->sched_class->task_dead)
2155 prev->sched_class->task_dead(prev); 2155 prev->sched_class->task_dead(prev);
2156 2156
@@ -2167,13 +2167,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2167 2167
2168#ifdef CONFIG_SMP 2168#ifdef CONFIG_SMP
2169 2169
2170/* assumes rq->lock is held */
2171static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2172{
2173 if (prev->sched_class->pre_schedule)
2174 prev->sched_class->pre_schedule(rq, prev);
2175}
2176
2177/* rq->lock is NOT held, but preemption is disabled */ 2170/* rq->lock is NOT held, but preemption is disabled */
2178static inline void post_schedule(struct rq *rq) 2171static inline void post_schedule(struct rq *rq)
2179{ 2172{
@@ -2191,10 +2184,6 @@ static inline void post_schedule(struct rq *rq)
2191 2184
2192#else 2185#else
2193 2186
2194static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2195{
2196}
2197
2198static inline void post_schedule(struct rq *rq) 2187static inline void post_schedule(struct rq *rq)
2199{ 2188{
2200} 2189}
@@ -2510,8 +2499,13 @@ void __kprobes preempt_count_add(int val)
2510 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 2499 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
2511 PREEMPT_MASK - 10); 2500 PREEMPT_MASK - 10);
2512#endif 2501#endif
2513 if (preempt_count() == val) 2502 if (preempt_count() == val) {
2514 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2503 unsigned long ip = get_parent_ip(CALLER_ADDR1);
2504#ifdef CONFIG_DEBUG_PREEMPT
2505 current->preempt_disable_ip = ip;
2506#endif
2507 trace_preempt_off(CALLER_ADDR0, ip);
2508 }
2515} 2509}
2516EXPORT_SYMBOL(preempt_count_add); 2510EXPORT_SYMBOL(preempt_count_add);
2517 2511
@@ -2554,6 +2548,13 @@ static noinline void __schedule_bug(struct task_struct *prev)
2554 print_modules(); 2548 print_modules();
2555 if (irqs_disabled()) 2549 if (irqs_disabled())
2556 print_irqtrace_events(prev); 2550 print_irqtrace_events(prev);
2551#ifdef CONFIG_DEBUG_PREEMPT
2552 if (in_atomic_preempt_off()) {
2553 pr_err("Preemption disabled at:");
2554 print_ip_sym(current->preempt_disable_ip);
2555 pr_cont("\n");
2556 }
2557#endif
2557 dump_stack(); 2558 dump_stack();
2558 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 2559 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
2559} 2560}
@@ -2577,36 +2578,34 @@ static inline void schedule_debug(struct task_struct *prev)
2577 schedstat_inc(this_rq(), sched_count); 2578 schedstat_inc(this_rq(), sched_count);
2578} 2579}
2579 2580
2580static void put_prev_task(struct rq *rq, struct task_struct *prev)
2581{
2582 if (prev->on_rq || rq->skip_clock_update < 0)
2583 update_rq_clock(rq);
2584 prev->sched_class->put_prev_task(rq, prev);
2585}
2586
2587/* 2581/*
2588 * Pick up the highest-prio task: 2582 * Pick up the highest-prio task:
2589 */ 2583 */
2590static inline struct task_struct * 2584static inline struct task_struct *
2591pick_next_task(struct rq *rq) 2585pick_next_task(struct rq *rq, struct task_struct *prev)
2592{ 2586{
2593 const struct sched_class *class; 2587 const struct sched_class *class = &fair_sched_class;
2594 struct task_struct *p; 2588 struct task_struct *p;
2595 2589
2596 /* 2590 /*
2597 * Optimization: we know that if all tasks are in 2591 * Optimization: we know that if all tasks are in
2598 * the fair class we can call that function directly: 2592 * the fair class we can call that function directly:
2599 */ 2593 */
2600 if (likely(rq->nr_running == rq->cfs.h_nr_running)) { 2594 if (likely(prev->sched_class == class &&
2601 p = fair_sched_class.pick_next_task(rq); 2595 rq->nr_running == rq->cfs.h_nr_running)) {
2602 if (likely(p)) 2596 p = fair_sched_class.pick_next_task(rq, prev);
2597 if (likely(p && p != RETRY_TASK))
2603 return p; 2598 return p;
2604 } 2599 }
2605 2600
2601again:
2606 for_each_class(class) { 2602 for_each_class(class) {
2607 p = class->pick_next_task(rq); 2603 p = class->pick_next_task(rq, prev);
2608 if (p) 2604 if (p) {
2605 if (unlikely(p == RETRY_TASK))
2606 goto again;
2609 return p; 2607 return p;
2608 }
2610 } 2609 }
2611 2610
2612 BUG(); /* the idle class will always have a runnable task */ 2611 BUG(); /* the idle class will always have a runnable task */
@@ -2700,13 +2699,10 @@ need_resched:
2700 switch_count = &prev->nvcsw; 2699 switch_count = &prev->nvcsw;
2701 } 2700 }
2702 2701
2703 pre_schedule(rq, prev); 2702 if (prev->on_rq || rq->skip_clock_update < 0)
2704 2703 update_rq_clock(rq);
2705 if (unlikely(!rq->nr_running))
2706 idle_balance(cpu, rq);
2707 2704
2708 put_prev_task(rq, prev); 2705 next = pick_next_task(rq, prev);
2709 next = pick_next_task(rq);
2710 clear_tsk_need_resched(prev); 2706 clear_tsk_need_resched(prev);
2711 clear_preempt_need_resched(); 2707 clear_preempt_need_resched();
2712 rq->skip_clock_update = 0; 2708 rq->skip_clock_update = 0;
@@ -2908,7 +2904,8 @@ EXPORT_SYMBOL(sleep_on_timeout);
2908 * This function changes the 'effective' priority of a task. It does 2904 * This function changes the 'effective' priority of a task. It does
2909 * not touch ->normal_prio like __setscheduler(). 2905 * not touch ->normal_prio like __setscheduler().
2910 * 2906 *
2911 * Used by the rt_mutex code to implement priority inheritance logic. 2907 * Used by the rt_mutex code to implement priority inheritance
2908 * logic. Call site only calls if the priority of the task changed.
2912 */ 2909 */
2913void rt_mutex_setprio(struct task_struct *p, int prio) 2910void rt_mutex_setprio(struct task_struct *p, int prio)
2914{ 2911{
@@ -2998,7 +2995,7 @@ void set_user_nice(struct task_struct *p, long nice)
2998 unsigned long flags; 2995 unsigned long flags;
2999 struct rq *rq; 2996 struct rq *rq;
3000 2997
3001 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 2998 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
3002 return; 2999 return;
3003 /* 3000 /*
3004 * We have to be careful, if called from sys_setpriority(), 3001 * We have to be careful, if called from sys_setpriority(),
@@ -3076,11 +3073,11 @@ SYSCALL_DEFINE1(nice, int, increment)
3076 if (increment > 40) 3073 if (increment > 40)
3077 increment = 40; 3074 increment = 40;
3078 3075
3079 nice = TASK_NICE(current) + increment; 3076 nice = task_nice(current) + increment;
3080 if (nice < -20) 3077 if (nice < MIN_NICE)
3081 nice = -20; 3078 nice = MIN_NICE;
3082 if (nice > 19) 3079 if (nice > MAX_NICE)
3083 nice = 19; 3080 nice = MAX_NICE;
3084 3081
3085 if (increment < 0 && !can_nice(current, nice)) 3082 if (increment < 0 && !can_nice(current, nice))
3086 return -EPERM; 3083 return -EPERM;
@@ -3109,18 +3106,6 @@ int task_prio(const struct task_struct *p)
3109} 3106}
3110 3107
3111/** 3108/**
3112 * task_nice - return the nice value of a given task.
3113 * @p: the task in question.
3114 *
3115 * Return: The nice value [ -20 ... 0 ... 19 ].
3116 */
3117int task_nice(const struct task_struct *p)
3118{
3119 return TASK_NICE(p);
3120}
3121EXPORT_SYMBOL(task_nice);
3122
3123/**
3124 * idle_cpu - is a given cpu idle currently? 3109 * idle_cpu - is a given cpu idle currently?
3125 * @cpu: the processor in question. 3110 * @cpu: the processor in question.
3126 * 3111 *
@@ -3189,9 +3174,8 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3189 dl_se->dl_new = 1; 3174 dl_se->dl_new = 1;
3190} 3175}
3191 3176
3192/* Actually do priority change: must hold pi & rq lock. */ 3177static void __setscheduler_params(struct task_struct *p,
3193static void __setscheduler(struct rq *rq, struct task_struct *p, 3178 const struct sched_attr *attr)
3194 const struct sched_attr *attr)
3195{ 3179{
3196 int policy = attr->sched_policy; 3180 int policy = attr->sched_policy;
3197 3181
@@ -3211,9 +3195,21 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
3211 * getparam()/getattr() don't report silly values for !rt tasks. 3195 * getparam()/getattr() don't report silly values for !rt tasks.
3212 */ 3196 */
3213 p->rt_priority = attr->sched_priority; 3197 p->rt_priority = attr->sched_priority;
3214
3215 p->normal_prio = normal_prio(p); 3198 p->normal_prio = normal_prio(p);
3216 p->prio = rt_mutex_getprio(p); 3199 set_load_weight(p);
3200}
3201
3202/* Actually do priority change: must hold pi & rq lock. */
3203static void __setscheduler(struct rq *rq, struct task_struct *p,
3204 const struct sched_attr *attr)
3205{
3206 __setscheduler_params(p, attr);
3207
3208 /*
3209 * If we get here, there was no pi waiters boosting the
3210 * task. It is safe to use the normal prio.
3211 */
3212 p->prio = normal_prio(p);
3217 3213
3218 if (dl_prio(p->prio)) 3214 if (dl_prio(p->prio))
3219 p->sched_class = &dl_sched_class; 3215 p->sched_class = &dl_sched_class;
@@ -3221,8 +3217,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
3221 p->sched_class = &rt_sched_class; 3217 p->sched_class = &rt_sched_class;
3222 else 3218 else
3223 p->sched_class = &fair_sched_class; 3219 p->sched_class = &fair_sched_class;
3224
3225 set_load_weight(p);
3226} 3220}
3227 3221
3228static void 3222static void
@@ -3275,6 +3269,8 @@ static int __sched_setscheduler(struct task_struct *p,
3275 const struct sched_attr *attr, 3269 const struct sched_attr *attr,
3276 bool user) 3270 bool user)
3277{ 3271{
3272 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
3273 MAX_RT_PRIO - 1 - attr->sched_priority;
3278 int retval, oldprio, oldpolicy = -1, on_rq, running; 3274 int retval, oldprio, oldpolicy = -1, on_rq, running;
3279 int policy = attr->sched_policy; 3275 int policy = attr->sched_policy;
3280 unsigned long flags; 3276 unsigned long flags;
@@ -3319,7 +3315,7 @@ recheck:
3319 */ 3315 */
3320 if (user && !capable(CAP_SYS_NICE)) { 3316 if (user && !capable(CAP_SYS_NICE)) {
3321 if (fair_policy(policy)) { 3317 if (fair_policy(policy)) {
3322 if (attr->sched_nice < TASK_NICE(p) && 3318 if (attr->sched_nice < task_nice(p) &&
3323 !can_nice(p, attr->sched_nice)) 3319 !can_nice(p, attr->sched_nice))
3324 return -EPERM; 3320 return -EPERM;
3325 } 3321 }
@@ -3352,7 +3348,7 @@ recheck:
3352 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 3348 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
3353 */ 3349 */
3354 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 3350 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
3355 if (!can_nice(p, TASK_NICE(p))) 3351 if (!can_nice(p, task_nice(p)))
3356 return -EPERM; 3352 return -EPERM;
3357 } 3353 }
3358 3354
@@ -3389,16 +3385,18 @@ recheck:
3389 } 3385 }
3390 3386
3391 /* 3387 /*
3392 * If not changing anything there's no need to proceed further: 3388 * If not changing anything there's no need to proceed further,
3389 * but store a possible modification of reset_on_fork.
3393 */ 3390 */
3394 if (unlikely(policy == p->policy)) { 3391 if (unlikely(policy == p->policy)) {
3395 if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p)) 3392 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
3396 goto change; 3393 goto change;
3397 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 3394 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
3398 goto change; 3395 goto change;
3399 if (dl_policy(policy)) 3396 if (dl_policy(policy))
3400 goto change; 3397 goto change;
3401 3398
3399 p->sched_reset_on_fork = reset_on_fork;
3402 task_rq_unlock(rq, p, &flags); 3400 task_rq_unlock(rq, p, &flags);
3403 return 0; 3401 return 0;
3404 } 3402 }
@@ -3452,6 +3450,24 @@ change:
3452 return -EBUSY; 3450 return -EBUSY;
3453 } 3451 }
3454 3452
3453 p->sched_reset_on_fork = reset_on_fork;
3454 oldprio = p->prio;
3455
3456 /*
3457 * Special case for priority boosted tasks.
3458 *
3459 * If the new priority is lower or equal (user space view)
3460 * than the current (boosted) priority, we just store the new
3461 * normal parameters and do not touch the scheduler class and
3462 * the runqueue. This will be done when the task deboost
3463 * itself.
3464 */
3465 if (rt_mutex_check_prio(p, newprio)) {
3466 __setscheduler_params(p, attr);
3467 task_rq_unlock(rq, p, &flags);
3468 return 0;
3469 }
3470
3455 on_rq = p->on_rq; 3471 on_rq = p->on_rq;
3456 running = task_current(rq, p); 3472 running = task_current(rq, p);
3457 if (on_rq) 3473 if (on_rq)
@@ -3459,16 +3475,18 @@ change:
3459 if (running) 3475 if (running)
3460 p->sched_class->put_prev_task(rq, p); 3476 p->sched_class->put_prev_task(rq, p);
3461 3477
3462 p->sched_reset_on_fork = reset_on_fork;
3463
3464 oldprio = p->prio;
3465 prev_class = p->sched_class; 3478 prev_class = p->sched_class;
3466 __setscheduler(rq, p, attr); 3479 __setscheduler(rq, p, attr);
3467 3480
3468 if (running) 3481 if (running)
3469 p->sched_class->set_curr_task(rq); 3482 p->sched_class->set_curr_task(rq);
3470 if (on_rq) 3483 if (on_rq) {
3471 enqueue_task(rq, p, 0); 3484 /*
3485 * We enqueue to tail when the priority of a task is
3486 * increased (user space view).
3487 */
3488 enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
3489 }
3472 3490
3473 check_class_changed(rq, p, prev_class, oldprio); 3491 check_class_changed(rq, p, prev_class, oldprio);
3474 task_rq_unlock(rq, p, &flags); 3492 task_rq_unlock(rq, p, &flags);
@@ -3624,7 +3642,7 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
3624 * XXX: do we want to be lenient like existing syscalls; or do we want 3642 * XXX: do we want to be lenient like existing syscalls; or do we want
3625 * to be strict and return an error on out-of-bounds values? 3643 * to be strict and return an error on out-of-bounds values?
3626 */ 3644 */
3627 attr->sched_nice = clamp(attr->sched_nice, -20, 19); 3645 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
3628 3646
3629out: 3647out:
3630 return ret; 3648 return ret;
@@ -3845,7 +3863,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3845 else if (task_has_rt_policy(p)) 3863 else if (task_has_rt_policy(p))
3846 attr.sched_priority = p->rt_priority; 3864 attr.sched_priority = p->rt_priority;
3847 else 3865 else
3848 attr.sched_nice = TASK_NICE(p); 3866 attr.sched_nice = task_nice(p);
3849 3867
3850 rcu_read_unlock(); 3868 rcu_read_unlock();
3851 3869
@@ -4483,6 +4501,7 @@ void init_idle(struct task_struct *idle, int cpu)
4483 rcu_read_unlock(); 4501 rcu_read_unlock();
4484 4502
4485 rq->curr = rq->idle = idle; 4503 rq->curr = rq->idle = idle;
4504 idle->on_rq = 1;
4486#if defined(CONFIG_SMP) 4505#if defined(CONFIG_SMP)
4487 idle->on_cpu = 1; 4506 idle->on_cpu = 1;
4488#endif 4507#endif
@@ -4721,6 +4740,22 @@ static void calc_load_migrate(struct rq *rq)
4721 atomic_long_add(delta, &calc_load_tasks); 4740 atomic_long_add(delta, &calc_load_tasks);
4722} 4741}
4723 4742
4743static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
4744{
4745}
4746
4747static const struct sched_class fake_sched_class = {
4748 .put_prev_task = put_prev_task_fake,
4749};
4750
4751static struct task_struct fake_task = {
4752 /*
4753 * Avoid pull_{rt,dl}_task()
4754 */
4755 .prio = MAX_PRIO + 1,
4756 .sched_class = &fake_sched_class,
4757};
4758
4724/* 4759/*
4725 * Migrate all tasks from the rq, sleeping tasks will be migrated by 4760 * Migrate all tasks from the rq, sleeping tasks will be migrated by
4726 * try_to_wake_up()->select_task_rq(). 4761 * try_to_wake_up()->select_task_rq().
@@ -4761,7 +4796,7 @@ static void migrate_tasks(unsigned int dead_cpu)
4761 if (rq->nr_running == 1) 4796 if (rq->nr_running == 1)
4762 break; 4797 break;
4763 4798
4764 next = pick_next_task(rq); 4799 next = pick_next_task(rq, &fake_task);
4765 BUG_ON(!next); 4800 BUG_ON(!next);
4766 next->sched_class->put_prev_task(rq, next); 4801 next->sched_class->put_prev_task(rq, next);
4767 4802
@@ -4851,7 +4886,7 @@ set_table_entry(struct ctl_table *entry,
4851static struct ctl_table * 4886static struct ctl_table *
4852sd_alloc_ctl_domain_table(struct sched_domain *sd) 4887sd_alloc_ctl_domain_table(struct sched_domain *sd)
4853{ 4888{
4854 struct ctl_table *table = sd_alloc_ctl_entry(13); 4889 struct ctl_table *table = sd_alloc_ctl_entry(14);
4855 4890
4856 if (table == NULL) 4891 if (table == NULL)
4857 return NULL; 4892 return NULL;
@@ -4879,9 +4914,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
4879 sizeof(int), 0644, proc_dointvec_minmax, false); 4914 sizeof(int), 0644, proc_dointvec_minmax, false);
4880 set_table_entry(&table[10], "flags", &sd->flags, 4915 set_table_entry(&table[10], "flags", &sd->flags,
4881 sizeof(int), 0644, proc_dointvec_minmax, false); 4916 sizeof(int), 0644, proc_dointvec_minmax, false);
4882 set_table_entry(&table[11], "name", sd->name, 4917 set_table_entry(&table[11], "max_newidle_lb_cost",
4918 &sd->max_newidle_lb_cost,
4919 sizeof(long), 0644, proc_doulongvec_minmax, false);
4920 set_table_entry(&table[12], "name", sd->name,
4883 CORENAME_MAX_SIZE, 0444, proc_dostring, false); 4921 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
4884 /* &table[12] is terminator */ 4922 /* &table[13] is terminator */
4885 4923
4886 return table; 4924 return table;
4887} 4925}
@@ -6858,7 +6896,6 @@ void __init sched_init(void)
6858 6896
6859 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 6897 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
6860#ifdef CONFIG_RT_GROUP_SCHED 6898#ifdef CONFIG_RT_GROUP_SCHED
6861 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
6862 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 6899 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6863#endif 6900#endif
6864 6901
@@ -6947,7 +6984,8 @@ void __might_sleep(const char *file, int line, int preempt_offset)
6947 static unsigned long prev_jiffy; /* ratelimiting */ 6984 static unsigned long prev_jiffy; /* ratelimiting */
6948 6985
6949 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ 6986 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
6950 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 6987 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
6988 !is_idle_task(current)) ||
6951 system_state != SYSTEM_RUNNING || oops_in_progress) 6989 system_state != SYSTEM_RUNNING || oops_in_progress)
6952 return; 6990 return;
6953 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6991 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
@@ -6965,6 +7003,13 @@ void __might_sleep(const char *file, int line, int preempt_offset)
6965 debug_show_held_locks(current); 7003 debug_show_held_locks(current);
6966 if (irqs_disabled()) 7004 if (irqs_disabled())
6967 print_irqtrace_events(current); 7005 print_irqtrace_events(current);
7006#ifdef CONFIG_DEBUG_PREEMPT
7007 if (!preempt_count_equals(preempt_offset)) {
7008 pr_err("Preemption disabled at:");
7009 print_ip_sym(current->preempt_disable_ip);
7010 pr_cont("\n");
7011 }
7012#endif
6968 dump_stack(); 7013 dump_stack();
6969} 7014}
6970EXPORT_SYMBOL(__might_sleep); 7015EXPORT_SYMBOL(__might_sleep);
@@ -7018,7 +7063,7 @@ void normalize_rt_tasks(void)
7018 * Renice negative nice level userspace 7063 * Renice negative nice level userspace
7019 * tasks back to 0: 7064 * tasks back to 0:
7020 */ 7065 */
7021 if (TASK_NICE(p) < 0 && p->mm) 7066 if (task_nice(p) < 0 && p->mm)
7022 set_user_nice(p, 0); 7067 set_user_nice(p, 0);
7023 continue; 7068 continue;
7024 } 7069 }
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 99947919e30b..58624a65f124 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -142,7 +142,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
142 p->utimescaled += cputime_scaled; 142 p->utimescaled += cputime_scaled;
143 account_group_user_time(p, cputime); 143 account_group_user_time(p, cputime);
144 144
145 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; 145 index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
146 146
147 /* Add user time to cpustat. */ 147 /* Add user time to cpustat. */
148 task_group_account_field(p, index, (__force u64) cputime); 148 task_group_account_field(p, index, (__force u64) cputime);
@@ -169,7 +169,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
169 p->gtime += cputime; 169 p->gtime += cputime;
170 170
171 /* Add guest time to cpustat. */ 171 /* Add guest time to cpustat. */
172 if (TASK_NICE(p) > 0) { 172 if (task_nice(p) > 0) {
173 cpustat[CPUTIME_NICE] += (__force u64) cputime; 173 cpustat[CPUTIME_NICE] += (__force u64) cputime;
174 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; 174 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
175 } else { 175 } else {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6e79b3faa4cd..27ef40925525 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -210,6 +210,16 @@ static inline int has_pushable_dl_tasks(struct rq *rq)
210 210
211static int push_dl_task(struct rq *rq); 211static int push_dl_task(struct rq *rq);
212 212
213static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
214{
215 return dl_task(prev);
216}
217
218static inline void set_post_schedule(struct rq *rq)
219{
220 rq->post_schedule = has_pushable_dl_tasks(rq);
221}
222
213#else 223#else
214 224
215static inline 225static inline
@@ -232,6 +242,19 @@ void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
232{ 242{
233} 243}
234 244
245static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
246{
247 return false;
248}
249
250static inline int pull_dl_task(struct rq *rq)
251{
252 return 0;
253}
254
255static inline void set_post_schedule(struct rq *rq)
256{
257}
235#endif /* CONFIG_SMP */ 258#endif /* CONFIG_SMP */
236 259
237static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); 260static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
@@ -586,8 +609,8 @@ static void update_curr_dl(struct rq *rq)
586 * approach need further study. 609 * approach need further study.
587 */ 610 */
588 delta_exec = rq_clock_task(rq) - curr->se.exec_start; 611 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
589 if (unlikely((s64)delta_exec < 0)) 612 if (unlikely((s64)delta_exec <= 0))
590 delta_exec = 0; 613 return;
591 614
592 schedstat_set(curr->se.statistics.exec_max, 615 schedstat_set(curr->se.statistics.exec_max,
593 max(curr->se.statistics.exec_max, delta_exec)); 616 max(curr->se.statistics.exec_max, delta_exec));
@@ -942,6 +965,8 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
942 resched_task(rq->curr); 965 resched_task(rq->curr);
943} 966}
944 967
968static int pull_dl_task(struct rq *this_rq);
969
945#endif /* CONFIG_SMP */ 970#endif /* CONFIG_SMP */
946 971
947/* 972/*
@@ -988,7 +1013,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
988 return rb_entry(left, struct sched_dl_entity, rb_node); 1013 return rb_entry(left, struct sched_dl_entity, rb_node);
989} 1014}
990 1015
991struct task_struct *pick_next_task_dl(struct rq *rq) 1016struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
992{ 1017{
993 struct sched_dl_entity *dl_se; 1018 struct sched_dl_entity *dl_se;
994 struct task_struct *p; 1019 struct task_struct *p;
@@ -996,9 +1021,20 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
996 1021
997 dl_rq = &rq->dl; 1022 dl_rq = &rq->dl;
998 1023
1024 if (need_pull_dl_task(rq, prev))
1025 pull_dl_task(rq);
1026 /*
1027 * When prev is DL, we may throttle it in put_prev_task().
1028 * So, we update time before we check for dl_nr_running.
1029 */
1030 if (prev->sched_class == &dl_sched_class)
1031 update_curr_dl(rq);
1032
999 if (unlikely(!dl_rq->dl_nr_running)) 1033 if (unlikely(!dl_rq->dl_nr_running))
1000 return NULL; 1034 return NULL;
1001 1035
1036 put_prev_task(rq, prev);
1037
1002 dl_se = pick_next_dl_entity(rq, dl_rq); 1038 dl_se = pick_next_dl_entity(rq, dl_rq);
1003 BUG_ON(!dl_se); 1039 BUG_ON(!dl_se);
1004 1040
@@ -1013,9 +1049,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
1013 start_hrtick_dl(rq, p); 1049 start_hrtick_dl(rq, p);
1014#endif 1050#endif
1015 1051
1016#ifdef CONFIG_SMP 1052 set_post_schedule(rq);
1017 rq->post_schedule = has_pushable_dl_tasks(rq);
1018#endif /* CONFIG_SMP */
1019 1053
1020 return p; 1054 return p;
1021} 1055}
@@ -1424,13 +1458,6 @@ skip:
1424 return ret; 1458 return ret;
1425} 1459}
1426 1460
1427static void pre_schedule_dl(struct rq *rq, struct task_struct *prev)
1428{
1429 /* Try to pull other tasks here */
1430 if (dl_task(prev))
1431 pull_dl_task(rq);
1432}
1433
1434static void post_schedule_dl(struct rq *rq) 1461static void post_schedule_dl(struct rq *rq)
1435{ 1462{
1436 push_dl_tasks(rq); 1463 push_dl_tasks(rq);
@@ -1558,7 +1585,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1558 if (unlikely(p->dl.dl_throttled)) 1585 if (unlikely(p->dl.dl_throttled))
1559 return; 1586 return;
1560 1587
1561 if (p->on_rq || rq->curr != p) { 1588 if (p->on_rq && rq->curr != p) {
1562#ifdef CONFIG_SMP 1589#ifdef CONFIG_SMP
1563 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) 1590 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
1564 /* Only reschedule if pushing failed */ 1591 /* Only reschedule if pushing failed */
@@ -1623,7 +1650,6 @@ const struct sched_class dl_sched_class = {
1623 .set_cpus_allowed = set_cpus_allowed_dl, 1650 .set_cpus_allowed = set_cpus_allowed_dl,
1624 .rq_online = rq_online_dl, 1651 .rq_online = rq_online_dl,
1625 .rq_offline = rq_offline_dl, 1652 .rq_offline = rq_offline_dl,
1626 .pre_schedule = pre_schedule_dl,
1627 .post_schedule = post_schedule_dl, 1653 .post_schedule = post_schedule_dl,
1628 .task_woken = task_woken_dl, 1654 .task_woken = task_woken_dl,
1629#endif 1655#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index dd52e7ffb10e..f3344c31632a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -321,6 +321,7 @@ do { \
321 P(sched_goidle); 321 P(sched_goidle);
322#ifdef CONFIG_SMP 322#ifdef CONFIG_SMP
323 P64(avg_idle); 323 P64(avg_idle);
324 P64(max_idle_balance_cost);
324#endif 325#endif
325 326
326 P(ttwu_count); 327 P(ttwu_count);
@@ -533,15 +534,15 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
533 unsigned long nr_faults = -1; 534 unsigned long nr_faults = -1;
534 int cpu_current, home_node; 535 int cpu_current, home_node;
535 536
536 if (p->numa_faults) 537 if (p->numa_faults_memory)
537 nr_faults = p->numa_faults[2*node + i]; 538 nr_faults = p->numa_faults_memory[2*node + i];
538 539
539 cpu_current = !i ? (task_node(p) == node) : 540 cpu_current = !i ? (task_node(p) == node) :
540 (pol && node_isset(node, pol->v.nodes)); 541 (pol && node_isset(node, pol->v.nodes));
541 542
542 home_node = (p->numa_preferred_nid == node); 543 home_node = (p->numa_preferred_nid == node);
543 544
544 SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n", 545 SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n",
545 i, node, cpu_current, home_node, nr_faults); 546 i, node, cpu_current, home_node, nr_faults);
546 } 547 }
547 } 548 }
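The kernel/sched/fair.c changes that follow split each task's NUMA fault statistics into four regions of a single allocation (memory vs. CPU faults, averaged stats vs. per-scan buffers), indexed per node and per shared/private fault type. A small stand-alone sketch of that carving, with nr_node_ids hard-coded purely for illustration:

#include <stdio.h>
#include <stdlib.h>

#define NR_NUMA_HINT_FAULT_TYPES   2                              /* shared, private */
#define NR_NUMA_HINT_FAULT_STATS   (NR_NUMA_HINT_FAULT_TYPES * 2) /* memory + cpu    */
#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2) /* + buffers       */

static int task_faults_idx(int nid, int priv)
{
	return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
}

int main(void)
{
	int nr_node_ids = 4; /* illustrative value, not the kernel's */
	unsigned long *numa_faults_memory, *numa_faults_cpu;
	unsigned long *numa_faults_buffer_memory, *numa_faults_buffer_cpu;

	numa_faults_memory = calloc(NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids,
				    sizeof(*numa_faults_memory));
	if (!numa_faults_memory)
		return 1;

	/* Same carving as the task_numa_fault() hunk below. */
	numa_faults_cpu           = numa_faults_memory + 2 * nr_node_ids;
	numa_faults_buffer_memory = numa_faults_memory + 4 * nr_node_ids;
	numa_faults_buffer_cpu    = numa_faults_memory + 6 * nr_node_ids;

	/* Record one private memory-side fault on node 1 into the buffer region. */
	numa_faults_buffer_memory[task_faults_idx(1, 1)] += 1;
	(void)numa_faults_cpu;
	(void)numa_faults_buffer_cpu;

	printf("buffer slot for (nid=1, priv=1): %d\n", task_faults_idx(1, 1));
	free(numa_faults_memory);
	return 0;
}

task_numa_placement() then decays the averaged halves and folds the per-scan buffer halves into them, which is why the buffers live in the second half of the same allocation.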
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9b4c4f320130..7e9bd0b1fa9e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -322,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
322 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 322 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
323 323
324/* Do the two (enqueued) entities belong to the same group ? */ 324/* Do the two (enqueued) entities belong to the same group ? */
325static inline int 325static inline struct cfs_rq *
326is_same_group(struct sched_entity *se, struct sched_entity *pse) 326is_same_group(struct sched_entity *se, struct sched_entity *pse)
327{ 327{
328 if (se->cfs_rq == pse->cfs_rq) 328 if (se->cfs_rq == pse->cfs_rq)
329 return 1; 329 return se->cfs_rq;
330 330
331 return 0; 331 return NULL;
332} 332}
333 333
334static inline struct sched_entity *parent_entity(struct sched_entity *se) 334static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -336,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
336 return se->parent; 336 return se->parent;
337} 337}
338 338
339/* return depth at which a sched entity is present in the hierarchy */
340static inline int depth_se(struct sched_entity *se)
341{
342 int depth = 0;
343
344 for_each_sched_entity(se)
345 depth++;
346
347 return depth;
348}
349
350static void 339static void
351find_matching_se(struct sched_entity **se, struct sched_entity **pse) 340find_matching_se(struct sched_entity **se, struct sched_entity **pse)
352{ 341{
@@ -360,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
360 */ 349 */
361 350
362 /* First walk up until both entities are at same depth */ 351 /* First walk up until both entities are at same depth */
363 se_depth = depth_se(*se); 352 se_depth = (*se)->depth;
364 pse_depth = depth_se(*pse); 353 pse_depth = (*pse)->depth;
365 354
366 while (se_depth > pse_depth) { 355 while (se_depth > pse_depth) {
367 se_depth--; 356 se_depth--;
@@ -426,12 +415,6 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
426#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 415#define for_each_leaf_cfs_rq(rq, cfs_rq) \
427 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 416 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
428 417
429static inline int
430is_same_group(struct sched_entity *se, struct sched_entity *pse)
431{
432 return 1;
433}
434
435static inline struct sched_entity *parent_entity(struct sched_entity *se) 418static inline struct sched_entity *parent_entity(struct sched_entity *se)
436{ 419{
437 return NULL; 420 return NULL;
@@ -819,14 +802,6 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
819/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ 802/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
820unsigned int sysctl_numa_balancing_scan_delay = 1000; 803unsigned int sysctl_numa_balancing_scan_delay = 1000;
821 804
822/*
823 * After skipping a page migration on a shared page, skip N more numa page
824 * migrations unconditionally. This reduces the number of NUMA migrations
825 * in shared memory workloads, and has the effect of pulling tasks towards
826 * where their memory lives, over pulling the memory towards the task.
827 */
828unsigned int sysctl_numa_balancing_migrate_deferred = 16;
829
830static unsigned int task_nr_scan_windows(struct task_struct *p) 805static unsigned int task_nr_scan_windows(struct task_struct *p)
831{ 806{
832 unsigned long rss = 0; 807 unsigned long rss = 0;
@@ -893,10 +868,26 @@ struct numa_group {
893 struct list_head task_list; 868 struct list_head task_list;
894 869
895 struct rcu_head rcu; 870 struct rcu_head rcu;
871 nodemask_t active_nodes;
896 unsigned long total_faults; 872 unsigned long total_faults;
873 /*
874 * Faults_cpu is used to decide whether memory should move
875 * towards the CPU. As a consequence, these stats are weighted
876 * more by CPU use than by memory faults.
877 */
878 unsigned long *faults_cpu;
897 unsigned long faults[0]; 879 unsigned long faults[0];
898}; 880};
899 881
882/* Shared or private faults. */
883#define NR_NUMA_HINT_FAULT_TYPES 2
884
885/* Memory and CPU locality */
886#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
887
888/* Averaged statistics, and temporary buffers. */
889#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
890
900pid_t task_numa_group_id(struct task_struct *p) 891pid_t task_numa_group_id(struct task_struct *p)
901{ 892{
902 return p->numa_group ? p->numa_group->gid : 0; 893 return p->numa_group ? p->numa_group->gid : 0;
@@ -904,16 +895,16 @@ pid_t task_numa_group_id(struct task_struct *p)
904 895
905static inline int task_faults_idx(int nid, int priv) 896static inline int task_faults_idx(int nid, int priv)
906{ 897{
907 return 2 * nid + priv; 898 return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
908} 899}
909 900
910static inline unsigned long task_faults(struct task_struct *p, int nid) 901static inline unsigned long task_faults(struct task_struct *p, int nid)
911{ 902{
912 if (!p->numa_faults) 903 if (!p->numa_faults_memory)
913 return 0; 904 return 0;
914 905
915 return p->numa_faults[task_faults_idx(nid, 0)] + 906 return p->numa_faults_memory[task_faults_idx(nid, 0)] +
916 p->numa_faults[task_faults_idx(nid, 1)]; 907 p->numa_faults_memory[task_faults_idx(nid, 1)];
917} 908}
918 909
919static inline unsigned long group_faults(struct task_struct *p, int nid) 910static inline unsigned long group_faults(struct task_struct *p, int nid)
@@ -925,6 +916,12 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
925 p->numa_group->faults[task_faults_idx(nid, 1)]; 916 p->numa_group->faults[task_faults_idx(nid, 1)];
926} 917}
927 918
919static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
920{
921 return group->faults_cpu[task_faults_idx(nid, 0)] +
922 group->faults_cpu[task_faults_idx(nid, 1)];
923}
924
928/* 925/*
929 * These return the fraction of accesses done by a particular task, or 926 * These return the fraction of accesses done by a particular task, or
930 * task group, on a particular numa node. The group weight is given a 927 * task group, on a particular numa node. The group weight is given a
@@ -935,7 +932,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
935{ 932{
936 unsigned long total_faults; 933 unsigned long total_faults;
937 934
938 if (!p->numa_faults) 935 if (!p->numa_faults_memory)
939 return 0; 936 return 0;
940 937
941 total_faults = p->total_numa_faults; 938 total_faults = p->total_numa_faults;
@@ -954,6 +951,69 @@ static inline unsigned long group_weight(struct task_struct *p, int nid)
954 return 1000 * group_faults(p, nid) / p->numa_group->total_faults; 951 return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
955} 952}
956 953
954bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
955 int src_nid, int dst_cpu)
956{
957 struct numa_group *ng = p->numa_group;
958 int dst_nid = cpu_to_node(dst_cpu);
959 int last_cpupid, this_cpupid;
960
961 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
962
963 /*
964 * Multi-stage node selection is used in conjunction with a periodic
965 * migration fault to build a temporal task<->page relation. By using
966 * a two-stage filter we remove short/unlikely relations.
967 *
968 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
969 * a task's usage of a particular page (n_p) per total usage of this
970 * page (n_t) (in a given time-span) to a probability.
971 *
972 * Our periodic faults will sample this probability and getting the
973 * same result twice in a row, given these samples are fully
974 * independent, is then given by P(n)^2, provided our sample period
975 * is sufficiently short compared to the usage pattern.
976 *
977 * This quadric squishes small probabilities, making it less likely we
978 * act on an unlikely task<->page relation.
979 */
980 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
981 if (!cpupid_pid_unset(last_cpupid) &&
982 cpupid_to_nid(last_cpupid) != dst_nid)
983 return false;
984
985 /* Always allow migrate on private faults */
986 if (cpupid_match_pid(p, last_cpupid))
987 return true;
988
989 /* A shared fault, but p->numa_group has not been set up yet. */
990 if (!ng)
991 return true;
992
993 /*
994 * Do not migrate if the destination is not a node that
995 * is actively used by this numa group.
996 */
997 if (!node_isset(dst_nid, ng->active_nodes))
998 return false;
999
1000 /*
1001 * Source is a node that is not actively used by this
1002 * numa group, while the destination is. Migrate.
1003 */
1004 if (!node_isset(src_nid, ng->active_nodes))
1005 return true;
1006
1007 /*
1008 * Both source and destination are nodes in active
1009 * use by this numa group. Maximize memory bandwidth
1010 * by migrating from more heavily used groups, to less
1011 * heavily used ones, spreading the load around.
1012 * Use a 1/4 hysteresis to avoid spurious page movement.
1013 */
1014 return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
1015}
1016
957static unsigned long weighted_cpuload(const int cpu); 1017static unsigned long weighted_cpuload(const int cpu);
958static unsigned long source_load(int cpu, int type); 1018static unsigned long source_load(int cpu, int type);
959static unsigned long target_load(int cpu, int type); 1019static unsigned long target_load(int cpu, int type);
@@ -1267,7 +1327,7 @@ static int task_numa_migrate(struct task_struct *p)
1267static void numa_migrate_preferred(struct task_struct *p) 1327static void numa_migrate_preferred(struct task_struct *p)
1268{ 1328{
1269 /* This task has no NUMA fault statistics yet */ 1329 /* This task has no NUMA fault statistics yet */
1270 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) 1330 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
1271 return; 1331 return;
1272 1332
1273 /* Periodically retry migrating the task to the preferred node */ 1333 /* Periodically retry migrating the task to the preferred node */
@@ -1282,6 +1342,38 @@ static void numa_migrate_preferred(struct task_struct *p)
1282} 1342}
1283 1343
1284/* 1344/*
1345 * Find the nodes on which the workload is actively running. We do this by
1346 * tracking the nodes from which NUMA hinting faults are triggered. This can
1347 * be different from the set of nodes where the workload's memory is currently
1348 * located.
1349 *
1350 * The bitmask is used to make smarter decisions on when to do NUMA page
1351 * migrations, To prevent flip-flopping, and excessive page migrations, nodes
1352 * are added when they cause over 6/16 of the maximum number of faults, but
1353 * only removed when they drop below 3/16.
1354 */
1355static void update_numa_active_node_mask(struct numa_group *numa_group)
1356{
1357 unsigned long faults, max_faults = 0;
1358 int nid;
1359
1360 for_each_online_node(nid) {
1361 faults = group_faults_cpu(numa_group, nid);
1362 if (faults > max_faults)
1363 max_faults = faults;
1364 }
1365
1366 for_each_online_node(nid) {
1367 faults = group_faults_cpu(numa_group, nid);
1368 if (!node_isset(nid, numa_group->active_nodes)) {
1369 if (faults > max_faults * 6 / 16)
1370 node_set(nid, numa_group->active_nodes);
1371 } else if (faults < max_faults * 3 / 16)
1372 node_clear(nid, numa_group->active_nodes);
1373 }
1374}
1375
1376/*
1285 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS 1377 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1286 * increments. The more local the fault statistics are, the higher the scan 1378 * increments. The more local the fault statistics are, the higher the scan
1287 * period will be for the next scan window. If local/remote ratio is below 1379 * period will be for the next scan window. If local/remote ratio is below
@@ -1355,11 +1447,41 @@ static void update_task_scan_period(struct task_struct *p,
1355 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 1447 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1356} 1448}
1357 1449
1450/*
1451 * Get the fraction of time the task has been running since the last
1452 * NUMA placement cycle. The scheduler keeps similar statistics, but
1453 * decays those on a 32ms period, which is orders of magnitude off
1454 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1455 * stats only if the task is so new there are no NUMA statistics yet.
1456 */
1457static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1458{
1459 u64 runtime, delta, now;
1460 /* Use the start of this time slice to avoid calculations. */
1461 now = p->se.exec_start;
1462 runtime = p->se.sum_exec_runtime;
1463
1464 if (p->last_task_numa_placement) {
1465 delta = runtime - p->last_sum_exec_runtime;
1466 *period = now - p->last_task_numa_placement;
1467 } else {
1468 delta = p->se.avg.runnable_avg_sum;
1469 *period = p->se.avg.runnable_avg_period;
1470 }
1471
1472 p->last_sum_exec_runtime = runtime;
1473 p->last_task_numa_placement = now;
1474
1475 return delta;
1476}
1477
1358static void task_numa_placement(struct task_struct *p) 1478static void task_numa_placement(struct task_struct *p)
1359{ 1479{
1360 int seq, nid, max_nid = -1, max_group_nid = -1; 1480 int seq, nid, max_nid = -1, max_group_nid = -1;
1361 unsigned long max_faults = 0, max_group_faults = 0; 1481 unsigned long max_faults = 0, max_group_faults = 0;
1362 unsigned long fault_types[2] = { 0, 0 }; 1482 unsigned long fault_types[2] = { 0, 0 };
1483 unsigned long total_faults;
1484 u64 runtime, period;
1363 spinlock_t *group_lock = NULL; 1485 spinlock_t *group_lock = NULL;
1364 1486
1365 seq = ACCESS_ONCE(p->mm->numa_scan_seq); 1487 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
@@ -1368,6 +1490,10 @@ static void task_numa_placement(struct task_struct *p)
1368 p->numa_scan_seq = seq; 1490 p->numa_scan_seq = seq;
1369 p->numa_scan_period_max = task_scan_max(p); 1491 p->numa_scan_period_max = task_scan_max(p);
1370 1492
1493 total_faults = p->numa_faults_locality[0] +
1494 p->numa_faults_locality[1];
1495 runtime = numa_get_avg_runtime(p, &period);
1496
1371 /* If the task is part of a group prevent parallel updates to group stats */ 1497 /* If the task is part of a group prevent parallel updates to group stats */
1372 if (p->numa_group) { 1498 if (p->numa_group) {
1373 group_lock = &p->numa_group->lock; 1499 group_lock = &p->numa_group->lock;
@@ -1379,24 +1505,37 @@ static void task_numa_placement(struct task_struct *p)
1379 unsigned long faults = 0, group_faults = 0; 1505 unsigned long faults = 0, group_faults = 0;
1380 int priv, i; 1506 int priv, i;
1381 1507
1382 for (priv = 0; priv < 2; priv++) { 1508 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
1383 long diff; 1509 long diff, f_diff, f_weight;
1384 1510
1385 i = task_faults_idx(nid, priv); 1511 i = task_faults_idx(nid, priv);
1386 diff = -p->numa_faults[i];
1387 1512
1388 /* Decay existing window, copy faults since last scan */ 1513 /* Decay existing window, copy faults since last scan */
1389 p->numa_faults[i] >>= 1; 1514 diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
1390 p->numa_faults[i] += p->numa_faults_buffer[i]; 1515 fault_types[priv] += p->numa_faults_buffer_memory[i];
1391 fault_types[priv] += p->numa_faults_buffer[i]; 1516 p->numa_faults_buffer_memory[i] = 0;
1392 p->numa_faults_buffer[i] = 0;
1393 1517
1394 faults += p->numa_faults[i]; 1518 /*
1395 diff += p->numa_faults[i]; 1519 * Normalize the faults_from, so all tasks in a group
1520 * count according to CPU use, instead of by the raw
1521 * number of faults. Tasks with little runtime have
1522 * little over-all impact on throughput, and thus their
1523 * faults are less important.
1524 */
1525 f_weight = div64_u64(runtime << 16, period + 1);
1526 f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
1527 (total_faults + 1);
1528 f_diff = f_weight - p->numa_faults_cpu[i] / 2;
1529 p->numa_faults_buffer_cpu[i] = 0;
1530
1531 p->numa_faults_memory[i] += diff;
1532 p->numa_faults_cpu[i] += f_diff;
1533 faults += p->numa_faults_memory[i];
1396 p->total_numa_faults += diff; 1534 p->total_numa_faults += diff;
1397 if (p->numa_group) { 1535 if (p->numa_group) {
1398 /* safe because we can only change our own group */ 1536 /* safe because we can only change our own group */
1399 p->numa_group->faults[i] += diff; 1537 p->numa_group->faults[i] += diff;
1538 p->numa_group->faults_cpu[i] += f_diff;
1400 p->numa_group->total_faults += diff; 1539 p->numa_group->total_faults += diff;
1401 group_faults += p->numa_group->faults[i]; 1540 group_faults += p->numa_group->faults[i];
1402 } 1541 }
@@ -1416,6 +1555,7 @@ static void task_numa_placement(struct task_struct *p)
1416 update_task_scan_period(p, fault_types[0], fault_types[1]); 1555 update_task_scan_period(p, fault_types[0], fault_types[1]);
1417 1556
1418 if (p->numa_group) { 1557 if (p->numa_group) {
1558 update_numa_active_node_mask(p->numa_group);
1419 /* 1559 /*
1420 * If the preferred task and group nids are different, 1560 * If the preferred task and group nids are different,
1421 * iterate over the nodes again to find the best place. 1561 * iterate over the nodes again to find the best place.
@@ -1465,7 +1605,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1465 1605
1466 if (unlikely(!p->numa_group)) { 1606 if (unlikely(!p->numa_group)) {
1467 unsigned int size = sizeof(struct numa_group) + 1607 unsigned int size = sizeof(struct numa_group) +
1468 2*nr_node_ids*sizeof(unsigned long); 1608 4*nr_node_ids*sizeof(unsigned long);
1469 1609
1470 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); 1610 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1471 if (!grp) 1611 if (!grp)
@@ -1475,9 +1615,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1475 spin_lock_init(&grp->lock); 1615 spin_lock_init(&grp->lock);
1476 INIT_LIST_HEAD(&grp->task_list); 1616 INIT_LIST_HEAD(&grp->task_list);
1477 grp->gid = p->pid; 1617 grp->gid = p->pid;
1618 /* Second half of the array tracks nids where faults happen */
1619 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
1620 nr_node_ids;
1621
1622 node_set(task_node(current), grp->active_nodes);
1478 1623
1479 for (i = 0; i < 2*nr_node_ids; i++) 1624 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1480 grp->faults[i] = p->numa_faults[i]; 1625 grp->faults[i] = p->numa_faults_memory[i];
1481 1626
1482 grp->total_faults = p->total_numa_faults; 1627 grp->total_faults = p->total_numa_faults;
1483 1628
@@ -1534,9 +1679,9 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1534 1679
1535 double_lock(&my_grp->lock, &grp->lock); 1680 double_lock(&my_grp->lock, &grp->lock);
1536 1681
1537 for (i = 0; i < 2*nr_node_ids; i++) { 1682 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
1538 my_grp->faults[i] -= p->numa_faults[i]; 1683 my_grp->faults[i] -= p->numa_faults_memory[i];
1539 grp->faults[i] += p->numa_faults[i]; 1684 grp->faults[i] += p->numa_faults_memory[i];
1540 } 1685 }
1541 my_grp->total_faults -= p->total_numa_faults; 1686 my_grp->total_faults -= p->total_numa_faults;
1542 grp->total_faults += p->total_numa_faults; 1687 grp->total_faults += p->total_numa_faults;
@@ -1562,12 +1707,12 @@ void task_numa_free(struct task_struct *p)
1562{ 1707{
1563 struct numa_group *grp = p->numa_group; 1708 struct numa_group *grp = p->numa_group;
1564 int i; 1709 int i;
1565 void *numa_faults = p->numa_faults; 1710 void *numa_faults = p->numa_faults_memory;
1566 1711
1567 if (grp) { 1712 if (grp) {
1568 spin_lock(&grp->lock); 1713 spin_lock(&grp->lock);
1569 for (i = 0; i < 2*nr_node_ids; i++) 1714 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1570 grp->faults[i] -= p->numa_faults[i]; 1715 grp->faults[i] -= p->numa_faults_memory[i];
1571 grp->total_faults -= p->total_numa_faults; 1716 grp->total_faults -= p->total_numa_faults;
1572 1717
1573 list_del(&p->numa_entry); 1718 list_del(&p->numa_entry);
@@ -1577,18 +1722,21 @@ void task_numa_free(struct task_struct *p)
1577 put_numa_group(grp); 1722 put_numa_group(grp);
1578 } 1723 }
1579 1724
1580 p->numa_faults = NULL; 1725 p->numa_faults_memory = NULL;
1581 p->numa_faults_buffer = NULL; 1726 p->numa_faults_buffer_memory = NULL;
 1727 p->numa_faults_cpu = NULL;
1728 p->numa_faults_buffer_cpu = NULL;
1582 kfree(numa_faults); 1729 kfree(numa_faults);
1583} 1730}
1584 1731
1585/* 1732/*
1586 * Got a PROT_NONE fault for a page on @node. 1733 * Got a PROT_NONE fault for a page on @node.
1587 */ 1734 */
1588void task_numa_fault(int last_cpupid, int node, int pages, int flags) 1735void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1589{ 1736{
1590 struct task_struct *p = current; 1737 struct task_struct *p = current;
1591 bool migrated = flags & TNF_MIGRATED; 1738 bool migrated = flags & TNF_MIGRATED;
1739 int cpu_node = task_node(current);
1592 int priv; 1740 int priv;
1593 1741
1594 if (!numabalancing_enabled) 1742 if (!numabalancing_enabled)
@@ -1603,16 +1751,24 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
1603 return; 1751 return;
1604 1752
1605 /* Allocate buffer to track faults on a per-node basis */ 1753 /* Allocate buffer to track faults on a per-node basis */
1606 if (unlikely(!p->numa_faults)) { 1754 if (unlikely(!p->numa_faults_memory)) {
1607 int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; 1755 int size = sizeof(*p->numa_faults_memory) *
1756 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
1608 1757
1609 /* numa_faults and numa_faults_buffer share the allocation */ 1758 p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
1610 p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); 1759 if (!p->numa_faults_memory)
1611 if (!p->numa_faults)
1612 return; 1760 return;
1613 1761
1614 BUG_ON(p->numa_faults_buffer); 1762 BUG_ON(p->numa_faults_buffer_memory);
1615 p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); 1763 /*
1764 * The averaged statistics, shared & private, memory & cpu,
1765 * occupy the first half of the array. The second half of the
1766 * array is for current counters, which are averaged into the
1767 * first set by task_numa_placement.
1768 */
1769 p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
1770 p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
1771 p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
1616 p->total_numa_faults = 0; 1772 p->total_numa_faults = 0;
1617 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 1773 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1618 } 1774 }
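The per-task buffer allocated above packs four regions of 2 * nr_node_ids counters into one kzalloc: averaged memory faults, averaged cpu faults, then the two per-window buffers that task_numa_placement() folds back into the averages. A minimal sketch of that pointer carving; faults_idx() is a hypothetical stand-in for task_faults_idx():

#include <stdlib.h>

/* Hypothetical helper mirroring task_faults_idx(): two counters per node. */
static inline int faults_idx(int nid, int priv)
{
	return 2 * nid + priv;
}

static int alloc_task_faults(unsigned long **mem, unsigned long **cpu,
			     unsigned long **buf_mem, unsigned long **buf_cpu,
			     int nr_node_ids)
{
	/* 8 * nr_node_ids counters total: 4 regions of 2 * nr_node_ids each. */
	unsigned long *p = calloc(8 * nr_node_ids, sizeof(*p));

	if (!p)
		return -1;
	*mem     = p;                     /* averaged memory faults       */
	*cpu     = p + 2 * nr_node_ids;   /* averaged cpu faults          */
	*buf_mem = p + 4 * nr_node_ids;   /* current-window memory faults */
	*buf_cpu = p + 6 * nr_node_ids;   /* current-window cpu faults    */
	return 0;
}

On a hinting fault, pages then land in buf_mem[faults_idx(mem_node, priv)] and buf_cpu[faults_idx(cpu_node, priv)], which is exactly what the next hunk does.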
@@ -1641,7 +1797,8 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
1641 if (migrated) 1797 if (migrated)
1642 p->numa_pages_migrated += pages; 1798 p->numa_pages_migrated += pages;
1643 1799
1644 p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; 1800 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
1801 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
1645 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; 1802 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
1646} 1803}
1647 1804
@@ -2219,13 +2376,20 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
2219 se->avg.load_avg_contrib >>= NICE_0_SHIFT; 2376 se->avg.load_avg_contrib >>= NICE_0_SHIFT;
2220 } 2377 }
2221} 2378}
2222#else 2379
2380static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
2381{
2382 __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
2383 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
2384}
2385#else /* CONFIG_FAIR_GROUP_SCHED */
2223static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, 2386static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2224 int force_update) {} 2387 int force_update) {}
2225static inline void __update_tg_runnable_avg(struct sched_avg *sa, 2388static inline void __update_tg_runnable_avg(struct sched_avg *sa,
2226 struct cfs_rq *cfs_rq) {} 2389 struct cfs_rq *cfs_rq) {}
2227static inline void __update_group_entity_contrib(struct sched_entity *se) {} 2390static inline void __update_group_entity_contrib(struct sched_entity *se) {}
2228#endif 2391static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
2392#endif /* CONFIG_FAIR_GROUP_SCHED */
2229 2393
2230static inline void __update_task_entity_contrib(struct sched_entity *se) 2394static inline void __update_task_entity_contrib(struct sched_entity *se)
2231{ 2395{
@@ -2323,12 +2487,6 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
2323 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); 2487 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
2324} 2488}
2325 2489
2326static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
2327{
2328 __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
2329 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
2330}
2331
2332/* Add the load generated by se into cfs_rq's child load-average */ 2490/* Add the load generated by se into cfs_rq's child load-average */
2333static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, 2491static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2334 struct sched_entity *se, 2492 struct sched_entity *se,
@@ -2416,7 +2574,10 @@ void idle_exit_fair(struct rq *this_rq)
2416 update_rq_runnable_avg(this_rq, 0); 2574 update_rq_runnable_avg(this_rq, 0);
2417} 2575}
2418 2576
2419#else 2577static int idle_balance(struct rq *this_rq);
2578
2579#else /* CONFIG_SMP */
2580
2420static inline void update_entity_load_avg(struct sched_entity *se, 2581static inline void update_entity_load_avg(struct sched_entity *se,
2421 int update_cfs_rq) {} 2582 int update_cfs_rq) {}
2422static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} 2583static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
@@ -2428,7 +2589,13 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2428 int sleep) {} 2589 int sleep) {}
2429static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, 2590static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
2430 int force_update) {} 2591 int force_update) {}
2431#endif 2592
2593static inline int idle_balance(struct rq *rq)
2594{
2595 return 0;
2596}
2597
2598#endif /* CONFIG_SMP */
2432 2599
2433static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 2600static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
2434{ 2601{
@@ -2578,10 +2745,10 @@ static void __clear_buddies_last(struct sched_entity *se)
2578{ 2745{
2579 for_each_sched_entity(se) { 2746 for_each_sched_entity(se) {
2580 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2747 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2581 if (cfs_rq->last == se) 2748 if (cfs_rq->last != se)
2582 cfs_rq->last = NULL;
2583 else
2584 break; 2749 break;
2750
2751 cfs_rq->last = NULL;
2585 } 2752 }
2586} 2753}
2587 2754
@@ -2589,10 +2756,10 @@ static void __clear_buddies_next(struct sched_entity *se)
2589{ 2756{
2590 for_each_sched_entity(se) { 2757 for_each_sched_entity(se) {
2591 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2758 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2592 if (cfs_rq->next == se) 2759 if (cfs_rq->next != se)
2593 cfs_rq->next = NULL;
2594 else
2595 break; 2760 break;
2761
2762 cfs_rq->next = NULL;
2596 } 2763 }
2597} 2764}
2598 2765
@@ -2600,10 +2767,10 @@ static void __clear_buddies_skip(struct sched_entity *se)
2600{ 2767{
2601 for_each_sched_entity(se) { 2768 for_each_sched_entity(se) {
2602 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2769 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2603 if (cfs_rq->skip == se) 2770 if (cfs_rq->skip != se)
2604 cfs_rq->skip = NULL;
2605 else
2606 break; 2771 break;
2772
2773 cfs_rq->skip = NULL;
2607 } 2774 }
2608} 2775}
2609 2776
@@ -2746,17 +2913,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
2746 * 3) pick the "last" process, for cache locality 2913 * 3) pick the "last" process, for cache locality
2747 * 4) do not run the "skip" process, if something else is available 2914 * 4) do not run the "skip" process, if something else is available
2748 */ 2915 */
2749static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 2916static struct sched_entity *
2917pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2750{ 2918{
2751 struct sched_entity *se = __pick_first_entity(cfs_rq); 2919 struct sched_entity *left = __pick_first_entity(cfs_rq);
2752 struct sched_entity *left = se; 2920 struct sched_entity *se;
2921
2922 /*
2923 * If curr is set we have to see if its left of the leftmost entity
2924 * still in the tree, provided there was anything in the tree at all.
2925 */
2926 if (!left || (curr && entity_before(curr, left)))
2927 left = curr;
2928
2929 se = left; /* ideally we run the leftmost entity */
2753 2930
2754 /* 2931 /*
2755 * Avoid running the skip buddy, if running something else can 2932 * Avoid running the skip buddy, if running something else can
2756 * be done without getting too unfair. 2933 * be done without getting too unfair.
2757 */ 2934 */
2758 if (cfs_rq->skip == se) { 2935 if (cfs_rq->skip == se) {
2759 struct sched_entity *second = __pick_next_entity(se); 2936 struct sched_entity *second;
2937
2938 if (se == curr) {
2939 second = __pick_first_entity(cfs_rq);
2940 } else {
2941 second = __pick_next_entity(se);
2942 if (!second || (curr && entity_before(curr, second)))
2943 second = curr;
2944 }
2945
2760 if (second && wakeup_preempt_entity(second, left) < 1) 2946 if (second && wakeup_preempt_entity(second, left) < 1)
2761 se = second; 2947 se = second;
2762 } 2948 }
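Because set_next_entity() takes the running entity out of the rbtree, the pick above has to weigh cfs_rq->curr against the tree's leftmost entity by hand. A toy, kernel-free sketch of just that comparison (the toy_* names are illustrative):

/*
 * Toy model: the running entity is kept outside the rbtree while it runs,
 * so "leftmost" must be computed as min(tree leftmost, curr) by vruntime.
 */
struct toy_entity {
	unsigned long long vruntime;
};

static inline int toy_before(struct toy_entity *a, struct toy_entity *b)
{
	return (long long)(a->vruntime - b->vruntime) < 0;
}

static struct toy_entity *toy_pick(struct toy_entity *tree_leftmost,
				   struct toy_entity *curr)
{
	struct toy_entity *left = tree_leftmost;

	/* Nothing queued, or curr still has the smallest vruntime. */
	if (!left || (curr && toy_before(curr, left)))
		left = curr;

	return left;	/* buddies (skip/next/last) may still override this */
}

The same care shows up in the skip-buddy branch above: when the skipped entity is curr itself, 'second' has to come from the tree via __pick_first_entity(), otherwise from the tree successor with curr as a possible fallback.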
@@ -2778,7 +2964,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
2778 return se; 2964 return se;
2779} 2965}
2780 2966
2781static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); 2967static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
2782 2968
2783static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) 2969static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
2784{ 2970{
@@ -3433,22 +3619,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
3433} 3619}
3434 3620
3435/* conditionally throttle active cfs_rq's from put_prev_entity() */ 3621/* conditionally throttle active cfs_rq's from put_prev_entity() */
3436static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) 3622static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3437{ 3623{
3438 if (!cfs_bandwidth_used()) 3624 if (!cfs_bandwidth_used())
3439 return; 3625 return false;
3440 3626
3441 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) 3627 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
3442 return; 3628 return false;
3443 3629
3444 /* 3630 /*
3445 * it's possible for a throttled entity to be forced into a running 3631 * it's possible for a throttled entity to be forced into a running
3446 * state (e.g. set_curr_task), in this case we're finished. 3632 * state (e.g. set_curr_task), in this case we're finished.
3447 */ 3633 */
3448 if (cfs_rq_throttled(cfs_rq)) 3634 if (cfs_rq_throttled(cfs_rq))
3449 return; 3635 return true;
3450 3636
3451 throttle_cfs_rq(cfs_rq); 3637 throttle_cfs_rq(cfs_rq);
3638 return true;
3452} 3639}
3453 3640
3454static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) 3641static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -3558,7 +3745,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3558} 3745}
3559 3746
3560static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} 3747static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
3561static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3748static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
3562static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 3749static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
3563static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3750static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3564 3751
@@ -4213,13 +4400,14 @@ done:
4213} 4400}
4214 4401
4215/* 4402/*
4216 * sched_balance_self: balance the current task (running on cpu) in domains 4403 * select_task_rq_fair: Select target runqueue for the waking task in domains
4217 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and 4404 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
4218 * SD_BALANCE_EXEC. 4405 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
4219 * 4406 *
4220 * Balance, ie. select the least loaded group. 4407 * Balances load by selecting the idlest cpu in the idlest group, or under
4408 * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
4221 * 4409 *
4222 * Returns the target CPU number, or the same CPU if no balancing is needed. 4410 * Returns the target cpu number.
4223 * 4411 *
4224 * preempt must be disabled. 4412 * preempt must be disabled.
4225 */ 4413 */
@@ -4494,26 +4682,124 @@ preempt:
4494 set_last_buddy(se); 4682 set_last_buddy(se);
4495} 4683}
4496 4684
4497static struct task_struct *pick_next_task_fair(struct rq *rq) 4685static struct task_struct *
4686pick_next_task_fair(struct rq *rq, struct task_struct *prev)
4498{ 4687{
4499 struct task_struct *p;
4500 struct cfs_rq *cfs_rq = &rq->cfs; 4688 struct cfs_rq *cfs_rq = &rq->cfs;
4501 struct sched_entity *se; 4689 struct sched_entity *se;
4690 struct task_struct *p;
4691 int new_tasks;
4502 4692
4693again:
4694#ifdef CONFIG_FAIR_GROUP_SCHED
4503 if (!cfs_rq->nr_running) 4695 if (!cfs_rq->nr_running)
4504 return NULL; 4696 goto idle;
4697
4698 if (prev->sched_class != &fair_sched_class)
4699 goto simple;
4700
4701 /*
4702 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
4703 * likely that a next task is from the same cgroup as the current.
4704 *
4705 * Therefore attempt to avoid putting and setting the entire cgroup
4706 * hierarchy, only change the part that actually changes.
4707 */
4708
4709 do {
4710 struct sched_entity *curr = cfs_rq->curr;
4711
4712 /*
4713 * Since we got here without doing put_prev_entity() we also
4714 * have to consider cfs_rq->curr. If it is still a runnable
4715 * entity, update_curr() will update its vruntime, otherwise
4716 * forget we've ever seen it.
4717 */
4718 if (curr && curr->on_rq)
4719 update_curr(cfs_rq);
4720 else
4721 curr = NULL;
4722
4723 /*
4724 * This call to check_cfs_rq_runtime() will do the throttle and
4725 * dequeue its entity in the parent(s). Therefore the 'simple'
4726 * nr_running test will indeed be correct.
4727 */
4728 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
4729 goto simple;
4730
4731 se = pick_next_entity(cfs_rq, curr);
4732 cfs_rq = group_cfs_rq(se);
4733 } while (cfs_rq);
4734
4735 p = task_of(se);
4736
4737 /*
4738 * Since we haven't yet done put_prev_entity and if the selected task
4739 * is a different task than we started out with, try and touch the
4740 * least amount of cfs_rqs.
4741 */
4742 if (prev != p) {
4743 struct sched_entity *pse = &prev->se;
4744
4745 while (!(cfs_rq = is_same_group(se, pse))) {
4746 int se_depth = se->depth;
4747 int pse_depth = pse->depth;
4748
4749 if (se_depth <= pse_depth) {
4750 put_prev_entity(cfs_rq_of(pse), pse);
4751 pse = parent_entity(pse);
4752 }
4753 if (se_depth >= pse_depth) {
4754 set_next_entity(cfs_rq_of(se), se);
4755 se = parent_entity(se);
4756 }
4757 }
4758
4759 put_prev_entity(cfs_rq, pse);
4760 set_next_entity(cfs_rq, se);
4761 }
4762
4763 if (hrtick_enabled(rq))
4764 hrtick_start_fair(rq, p);
4765
4766 return p;
4767simple:
4768 cfs_rq = &rq->cfs;
4769#endif
4770
4771 if (!cfs_rq->nr_running)
4772 goto idle;
4773
4774 put_prev_task(rq, prev);
4505 4775
4506 do { 4776 do {
4507 se = pick_next_entity(cfs_rq); 4777 se = pick_next_entity(cfs_rq, NULL);
4508 set_next_entity(cfs_rq, se); 4778 set_next_entity(cfs_rq, se);
4509 cfs_rq = group_cfs_rq(se); 4779 cfs_rq = group_cfs_rq(se);
4510 } while (cfs_rq); 4780 } while (cfs_rq);
4511 4781
4512 p = task_of(se); 4782 p = task_of(se);
4783
4513 if (hrtick_enabled(rq)) 4784 if (hrtick_enabled(rq))
4514 hrtick_start_fair(rq, p); 4785 hrtick_start_fair(rq, p);
4515 4786
4516 return p; 4787 return p;
4788
4789idle:
4790 new_tasks = idle_balance(rq);
4791 /*
4792 * Because idle_balance() releases (and re-acquires) rq->lock, it is
4793 * possible for any higher priority task to appear. In that case we
4794 * must re-start the pick_next_entity() loop.
4795 */
4796 if (new_tasks < 0)
4797 return RETRY_TASK;
4798
4799 if (new_tasks > 0)
4800 goto again;
4801
4802 return NULL;
4517} 4803}
4518 4804
4519/* 4805/*
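The prev != p path above avoids putting and re-setting the whole cgroup hierarchy by walking both entity chains upward and only touching the levels that differ. A standalone sketch of that two-pointer walk; "same parent" stands in for is_same_group(), and put_level()/set_level() for put_prev_entity()/set_next_entity():

#include <stdio.h>

struct node {
	int depth;
	struct node *parent;
};

static void put_level(struct node *n) { printf("put depth %d\n", n->depth); }
static void set_level(struct node *n) { printf("set depth %d\n", n->depth); }

/*
 * Walk both parent chains up until the two entities sit in the same group,
 * touching only the levels that differ, then put/set the common level.
 */
static void switch_hierarchies(struct node *se, struct node *pse)
{
	while (se->parent != pse->parent) {
		int se_depth = se->depth;
		int pse_depth = pse->depth;

		if (se_depth <= pse_depth) {
			put_level(pse);
			pse = pse->parent;
		}
		if (se_depth >= pse_depth) {
			set_level(se);
			se = se->parent;
		}
	}
	put_level(pse);
	set_level(se);
}

int main(void)
{
	struct node group_a = { 0, NULL }, group_b = { 0, NULL };
	struct node task_a = { 1, &group_a }, task_b = { 1, &group_b };

	/* prev was task_b (in group B), next is task_a (in group A) */
	switch_hierarchies(&task_a, &task_b);
	return 0;
}

Running it for two tasks in sibling top-level groups prints put/set at depth 1 for the differing group queues, then put/set at depth 0 for the shared root cfs_rq.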
@@ -4751,7 +5037,7 @@ static void move_task(struct task_struct *p, struct lb_env *env)
4751 * Is this task likely cache-hot: 5037 * Is this task likely cache-hot:
4752 */ 5038 */
4753static int 5039static int
4754task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 5040task_hot(struct task_struct *p, u64 now)
4755{ 5041{
4756 s64 delta; 5042 s64 delta;
4757 5043
@@ -4785,7 +5071,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
4785{ 5071{
4786 int src_nid, dst_nid; 5072 int src_nid, dst_nid;
4787 5073
4788 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || 5074 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
4789 !(env->sd->flags & SD_NUMA)) { 5075 !(env->sd->flags & SD_NUMA)) {
4790 return false; 5076 return false;
4791 } 5077 }
@@ -4816,7 +5102,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
4816 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) 5102 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
4817 return false; 5103 return false;
4818 5104
4819 if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) 5105 if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
4820 return false; 5106 return false;
4821 5107
4822 src_nid = cpu_to_node(env->src_cpu); 5108 src_nid = cpu_to_node(env->src_cpu);
@@ -4912,7 +5198,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
4912 * 2) task is cache cold, or 5198 * 2) task is cache cold, or
4913 * 3) too many balance attempts have failed. 5199 * 3) too many balance attempts have failed.
4914 */ 5200 */
4915 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); 5201 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq));
4916 if (!tsk_cache_hot) 5202 if (!tsk_cache_hot)
4917 tsk_cache_hot = migrate_degrades_locality(p, env); 5203 tsk_cache_hot = migrate_degrades_locality(p, env);
4918 5204
@@ -5775,12 +6061,10 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
5775 pwr_now /= SCHED_POWER_SCALE; 6061 pwr_now /= SCHED_POWER_SCALE;
5776 6062
5777 /* Amount of load we'd subtract */ 6063 /* Amount of load we'd subtract */
5778 tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / 6064 if (busiest->avg_load > scaled_busy_load_per_task) {
5779 busiest->group_power;
5780 if (busiest->avg_load > tmp) {
5781 pwr_move += busiest->group_power * 6065 pwr_move += busiest->group_power *
5782 min(busiest->load_per_task, 6066 min(busiest->load_per_task,
5783 busiest->avg_load - tmp); 6067 busiest->avg_load - scaled_busy_load_per_task);
5784 } 6068 }
5785 6069
5786 /* Amount of load we'd add */ 6070 /* Amount of load we'd add */
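The dropped expression appears to be a straight recomputation of the scaled_busy_load_per_task local established earlier in fix_small_imbalance(); assuming that, the identity behind the hunk is simply:

/*
 * (busiest->load_per_task * SCHED_POWER_SCALE) / busiest->group_power
 *     == scaled_busy_load_per_task
 *
 * so the "amount of load we'd subtract" branch can reuse the existing
 * local instead of computing the same scaled value a second time.
 */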
@@ -6359,17 +6643,23 @@ out:
6359 * idle_balance is called by schedule() if this_cpu is about to become 6643 * idle_balance is called by schedule() if this_cpu is about to become
6360 * idle. Attempts to pull tasks from other CPUs. 6644 * idle. Attempts to pull tasks from other CPUs.
6361 */ 6645 */
6362void idle_balance(int this_cpu, struct rq *this_rq) 6646static int idle_balance(struct rq *this_rq)
6363{ 6647{
6364 struct sched_domain *sd; 6648 struct sched_domain *sd;
6365 int pulled_task = 0; 6649 int pulled_task = 0;
6366 unsigned long next_balance = jiffies + HZ; 6650 unsigned long next_balance = jiffies + HZ;
6367 u64 curr_cost = 0; 6651 u64 curr_cost = 0;
6652 int this_cpu = this_rq->cpu;
6368 6653
6654 idle_enter_fair(this_rq);
6655 /*
6656 * We must set idle_stamp _before_ calling idle_balance(), such that we
6657 * measure the duration of idle_balance() as idle time.
6658 */
6369 this_rq->idle_stamp = rq_clock(this_rq); 6659 this_rq->idle_stamp = rq_clock(this_rq);
6370 6660
6371 if (this_rq->avg_idle < sysctl_sched_migration_cost) 6661 if (this_rq->avg_idle < sysctl_sched_migration_cost)
6372 return; 6662 goto out;
6373 6663
6374 /* 6664 /*
6375 * Drop the rq->lock, but keep IRQ/preempt disabled. 6665 * Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6407,15 +6697,22 @@ void idle_balance(int this_cpu, struct rq *this_rq)
6407 interval = msecs_to_jiffies(sd->balance_interval); 6697 interval = msecs_to_jiffies(sd->balance_interval);
6408 if (time_after(next_balance, sd->last_balance + interval)) 6698 if (time_after(next_balance, sd->last_balance + interval))
6409 next_balance = sd->last_balance + interval; 6699 next_balance = sd->last_balance + interval;
6410 if (pulled_task) { 6700 if (pulled_task)
6411 this_rq->idle_stamp = 0;
6412 break; 6701 break;
6413 }
6414 } 6702 }
6415 rcu_read_unlock(); 6703 rcu_read_unlock();
6416 6704
6417 raw_spin_lock(&this_rq->lock); 6705 raw_spin_lock(&this_rq->lock);
6418 6706
6707 /*
6708 * While browsing the domains, we released the rq lock.
 6709 * A task could have been enqueued in the meantime
6710 */
6711 if (this_rq->cfs.h_nr_running && !pulled_task) {
6712 pulled_task = 1;
6713 goto out;
6714 }
6715
6419 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 6716 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
6420 /* 6717 /*
6421 * We are going idle. next_balance may be set based on 6718 * We are going idle. next_balance may be set based on
@@ -6426,6 +6723,20 @@ void idle_balance(int this_cpu, struct rq *this_rq)
6426 6723
6427 if (curr_cost > this_rq->max_idle_balance_cost) 6724 if (curr_cost > this_rq->max_idle_balance_cost)
6428 this_rq->max_idle_balance_cost = curr_cost; 6725 this_rq->max_idle_balance_cost = curr_cost;
6726
6727out:
6728 /* Is there a task of a high priority class? */
6729 if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
6730 (this_rq->dl.dl_nr_running ||
6731 (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
6732 pulled_task = -1;
6733
6734 if (pulled_task) {
6735 idle_exit_fair(this_rq);
6736 this_rq->idle_stamp = 0;
6737 }
6738
6739 return pulled_task;
6429} 6740}
6430 6741
6431/* 6742/*
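idle_balance() now reports a tri-state result instead of returning void; a short summary of how the idle: path in pick_next_task_fair() above is expected to read it:

/*
 * New idle_balance() return convention (as consumed by the idle: label in
 * pick_next_task_fair() above):
 *
 *   ret > 0   CFS tasks were pulled, or appeared while rq->lock was
 *             dropped: retry the fair pick ("goto again")
 *   ret == 0  nothing runnable in this class: return NULL and go idle
 *   ret < 0   a higher-priority (DL, or non-throttled RT) task showed up
 *             while the lock was dropped: return RETRY_TASK so the core
 *             restarts class selection
 */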
@@ -6496,6 +6807,11 @@ out_unlock:
6496 return 0; 6807 return 0;
6497} 6808}
6498 6809
6810static inline int on_null_domain(struct rq *rq)
6811{
6812 return unlikely(!rcu_dereference_sched(rq->sd));
6813}
6814
6499#ifdef CONFIG_NO_HZ_COMMON 6815#ifdef CONFIG_NO_HZ_COMMON
6500/* 6816/*
6501 * idle load balancing details 6817 * idle load balancing details
@@ -6550,8 +6866,13 @@ static void nohz_balancer_kick(void)
6550static inline void nohz_balance_exit_idle(int cpu) 6866static inline void nohz_balance_exit_idle(int cpu)
6551{ 6867{
6552 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 6868 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
6553 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 6869 /*
6554 atomic_dec(&nohz.nr_cpus); 6870 * Completely isolated CPUs don't ever set, so we must test.
6871 */
6872 if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
6873 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
6874 atomic_dec(&nohz.nr_cpus);
6875 }
6555 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 6876 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
6556 } 6877 }
6557} 6878}
@@ -6605,6 +6926,12 @@ void nohz_balance_enter_idle(int cpu)
6605 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) 6926 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
6606 return; 6927 return;
6607 6928
6929 /*
6930 * If we're a completely isolated CPU, we don't play.
6931 */
6932 if (on_null_domain(cpu_rq(cpu)))
6933 return;
6934
6608 cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 6935 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
6609 atomic_inc(&nohz.nr_cpus); 6936 atomic_inc(&nohz.nr_cpus);
6610 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 6937 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
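Both nohz hunks in this area guard against CPUs that sit on a NULL scheduler domain (fully isolated cores): such a CPU never enters nohz.idle_cpus_mask, so the exit path cannot blindly clear the bit and decrement the counter. A toy sketch of that pairing, with a plain bitmask standing in for the cpumask/atomic API and cpu_is_isolated() standing in for on_null_domain(cpu_rq(cpu)):

#include <stdbool.h>

static unsigned long idle_mask;	/* toy stand-in for nohz.idle_cpus_mask */
static int nr_idle;		/* toy stand-in for nohz.nr_cpus        */

static bool cpu_is_isolated(int cpu)	/* think: on_null_domain(cpu_rq(cpu)) */
{
	(void)cpu;
	return false;			/* toy default: not isolated */
}

static void toy_enter_idle(int cpu)
{
	if (cpu_is_isolated(cpu))
		return;			/* isolated CPUs never join the mask */
	idle_mask |= 1UL << cpu;
	nr_idle++;
}

static void toy_exit_idle(int cpu)
{
	/* Must test: an isolated CPU was never added in the first place. */
	if (idle_mask & (1UL << cpu)) {
		idle_mask &= ~(1UL << cpu);
		nr_idle--;
	}
}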
@@ -6867,11 +7194,6 @@ static void run_rebalance_domains(struct softirq_action *h)
6867 nohz_idle_balance(this_rq, idle); 7194 nohz_idle_balance(this_rq, idle);
6868} 7195}
6869 7196
6870static inline int on_null_domain(struct rq *rq)
6871{
6872 return !rcu_dereference_sched(rq->sd);
6873}
6874
6875/* 7197/*
6876 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 7198 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
6877 */ 7199 */
@@ -7036,7 +7358,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
7036 */ 7358 */
7037static void switched_to_fair(struct rq *rq, struct task_struct *p) 7359static void switched_to_fair(struct rq *rq, struct task_struct *p)
7038{ 7360{
7039 if (!p->se.on_rq) 7361 struct sched_entity *se = &p->se;
7362#ifdef CONFIG_FAIR_GROUP_SCHED
7363 /*
7364 * Since the real-depth could have been changed (only FAIR
7365 * class maintain depth value), reset depth properly.
7366 */
7367 se->depth = se->parent ? se->parent->depth + 1 : 0;
7368#endif
7369 if (!se->on_rq)
7040 return; 7370 return;
7041 7371
7042 /* 7372 /*
@@ -7084,7 +7414,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
7084#ifdef CONFIG_FAIR_GROUP_SCHED 7414#ifdef CONFIG_FAIR_GROUP_SCHED
7085static void task_move_group_fair(struct task_struct *p, int on_rq) 7415static void task_move_group_fair(struct task_struct *p, int on_rq)
7086{ 7416{
7417 struct sched_entity *se = &p->se;
7087 struct cfs_rq *cfs_rq; 7418 struct cfs_rq *cfs_rq;
7419
7088 /* 7420 /*
7089 * If the task was not on the rq at the time of this cgroup movement 7421 * If the task was not on the rq at the time of this cgroup movement
7090 * it must have been asleep, sleeping tasks keep their ->vruntime 7422 * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -7110,23 +7442,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
7110 * To prevent boost or penalty in the new cfs_rq caused by delta 7442 * To prevent boost or penalty in the new cfs_rq caused by delta
7111 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. 7443 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
7112 */ 7444 */
7113 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) 7445 if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
7114 on_rq = 1; 7446 on_rq = 1;
7115 7447
7116 if (!on_rq) 7448 if (!on_rq)
7117 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; 7449 se->vruntime -= cfs_rq_of(se)->min_vruntime;
7118 set_task_rq(p, task_cpu(p)); 7450 set_task_rq(p, task_cpu(p));
7451 se->depth = se->parent ? se->parent->depth + 1 : 0;
7119 if (!on_rq) { 7452 if (!on_rq) {
7120 cfs_rq = cfs_rq_of(&p->se); 7453 cfs_rq = cfs_rq_of(se);
7121 p->se.vruntime += cfs_rq->min_vruntime; 7454 se->vruntime += cfs_rq->min_vruntime;
7122#ifdef CONFIG_SMP 7455#ifdef CONFIG_SMP
7123 /* 7456 /*
7124 * migrate_task_rq_fair() will have removed our previous 7457 * migrate_task_rq_fair() will have removed our previous
7125 * contribution, but we must synchronize for ongoing future 7458 * contribution, but we must synchronize for ongoing future
7126 * decay. 7459 * decay.
7127 */ 7460 */
7128 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); 7461 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
7129 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; 7462 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
7130#endif 7463#endif
7131 } 7464 }
7132} 7465}
@@ -7222,10 +7555,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7222 if (!se) 7555 if (!se)
7223 return; 7556 return;
7224 7557
7225 if (!parent) 7558 if (!parent) {
7226 se->cfs_rq = &rq->cfs; 7559 se->cfs_rq = &rq->cfs;
7227 else 7560 se->depth = 0;
7561 } else {
7228 se->cfs_rq = parent->my_q; 7562 se->cfs_rq = parent->my_q;
7563 se->depth = parent->depth + 1;
7564 }
7229 7565
7230 se->my_q = cfs_rq; 7566 se->my_q = cfs_rq;
7231 /* guarantee group entities always have weight */ 7567 /* guarantee group entities always have weight */
diff --git a/kernel/cpu/idle.c b/kernel/sched/idle.c
index 277f494c2a9a..b7976a127178 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/sched/idle.c
@@ -3,6 +3,7 @@
3 */ 3 */
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/cpu.h> 5#include <linux/cpu.h>
6#include <linux/cpuidle.h>
6#include <linux/tick.h> 7#include <linux/tick.h>
7#include <linux/mm.h> 8#include <linux/mm.h>
8#include <linux/stackprotector.h> 9#include <linux/stackprotector.h>
@@ -95,8 +96,10 @@ static void cpu_idle_loop(void)
95 if (!current_clr_polling_and_test()) { 96 if (!current_clr_polling_and_test()) {
96 stop_critical_timings(); 97 stop_critical_timings();
97 rcu_idle_enter(); 98 rcu_idle_enter();
98 arch_cpu_idle(); 99 if (cpuidle_idle_call())
99 WARN_ON_ONCE(irqs_disabled()); 100 arch_cpu_idle();
101 if (WARN_ON_ONCE(irqs_disabled()))
102 local_irq_enable();
100 rcu_idle_exit(); 103 rcu_idle_exit();
101 start_critical_timings(); 104 start_critical_timings();
102 } else { 105 } else {
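As the call site above implies, cpuidle_idle_call() returns 0 when a cpuidle driver and governor handled the idle entry, and non-zero when they could not; a brief note on the resulting control flow:

/*
 * if (cpuidle_idle_call())   - no usable cpuidle state/driver ...
 *         arch_cpu_idle();   - ... fall back to the architecture default
 *
 * The WARN_ON_ONCE(irqs_disabled()) now also re-enables interrupts, so a
 * misbehaving idle routine warns once instead of leaving the CPU wedged
 * with IRQs off.
 */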
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 516c3d9ceea1..879f2b75266a 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,18 +13,8 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
13{ 13{
 14 return task_cpu(p); /* IDLE tasks are never migrated */ 14 return task_cpu(p); /* IDLE tasks are never migrated */
15} 15}
16
17static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
18{
19 idle_exit_fair(rq);
20 rq_last_tick_reset(rq);
21}
22
23static void post_schedule_idle(struct rq *rq)
24{
25 idle_enter_fair(rq);
26}
27#endif /* CONFIG_SMP */ 16#endif /* CONFIG_SMP */
17
28/* 18/*
29 * Idle tasks are unconditionally rescheduled: 19 * Idle tasks are unconditionally rescheduled:
30 */ 20 */
@@ -33,13 +23,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
33 resched_task(rq->idle); 23 resched_task(rq->idle);
34} 24}
35 25
36static struct task_struct *pick_next_task_idle(struct rq *rq) 26static struct task_struct *
27pick_next_task_idle(struct rq *rq, struct task_struct *prev)
37{ 28{
29 put_prev_task(rq, prev);
30
38 schedstat_inc(rq, sched_goidle); 31 schedstat_inc(rq, sched_goidle);
39#ifdef CONFIG_SMP
40 /* Trigger the post schedule to do an idle_enter for CFS */
41 rq->post_schedule = 1;
42#endif
43 return rq->idle; 32 return rq->idle;
44} 33}
45 34
@@ -58,6 +47,8 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
58 47
59static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) 48static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
60{ 49{
50 idle_exit_fair(rq);
51 rq_last_tick_reset(rq);
61} 52}
62 53
63static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 54static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
@@ -101,8 +92,6 @@ const struct sched_class idle_sched_class = {
101 92
102#ifdef CONFIG_SMP 93#ifdef CONFIG_SMP
103 .select_task_rq = select_task_rq_idle, 94 .select_task_rq = select_task_rq_idle,
104 .pre_schedule = pre_schedule_idle,
105 .post_schedule = post_schedule_idle,
106#endif 95#endif
107 96
108 .set_curr_task = set_curr_task_idle, 97 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 1999021042c7..d8cdf1618551 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -229,6 +229,14 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
229 229
230#ifdef CONFIG_SMP 230#ifdef CONFIG_SMP
231 231
232static int pull_rt_task(struct rq *this_rq);
233
234static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
235{
236 /* Try to pull RT tasks here if we lower this rq's prio */
237 return rq->rt.highest_prio.curr > prev->prio;
238}
239
232static inline int rt_overloaded(struct rq *rq) 240static inline int rt_overloaded(struct rq *rq)
233{ 241{
234 return atomic_read(&rq->rd->rto_count); 242 return atomic_read(&rq->rd->rto_count);
@@ -315,6 +323,15 @@ static inline int has_pushable_tasks(struct rq *rq)
315 return !plist_head_empty(&rq->rt.pushable_tasks); 323 return !plist_head_empty(&rq->rt.pushable_tasks);
316} 324}
317 325
326static inline void set_post_schedule(struct rq *rq)
327{
328 /*
329 * We detect this state here so that we can avoid taking the RQ
330 * lock again later if there is no need to push
331 */
332 rq->post_schedule = has_pushable_tasks(rq);
333}
334
318static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 335static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
319{ 336{
320 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 337 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
@@ -359,6 +376,19 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
359{ 376{
360} 377}
361 378
379static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
380{
381 return false;
382}
383
384static inline int pull_rt_task(struct rq *this_rq)
385{
386 return 0;
387}
388
389static inline void set_post_schedule(struct rq *rq)
390{
391}
362#endif /* CONFIG_SMP */ 392#endif /* CONFIG_SMP */
363 393
364static inline int on_rt_rq(struct sched_rt_entity *rt_se) 394static inline int on_rt_rq(struct sched_rt_entity *rt_se)
@@ -440,11 +470,6 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
440 dequeue_rt_entity(rt_se); 470 dequeue_rt_entity(rt_se);
441} 471}
442 472
443static inline int rt_rq_throttled(struct rt_rq *rt_rq)
444{
445 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
446}
447
448static int rt_se_boosted(struct sched_rt_entity *rt_se) 473static int rt_se_boosted(struct sched_rt_entity *rt_se)
449{ 474{
450 struct rt_rq *rt_rq = group_rt_rq(rt_se); 475 struct rt_rq *rt_rq = group_rt_rq(rt_se);
@@ -515,11 +540,6 @@ static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
515{ 540{
516} 541}
517 542
518static inline int rt_rq_throttled(struct rt_rq *rt_rq)
519{
520 return rt_rq->rt_throttled;
521}
522
523static inline const struct cpumask *sched_rt_period_mask(void) 543static inline const struct cpumask *sched_rt_period_mask(void)
524{ 544{
525 return cpu_online_mask; 545 return cpu_online_mask;
@@ -1318,15 +1338,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1318{ 1338{
1319 struct sched_rt_entity *rt_se; 1339 struct sched_rt_entity *rt_se;
1320 struct task_struct *p; 1340 struct task_struct *p;
1321 struct rt_rq *rt_rq; 1341 struct rt_rq *rt_rq = &rq->rt;
1322
1323 rt_rq = &rq->rt;
1324
1325 if (!rt_rq->rt_nr_running)
1326 return NULL;
1327
1328 if (rt_rq_throttled(rt_rq))
1329 return NULL;
1330 1342
1331 do { 1343 do {
1332 rt_se = pick_next_rt_entity(rq, rt_rq); 1344 rt_se = pick_next_rt_entity(rq, rt_rq);
@@ -1340,21 +1352,45 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1340 return p; 1352 return p;
1341} 1353}
1342 1354
1343static struct task_struct *pick_next_task_rt(struct rq *rq) 1355static struct task_struct *
1356pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1344{ 1357{
1345 struct task_struct *p = _pick_next_task_rt(rq); 1358 struct task_struct *p;
1359 struct rt_rq *rt_rq = &rq->rt;
1360
1361 if (need_pull_rt_task(rq, prev)) {
1362 pull_rt_task(rq);
1363 /*
1364 * pull_rt_task() can drop (and re-acquire) rq->lock; this
1365 * means a dl task can slip in, in which case we need to
1366 * re-start task selection.
1367 */
1368 if (unlikely(rq->dl.dl_nr_running))
1369 return RETRY_TASK;
1370 }
1371
1372 /*
1373 * We may dequeue prev's rt_rq in put_prev_task().
1374 * So, we update time before rt_nr_running check.
1375 */
1376 if (prev->sched_class == &rt_sched_class)
1377 update_curr_rt(rq);
1378
1379 if (!rt_rq->rt_nr_running)
1380 return NULL;
1381
1382 if (rt_rq_throttled(rt_rq))
1383 return NULL;
1384
1385 put_prev_task(rq, prev);
1386
1387 p = _pick_next_task_rt(rq);
1346 1388
1347 /* The running task is never eligible for pushing */ 1389 /* The running task is never eligible for pushing */
1348 if (p) 1390 if (p)
1349 dequeue_pushable_task(rq, p); 1391 dequeue_pushable_task(rq, p);
1350 1392
1351#ifdef CONFIG_SMP 1393 set_post_schedule(rq);
1352 /*
1353 * We detect this state here so that we can avoid taking the RQ
1354 * lock again later if there is no need to push
1355 */
1356 rq->post_schedule = has_pushable_tasks(rq);
1357#endif
1358 1394
1359 return p; 1395 return p;
1360} 1396}
@@ -1724,13 +1760,6 @@ skip:
1724 return ret; 1760 return ret;
1725} 1761}
1726 1762
1727static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1728{
1729 /* Try to pull RT tasks here if we lower this rq's prio */
1730 if (rq->rt.highest_prio.curr > prev->prio)
1731 pull_rt_task(rq);
1732}
1733
1734static void post_schedule_rt(struct rq *rq) 1763static void post_schedule_rt(struct rq *rq)
1735{ 1764{
1736 push_rt_tasks(rq); 1765 push_rt_tasks(rq);
@@ -1833,7 +1862,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1833 resched_task(rq->curr); 1862 resched_task(rq->curr);
1834} 1863}
1835 1864
1836void init_sched_rt_class(void) 1865void __init init_sched_rt_class(void)
1837{ 1866{
1838 unsigned int i; 1867 unsigned int i;
1839 1868
@@ -2007,7 +2036,6 @@ const struct sched_class rt_sched_class = {
2007 .set_cpus_allowed = set_cpus_allowed_rt, 2036 .set_cpus_allowed = set_cpus_allowed_rt,
2008 .rq_online = rq_online_rt, 2037 .rq_online = rq_online_rt,
2009 .rq_offline = rq_offline_rt, 2038 .rq_offline = rq_offline_rt,
2010 .pre_schedule = pre_schedule_rt,
2011 .post_schedule = post_schedule_rt, 2039 .post_schedule = post_schedule_rt,
2012 .task_woken = task_woken_rt, 2040 .task_woken = task_woken_rt,
2013 .switched_from = switched_from_rt, 2041 .switched_from = switched_from_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f964add50f38..f2de7a175620 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -24,24 +24,6 @@ extern long calc_load_fold_active(struct rq *this_rq);
24extern void update_cpu_load_active(struct rq *this_rq); 24extern void update_cpu_load_active(struct rq *this_rq);
25 25
26/* 26/*
27 * Convert user-nice values [ -20 ... 0 ... 19 ]
28 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
29 * and back.
30 */
31#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
32#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
33#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
34
35/*
36 * 'User priority' is the nice value converted to something we
37 * can work with better when scaling various scheduler parameters,
38 * it's a [ 0 ... 39 ] range.
39 */
40#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
41#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
42#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
43
44/*
45 * Helpers for converting nanosecond timing to jiffy resolution 27 * Helpers for converting nanosecond timing to jiffy resolution
46 */ 28 */
47#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 29#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
@@ -441,6 +423,18 @@ struct rt_rq {
441#endif 423#endif
442}; 424};
443 425
426#ifdef CONFIG_RT_GROUP_SCHED
427static inline int rt_rq_throttled(struct rt_rq *rt_rq)
428{
429 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
430}
431#else
432static inline int rt_rq_throttled(struct rt_rq *rt_rq)
433{
434 return rt_rq->rt_throttled;
435}
436#endif
437
444/* Deadline class' related fields in a runqueue */ 438/* Deadline class' related fields in a runqueue */
445struct dl_rq { 439struct dl_rq {
446 /* runqueue is an rbtree, ordered by deadline */ 440 /* runqueue is an rbtree, ordered by deadline */
@@ -558,11 +552,9 @@ struct rq {
558#ifdef CONFIG_FAIR_GROUP_SCHED 552#ifdef CONFIG_FAIR_GROUP_SCHED
559 /* list of leaf cfs_rq on this cpu: */ 553 /* list of leaf cfs_rq on this cpu: */
560 struct list_head leaf_cfs_rq_list; 554 struct list_head leaf_cfs_rq_list;
561#endif /* CONFIG_FAIR_GROUP_SCHED */
562 555
563#ifdef CONFIG_RT_GROUP_SCHED 556 struct sched_avg avg;
564 struct list_head leaf_rt_rq_list; 557#endif /* CONFIG_FAIR_GROUP_SCHED */
565#endif
566 558
567 /* 559 /*
568 * This is part of a global counter where only the total sum 560 * This is part of a global counter where only the total sum
@@ -651,8 +643,6 @@ struct rq {
651#ifdef CONFIG_SMP 643#ifdef CONFIG_SMP
652 struct llist_head wake_list; 644 struct llist_head wake_list;
653#endif 645#endif
654
655 struct sched_avg avg;
656}; 646};
657 647
658static inline int cpu_of(struct rq *rq) 648static inline int cpu_of(struct rq *rq)
@@ -1112,6 +1102,8 @@ static const u32 prio_to_wmult[40] = {
1112 1102
1113#define DEQUEUE_SLEEP 1 1103#define DEQUEUE_SLEEP 1
1114 1104
1105#define RETRY_TASK ((void *)-1UL)
1106
1115struct sched_class { 1107struct sched_class {
1116 const struct sched_class *next; 1108 const struct sched_class *next;
1117 1109
@@ -1122,14 +1114,22 @@ struct sched_class {
1122 1114
1123 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); 1115 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
1124 1116
1125 struct task_struct * (*pick_next_task) (struct rq *rq); 1117 /*
1118 * It is the responsibility of the pick_next_task() method that will
1119 * return the next task to call put_prev_task() on the @prev task or
1120 * something equivalent.
1121 *
1122 * May return RETRY_TASK when it finds a higher prio class has runnable
1123 * tasks.
1124 */
1125 struct task_struct * (*pick_next_task) (struct rq *rq,
1126 struct task_struct *prev);
1126 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1127 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
1127 1128
1128#ifdef CONFIG_SMP 1129#ifdef CONFIG_SMP
1129 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); 1130 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
1130 void (*migrate_task_rq)(struct task_struct *p, int next_cpu); 1131 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
1131 1132
1132 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
1133 void (*post_schedule) (struct rq *this_rq); 1133 void (*post_schedule) (struct rq *this_rq);
1134 void (*task_waking) (struct task_struct *task); 1134 void (*task_waking) (struct task_struct *task);
1135 void (*task_woken) (struct rq *this_rq, struct task_struct *task); 1135 void (*task_woken) (struct rq *this_rq, struct task_struct *task);
@@ -1159,6 +1159,11 @@ struct sched_class {
1159#endif 1159#endif
1160}; 1160};
1161 1161
1162static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
1163{
1164 prev->sched_class->put_prev_task(rq, prev);
1165}
1166
1162#define sched_class_highest (&stop_sched_class) 1167#define sched_class_highest (&stop_sched_class)
1163#define for_each_class(class) \ 1168#define for_each_class(class) \
1164 for (class = sched_class_highest; class; class = class->next) 1169 for (class = sched_class_highest; class; class = class->next)
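The comment above spells out the new contract: pick_next_task() must itself put @prev (or do something equivalent) and may return RETRY_TASK when a higher-priority class became runnable while rq->lock was dropped. A hedged sketch of the caller-side loop this implies, using only names visible in this header; it is an illustration, not a copy of the core.c picker:

/*
 * Illustrative core-side consumer of the contract; assumes rq->lock is
 * held and prev is the task being switched out.
 */
static struct task_struct *pick_from_classes(struct rq *rq,
					     struct task_struct *prev)
{
	const struct sched_class *class;
	struct task_struct *p;

again:
	for_each_class(class) {
		p = class->pick_next_task(rq, prev);
		if (p) {
			if (unlikely(p == RETRY_TASK))
				goto again;	/* higher class woke up: restart */
			return p;		/* class already put prev */
		}
	}

	/* The idle class always has something to run, so we never get here. */
	BUG();
}

Each class either returns a task (after having put prev), returns NULL to let the next-lower class try, or returns RETRY_TASK to restart from the highest class.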
@@ -1175,16 +1180,14 @@ extern const struct sched_class idle_sched_class;
1175extern void update_group_power(struct sched_domain *sd, int cpu); 1180extern void update_group_power(struct sched_domain *sd, int cpu);
1176 1181
1177extern void trigger_load_balance(struct rq *rq); 1182extern void trigger_load_balance(struct rq *rq);
1178extern void idle_balance(int this_cpu, struct rq *this_rq);
1179 1183
1180extern void idle_enter_fair(struct rq *this_rq); 1184extern void idle_enter_fair(struct rq *this_rq);
1181extern void idle_exit_fair(struct rq *this_rq); 1185extern void idle_exit_fair(struct rq *this_rq);
1182 1186
1183#else /* CONFIG_SMP */ 1187#else
1184 1188
1185static inline void idle_balance(int cpu, struct rq *rq) 1189static inline void idle_enter_fair(struct rq *rq) { }
1186{ 1190static inline void idle_exit_fair(struct rq *rq) { }
1187}
1188 1191
1189#endif 1192#endif
1190 1193
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index fdb6bb0b3356..d6ce65dde541 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,16 +23,19 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
23 /* we're never preempted */ 23 /* we're never preempted */
24} 24}
25 25
26static struct task_struct *pick_next_task_stop(struct rq *rq) 26static struct task_struct *
27pick_next_task_stop(struct rq *rq, struct task_struct *prev)
27{ 28{
28 struct task_struct *stop = rq->stop; 29 struct task_struct *stop = rq->stop;
29 30
30 if (stop && stop->on_rq) { 31 if (!stop || !stop->on_rq)
31 stop->se.exec_start = rq_clock_task(rq); 32 return NULL;
32 return stop;
33 }
34 33
35 return NULL; 34 put_prev_task(rq, prev);
35
36 stop->se.exec_start = rq_clock_task(rq);
37
38 return stop;
36} 39}
37 40
38static void 41static void
diff --git a/kernel/sys.c b/kernel/sys.c
index c0a58be780a4..adaeab6f7a87 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -174,10 +174,10 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
174 174
175 /* normalize: avoid signed division (rounding problems) */ 175 /* normalize: avoid signed division (rounding problems) */
176 error = -ESRCH; 176 error = -ESRCH;
177 if (niceval < -20) 177 if (niceval < MIN_NICE)
178 niceval = -20; 178 niceval = MIN_NICE;
179 if (niceval > 19) 179 if (niceval > MAX_NICE)
180 niceval = 19; 180 niceval = MAX_NICE;
181 181
182 rcu_read_lock(); 182 rcu_read_lock();
183 read_lock(&tasklist_lock); 183 read_lock(&tasklist_lock);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 49e13e1f8fe6..7754ff16f334 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -386,13 +386,6 @@ static struct ctl_table kern_table[] = {
386 .proc_handler = proc_dointvec, 386 .proc_handler = proc_dointvec,
387 }, 387 },
388 { 388 {
389 .procname = "numa_balancing_migrate_deferred",
390 .data = &sysctl_numa_balancing_migrate_deferred,
391 .maxlen = sizeof(unsigned int),
392 .mode = 0644,
393 .proc_handler = proc_dointvec,
394 },
395 {
396 .procname = "numa_balancing", 389 .procname = "numa_balancing",
397 .data = NULL, /* filled in by handler */ 390 .data = NULL, /* filled in by handler */
398 .maxlen = sizeof(unsigned int), 391 .maxlen = sizeof(unsigned int),
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index a5457d577b98..0434ff1b808e 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -40,8 +40,8 @@ static int write_iteration = 50;
40module_param(write_iteration, uint, 0644); 40module_param(write_iteration, uint, 0644);
41MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); 41MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
42 42
43static int producer_nice = 19; 43static int producer_nice = MAX_NICE;
44static int consumer_nice = 19; 44static int consumer_nice = MAX_NICE;
45 45
46static int producer_fifo = -1; 46static int producer_fifo = -1;
47static int consumer_fifo = -1; 47static int consumer_fifo = -1;
@@ -308,7 +308,7 @@ static void ring_buffer_producer(void)
308 308
309 /* Let the user know that the test is running at low priority */ 309 /* Let the user know that the test is running at low priority */
310 if (producer_fifo < 0 && consumer_fifo < 0 && 310 if (producer_fifo < 0 && consumer_fifo < 0 &&
311 producer_nice == 19 && consumer_nice == 19) 311 producer_nice == MAX_NICE && consumer_nice == MAX_NICE)
312 trace_printk("WARNING!!! This test is running at lowest priority.\n"); 312 trace_printk("WARNING!!! This test is running at lowest priority.\n");
313 313
314 trace_printk("Time: %lld (usecs)\n", time); 314 trace_printk("Time: %lld (usecs)\n", time);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 193e977a10ea..3fa5b8f3aae3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3225,7 +3225,7 @@ static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
3225 return -ENOMEM; 3225 return -ENOMEM;
3226 3226
3227 if (sscanf(buf, "%d", &attrs->nice) == 1 && 3227 if (sscanf(buf, "%d", &attrs->nice) == 1 &&
3228 attrs->nice >= -20 && attrs->nice <= 19) 3228 attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
3229 ret = apply_workqueue_attrs(wq, attrs); 3229 ret = apply_workqueue_attrs(wq, attrs);
3230 else 3230 else
3231 ret = -EINVAL; 3231 ret = -EINVAL;