author    Linus Torvalds <torvalds@linux-foundation.org>    2014-10-13 10:23:15 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2014-10-13 10:23:15 -0400
commit    faafcba3b5e15999cf75d5c5a513ac8e47e2545f (patch)
tree      47d58d1c00e650e820506c91eb9a41268756bdda /kernel
parent    13ead805c5a14b0e7ecd34f61404a5bfba655895 (diff)
parent    f10e00f4bf360c36edbe6bf18a6c75b171cbe012 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
 "The main changes in this cycle were:

   - Optimized support for Intel "Cluster-on-Die" (CoD) topologies (Dave Hansen)

   - Various sched/idle refinements for better idle handling (Nicolas Pitre, Daniel Lezcano, Chuansheng Liu, Vincent Guittot)

   - sched/numa updates and optimizations (Rik van Riel)

   - sysbench speedup (Vincent Guittot)

   - capacity calculation cleanups/refactoring (Vincent Guittot)

   - Various cleanups to thread group iteration (Oleg Nesterov)

   - Double-rq-lock removal optimization and various refactorings (Kirill Tkhai)

   - various sched/deadline fixes

  ... and lots of other changes"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (72 commits)
  sched/dl: Use dl_bw_of() under rcu_read_lock_sched()
  sched/fair: Delete resched_cpu() from idle_balance()
  sched, time: Fix build error with 64 bit cputime_t on 32 bit systems
  sched: Improve sysbench performance by fixing spurious active migration
  sched/x86: Fix up typo in topology detection
  x86, sched: Add new topology for multi-NUMA-node CPUs
  sched/rt: Use resched_curr() in task_tick_rt()
  sched: Use rq->rd in sched_setaffinity() under RCU read lock
  sched: cleanup: Rename 'out_unlock' to 'out_free_new_mask'
  sched: Use dl_bw_of() under RCU read lock
  sched/fair: Remove duplicate code from can_migrate_task()
  sched, mips, ia64: Remove __ARCH_WANT_UNLOCKED_CTXSW
  sched: print_rq(): Don't use tasklist_lock
  sched: normalize_rt_tasks(): Don't use _irqsave for tasklist_lock, use task_rq_lock()
  sched: Fix the task-group check in tg_has_rt_tasks()
  sched/fair: Leverage the idle state info when choosing the "idlest" cpu
  sched: Let the scheduler see CPU idle states
  sched/deadline: Fix inter- exclusive cpusets migrations
  sched/deadline: Clear dl_entity params when setscheduling to different class
  sched/numa: Kill the wrong/dead TASK_DEAD check in task_numa_fault()
  ...
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/exit.c                          47
-rw-r--r--  kernel/fork.c                          13
-rw-r--r--  kernel/sched/auto_group.c               5
-rw-r--r--  kernel/sched/core.c                   295
-rw-r--r--  kernel/sched/cpudeadline.c              4
-rw-r--r--  kernel/sched/cputime.c                 64
-rw-r--r--  kernel/sched/deadline.c                33
-rw-r--r--  kernel/sched/debug.c                   13
-rw-r--r--  kernel/sched/fair.c                   479
-rw-r--r--  kernel/sched/idle.c                     6
-rw-r--r--  kernel/sched/rt.c                      21
-rw-r--r--  kernel/sched/sched.h                   80
-rw-r--r--  kernel/sched/stop_task.c                2
-rw-r--r--  kernel/smp.c                           22
-rw-r--r--  kernel/sys.c                            2
-rw-r--r--  kernel/time/hrtimer.c                   1
-rw-r--r--  kernel/time/posix-cpu-timers.c         14
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c    3
-rw-r--r--  kernel/trace/trace_stack.c              4
19 files changed, 667 insertions, 441 deletions
diff --git a/kernel/exit.c b/kernel/exit.c
index d13f2eec4bb8..5d30019ff953 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -115,32 +115,33 @@ static void __exit_signal(struct task_struct *tsk)
115 115
116 if (tsk == sig->curr_target) 116 if (tsk == sig->curr_target)
117 sig->curr_target = next_thread(tsk); 117 sig->curr_target = next_thread(tsk);
118 /*
119 * Accumulate here the counters for all threads but the
120 * group leader as they die, so they can be added into
121 * the process-wide totals when those are taken.
122 * The group leader stays around as a zombie as long
123 * as there are other threads. When it gets reaped,
124 * the exit.c code will add its counts into these totals.
125 * We won't ever get here for the group leader, since it
126 * will have been the last reference on the signal_struct.
127 */
128 task_cputime(tsk, &utime, &stime);
129 sig->utime += utime;
130 sig->stime += stime;
131 sig->gtime += task_gtime(tsk);
132 sig->min_flt += tsk->min_flt;
133 sig->maj_flt += tsk->maj_flt;
134 sig->nvcsw += tsk->nvcsw;
135 sig->nivcsw += tsk->nivcsw;
136 sig->inblock += task_io_get_inblock(tsk);
137 sig->oublock += task_io_get_oublock(tsk);
138 task_io_accounting_add(&sig->ioac, &tsk->ioac);
139 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
140 } 118 }
141 119
120 /*
121 * Accumulate here the counters for all threads but the group leader
122 * as they die, so they can be added into the process-wide totals
123 * when those are taken. The group leader stays around as a zombie as
124 * long as there are other threads. When it gets reaped, the exit.c
125 * code will add its counts into these totals. We won't ever get here
126 * for the group leader, since it will have been the last reference on
127 * the signal_struct.
128 */
129 task_cputime(tsk, &utime, &stime);
130 write_seqlock(&sig->stats_lock);
131 sig->utime += utime;
132 sig->stime += stime;
133 sig->gtime += task_gtime(tsk);
134 sig->min_flt += tsk->min_flt;
135 sig->maj_flt += tsk->maj_flt;
136 sig->nvcsw += tsk->nvcsw;
137 sig->nivcsw += tsk->nivcsw;
138 sig->inblock += task_io_get_inblock(tsk);
139 sig->oublock += task_io_get_oublock(tsk);
140 task_io_accounting_add(&sig->ioac, &tsk->ioac);
141 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
142 sig->nr_threads--; 142 sig->nr_threads--;
143 __unhash_process(tsk, group_dead); 143 __unhash_process(tsk, group_dead);
144 write_sequnlock(&sig->stats_lock);
144 145
145 /* 146 /*
146 * Do this under ->siglock, we can race with another thread 147 * Do this under ->siglock, we can race with another thread
@@ -1046,6 +1047,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1046 spin_lock_irq(&p->real_parent->sighand->siglock); 1047 spin_lock_irq(&p->real_parent->sighand->siglock);
1047 psig = p->real_parent->signal; 1048 psig = p->real_parent->signal;
1048 sig = p->signal; 1049 sig = p->signal;
1050 write_seqlock(&psig->stats_lock);
1049 psig->cutime += tgutime + sig->cutime; 1051 psig->cutime += tgutime + sig->cutime;
1050 psig->cstime += tgstime + sig->cstime; 1052 psig->cstime += tgstime + sig->cstime;
1051 psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; 1053 psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
@@ -1068,6 +1070,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1068 psig->cmaxrss = maxrss; 1070 psig->cmaxrss = maxrss;
1069 task_io_accounting_add(&psig->ioac, &p->ioac); 1071 task_io_accounting_add(&psig->ioac, &p->ioac);
1070 task_io_accounting_add(&psig->ioac, &sig->ioac); 1072 task_io_accounting_add(&psig->ioac, &sig->ioac);
1073 write_sequnlock(&psig->stats_lock);
1071 spin_unlock_irq(&p->real_parent->sighand->siglock); 1074 spin_unlock_irq(&p->real_parent->sighand->siglock);
1072 } 1075 }
1073 1076
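The two kernel/exit.c hunks above bracket the signal_struct accounting updates in __exit_signal() and wait_task_zombie() with the new write_seqlock(&...->stats_lock)/write_sequnlock() pair; the matching lockless reader appears further down in the kernel/sched/cputime.c hunk for thread_group_cputime(). As a rough illustration of why the writer brackets the update and why a reader retries, here is a minimal userspace sketch of the seqcount idea using C11 atomics and hypothetical names; it is not the kernel's seqlock_t (which also serializes writers with a spinlock), and the plain field reads are tolerated only because a changed sequence number discards the snapshot:

#include <stdatomic.h>
#include <stdint.h>

struct stats_seq {
	atomic_uint seq;		/* odd while a writer is inside its critical section */
	uint64_t utime, stime;		/* the accumulated totals being protected */
};

/* Writer side: a single writer is assumed here; the kernel's seqlock_t adds a spinlock. */
static void stats_write(struct stats_seq *s, uint64_t du, uint64_t ds)
{
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_acq_rel);	/* begin: seq goes odd */
	s->utime += du;
	s->stime += ds;
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_acq_rel);	/* end: seq goes even */
}

/* Reader side: take a snapshot, then retry if a writer ran in the meantime. */
static void stats_read(struct stats_seq *s, uint64_t *u, uint64_t *st)
{
	unsigned int start;

	for (;;) {
		start = atomic_load_explicit(&s->seq, memory_order_acquire);
		if (start & 1)
			continue;	/* writer in progress, try again */
		*u  = s->utime;
		*st = s->stime;
		if (atomic_load_explicit(&s->seq, memory_order_acquire) == start)
			return;		/* sequence unchanged: the snapshot is consistent */
	}
}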
diff --git a/kernel/fork.c b/kernel/fork.c
index 8c162d102740..9b7d746d6d62 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -294,11 +294,18 @@ int __weak arch_dup_task_struct(struct task_struct *dst,
294 return 0; 294 return 0;
295} 295}
296 296
297void set_task_stack_end_magic(struct task_struct *tsk)
298{
299 unsigned long *stackend;
300
301 stackend = end_of_stack(tsk);
302 *stackend = STACK_END_MAGIC; /* for overflow detection */
303}
304
297static struct task_struct *dup_task_struct(struct task_struct *orig) 305static struct task_struct *dup_task_struct(struct task_struct *orig)
298{ 306{
299 struct task_struct *tsk; 307 struct task_struct *tsk;
300 struct thread_info *ti; 308 struct thread_info *ti;
301 unsigned long *stackend;
302 int node = tsk_fork_get_node(orig); 309 int node = tsk_fork_get_node(orig);
303 int err; 310 int err;
304 311
@@ -328,8 +335,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
328 setup_thread_stack(tsk, orig); 335 setup_thread_stack(tsk, orig);
329 clear_user_return_notifier(tsk); 336 clear_user_return_notifier(tsk);
330 clear_tsk_need_resched(tsk); 337 clear_tsk_need_resched(tsk);
331 stackend = end_of_stack(tsk); 338 set_task_stack_end_magic(tsk);
332 *stackend = STACK_END_MAGIC; /* for overflow detection */
333 339
334#ifdef CONFIG_CC_STACKPROTECTOR 340#ifdef CONFIG_CC_STACKPROTECTOR
335 tsk->stack_canary = get_random_int(); 341 tsk->stack_canary = get_random_int();
@@ -1067,6 +1073,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1067 sig->curr_target = tsk; 1073 sig->curr_target = tsk;
1068 init_sigpending(&sig->shared_pending); 1074 init_sigpending(&sig->shared_pending);
1069 INIT_LIST_HEAD(&sig->posix_timers); 1075 INIT_LIST_HEAD(&sig->posix_timers);
1076 seqlock_init(&sig->stats_lock);
1070 1077
1071 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1078 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1072 sig->real_timer.function = it_real_fn; 1079 sig->real_timer.function = it_real_fn;
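Two things happen in the kernel/fork.c hunks above: copy_signal() initializes the new sig->stats_lock used by the exit.c change, and dup_task_struct() now calls the exported set_task_stack_end_magic() helper, whose canary is later checked by the new CONFIG_SCHED_STACK_END_CHECK test in schedule_debug() (see the kernel/sched/core.c hunk below). A tiny userspace sketch of the canary idea follows, with hypothetical stand-in types; the real end_of_stack() depends on stack growth direction and where thread_info lives:

#define DEMO_STACK_WORDS	(8192 / sizeof(unsigned long))
#define DEMO_STACK_END_MAGIC	0x57AC6E9DUL	/* any unlikely constant works for the sketch */

struct fake_task {
	unsigned long stack[DEMO_STACK_WORDS];	/* overflows run toward stack[0] in this model */
};

static unsigned long *demo_end_of_stack(struct fake_task *t)
{
	return &t->stack[0];			/* the word an overflow clobbers first */
}

static void demo_set_stack_end_magic(struct fake_task *t)
{
	*demo_end_of_stack(t) = DEMO_STACK_END_MAGIC;	/* written once when the task is set up */
}

static int demo_stack_end_corrupted(struct fake_task *t)
{
	/* checked on every pass through schedule_debug() when the option is enabled */
	return *demo_end_of_stack(t) != DEMO_STACK_END_MAGIC;
}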
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index e73efba98301..8a2e230fb86a 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -148,11 +148,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
148 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) 148 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
149 goto out; 149 goto out;
150 150
151 t = p; 151 for_each_thread(p, t)
152 do {
153 sched_move_task(t); 152 sched_move_task(t);
154 } while_each_thread(p, t);
155
156out: 153out:
157 unlock_task_sighand(p, &flags); 154 unlock_task_sighand(p, &flags);
158 autogroup_kref_put(prev); 155 autogroup_kref_put(prev);
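This autogroup hunk is one instance of the thread-group iteration cleanup called out in the pull message: the open-coded "t = p; do { ... } while_each_thread(p, t);" loop becomes "for_each_thread(p, t)". Purely to show the shape of such a for-style iterator over a circular sibling list, here is a self-contained userspace toy; the names are made up, and the kernel's for_each_thread() actually walks the thread list hanging off signal_struct under RCU or siglock:

#include <stdio.h>

struct toy_thread {
	int tid;
	struct toy_thread *next;	/* circular list linking a thread group */
};

/* for-style iterator over every thread in the group that 'leader' heads */
#define for_each_toy_thread(leader, t) \
	for ((t) = (leader); (t); (t) = ((t)->next == (leader) ? NULL : (t)->next))

int main(void)
{
	struct toy_thread c = { .tid = 3 }, b = { .tid = 2, .next = &c },
			  a = { .tid = 1, .next = &b };
	struct toy_thread *t;

	c.next = &a;			/* close the circle: a -> b -> c -> a */

	for_each_toy_thread(&a, t)	/* replaces: t = &a; do { ... } while_each_thread(&a, t); */
		printf("tid %d\n", t->tid);

	return 0;
}

The same conversion shows up below in show_state_filter(), normalize_rt_tasks(), tg_has_rt_tasks(), print_rq() and thread_group_cputime().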
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f235c41a3532..44999505e1bf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -317,9 +317,12 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
317 for (;;) { 317 for (;;) {
318 rq = task_rq(p); 318 rq = task_rq(p);
319 raw_spin_lock(&rq->lock); 319 raw_spin_lock(&rq->lock);
320 if (likely(rq == task_rq(p))) 320 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
321 return rq; 321 return rq;
322 raw_spin_unlock(&rq->lock); 322 raw_spin_unlock(&rq->lock);
323
324 while (unlikely(task_on_rq_migrating(p)))
325 cpu_relax();
323 } 326 }
324} 327}
325 328
@@ -336,10 +339,13 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
336 raw_spin_lock_irqsave(&p->pi_lock, *flags); 339 raw_spin_lock_irqsave(&p->pi_lock, *flags);
337 rq = task_rq(p); 340 rq = task_rq(p);
338 raw_spin_lock(&rq->lock); 341 raw_spin_lock(&rq->lock);
339 if (likely(rq == task_rq(p))) 342 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
340 return rq; 343 return rq;
341 raw_spin_unlock(&rq->lock); 344 raw_spin_unlock(&rq->lock);
342 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 345 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
346
347 while (unlikely(task_on_rq_migrating(p)))
348 cpu_relax();
343 } 349 }
344} 350}
345 351
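__task_rq_lock() and task_rq_lock() above are the entry points of Kirill Tkhai's double-rq-lock removal: instead of holding both runqueue locks across a migration, a task in flight is marked as migrating and lockers simply spin in cpu_relax() until it lands on the new runqueue (see move_queued_task() further down). The task_on_rq_queued()/task_on_rq_migrating() helpers and the TASK_ON_RQ_* values live in kernel/sched/sched.h, whose hunk is not part of this excerpt; judging from their use throughout this diff they amount to a small state machine on p->on_rq, roughly like the sketch below (stand-in struct, not the verbatim sched.h change):

/* Sketch only: 'struct task_like' stands in for the kernel's task_struct. */
struct task_like {
	int on_rq;
};

#define TASK_ON_RQ_QUEUED	1	/* enqueued on some runqueue */
#define TASK_ON_RQ_MIGRATING	2	/* dequeued from the old rq, not yet enqueued on the new one */

static inline int task_on_rq_queued(const struct task_like *p)
{
	return p->on_rq == TASK_ON_RQ_QUEUED;	/* 0 still means "not on a runqueue at all" */
}

static inline int task_on_rq_migrating(const struct task_like *p)
{
	return p->on_rq == TASK_ON_RQ_MIGRATING;
}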
@@ -433,7 +439,15 @@ static void __hrtick_start(void *arg)
433void hrtick_start(struct rq *rq, u64 delay) 439void hrtick_start(struct rq *rq, u64 delay)
434{ 440{
435 struct hrtimer *timer = &rq->hrtick_timer; 441 struct hrtimer *timer = &rq->hrtick_timer;
436 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 442 ktime_t time;
443 s64 delta;
444
445 /*
446 * Don't schedule slices shorter than 10000ns, that just
447 * doesn't make sense and can cause timer DoS.
448 */
449 delta = max_t(s64, delay, 10000LL);
450 time = ktime_add_ns(timer->base->get_time(), delta);
437 451
438 hrtimer_set_expires(timer, time); 452 hrtimer_set_expires(timer, time);
439 453
@@ -1027,7 +1041,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1027 * A queue event has occurred, and we're going to schedule. In 1041 * A queue event has occurred, and we're going to schedule. In
1028 * this case, we can save a useless back to back clock update. 1042 * this case, we can save a useless back to back clock update.
1029 */ 1043 */
1030 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) 1044 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
1031 rq->skip_clock_update = 1; 1045 rq->skip_clock_update = 1;
1032} 1046}
1033 1047
@@ -1072,7 +1086,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1072 1086
1073static void __migrate_swap_task(struct task_struct *p, int cpu) 1087static void __migrate_swap_task(struct task_struct *p, int cpu)
1074{ 1088{
1075 if (p->on_rq) { 1089 if (task_on_rq_queued(p)) {
1076 struct rq *src_rq, *dst_rq; 1090 struct rq *src_rq, *dst_rq;
1077 1091
1078 src_rq = task_rq(p); 1092 src_rq = task_rq(p);
@@ -1198,7 +1212,7 @@ static int migration_cpu_stop(void *data);
1198unsigned long wait_task_inactive(struct task_struct *p, long match_state) 1212unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1199{ 1213{
1200 unsigned long flags; 1214 unsigned long flags;
1201 int running, on_rq; 1215 int running, queued;
1202 unsigned long ncsw; 1216 unsigned long ncsw;
1203 struct rq *rq; 1217 struct rq *rq;
1204 1218
@@ -1236,7 +1250,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1236 rq = task_rq_lock(p, &flags); 1250 rq = task_rq_lock(p, &flags);
1237 trace_sched_wait_task(p); 1251 trace_sched_wait_task(p);
1238 running = task_running(rq, p); 1252 running = task_running(rq, p);
1239 on_rq = p->on_rq; 1253 queued = task_on_rq_queued(p);
1240 ncsw = 0; 1254 ncsw = 0;
1241 if (!match_state || p->state == match_state) 1255 if (!match_state || p->state == match_state)
1242 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 1256 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
@@ -1268,7 +1282,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1268 * running right now), it's preempted, and we should 1282 * running right now), it's preempted, and we should
1269 * yield - it could be a while. 1283 * yield - it could be a while.
1270 */ 1284 */
1271 if (unlikely(on_rq)) { 1285 if (unlikely(queued)) {
1272 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); 1286 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1273 1287
1274 set_current_state(TASK_UNINTERRUPTIBLE); 1288 set_current_state(TASK_UNINTERRUPTIBLE);
@@ -1462,7 +1476,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1462static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 1476static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1463{ 1477{
1464 activate_task(rq, p, en_flags); 1478 activate_task(rq, p, en_flags);
1465 p->on_rq = 1; 1479 p->on_rq = TASK_ON_RQ_QUEUED;
1466 1480
1467 /* if a worker is waking up, notify workqueue */ 1481 /* if a worker is waking up, notify workqueue */
1468 if (p->flags & PF_WQ_WORKER) 1482 if (p->flags & PF_WQ_WORKER)
@@ -1521,7 +1535,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
1521 int ret = 0; 1535 int ret = 0;
1522 1536
1523 rq = __task_rq_lock(p); 1537 rq = __task_rq_lock(p);
1524 if (p->on_rq) { 1538 if (task_on_rq_queued(p)) {
1525 /* check_preempt_curr() may use rq clock */ 1539 /* check_preempt_curr() may use rq clock */
1526 update_rq_clock(rq); 1540 update_rq_clock(rq);
1527 ttwu_do_wakeup(rq, p, wake_flags); 1541 ttwu_do_wakeup(rq, p, wake_flags);
@@ -1604,6 +1618,25 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu)
1604 } 1618 }
1605} 1619}
1606 1620
1621void wake_up_if_idle(int cpu)
1622{
1623 struct rq *rq = cpu_rq(cpu);
1624 unsigned long flags;
1625
1626 if (!is_idle_task(rq->curr))
1627 return;
1628
1629 if (set_nr_if_polling(rq->idle)) {
1630 trace_sched_wake_idle_without_ipi(cpu);
1631 } else {
1632 raw_spin_lock_irqsave(&rq->lock, flags);
1633 if (is_idle_task(rq->curr))
1634 smp_send_reschedule(cpu);
1635 /* Else cpu is not in idle, do nothing here */
1636 raw_spin_unlock_irqrestore(&rq->lock, flags);
1637 }
1638}
1639
1607bool cpus_share_cache(int this_cpu, int that_cpu) 1640bool cpus_share_cache(int this_cpu, int that_cpu)
1608{ 1641{
1609 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 1642 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
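The new wake_up_if_idle() above only sends a reschedule IPI as a fallback: if set_nr_if_polling() can flag the remote idle task while its idle loop is still polling, the wakeup is traced as sched_wake_idle_without_ipi and no interrupt is needed. A hedged userspace sketch of that check-and-set, with made-up flag names and a C11 atomic standing in for the kernel's thread_info flag operations:

#include <stdatomic.h>
#include <stdbool.h>

#define DEMO_POLLING		(1u << 0)	/* remote CPU's idle loop is polling this word */
#define DEMO_NEED_RESCHED	(1u << 1)

/*
 * Returns true if the resched flag could be set while the remote CPU was
 * still polling, in which case no IPI is needed; false means the caller
 * must fall back to sending a reschedule interrupt.
 */
static bool demo_set_nr_if_polling(_Atomic unsigned int *flags)
{
	unsigned int old = atomic_load(flags);

	for (;;) {
		if (!(old & DEMO_POLLING))
			return false;
		if (atomic_compare_exchange_weak(flags, &old,
						 old | DEMO_NEED_RESCHED))
			return true;
	}
}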
@@ -1726,7 +1759,7 @@ static void try_to_wake_up_local(struct task_struct *p)
1726 if (!(p->state & TASK_NORMAL)) 1759 if (!(p->state & TASK_NORMAL))
1727 goto out; 1760 goto out;
1728 1761
1729 if (!p->on_rq) 1762 if (!task_on_rq_queued(p))
1730 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 1763 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1731 1764
1732 ttwu_do_wakeup(rq, p, 0); 1765 ttwu_do_wakeup(rq, p, 0);
@@ -1760,6 +1793,20 @@ int wake_up_state(struct task_struct *p, unsigned int state)
1760} 1793}
1761 1794
1762/* 1795/*
1796 * This function clears the sched_dl_entity static params.
1797 */
1798void __dl_clear_params(struct task_struct *p)
1799{
1800 struct sched_dl_entity *dl_se = &p->dl;
1801
1802 dl_se->dl_runtime = 0;
1803 dl_se->dl_deadline = 0;
1804 dl_se->dl_period = 0;
1805 dl_se->flags = 0;
1806 dl_se->dl_bw = 0;
1807}
1808
1809/*
1763 * Perform scheduler related setup for a newly forked process p. 1810 * Perform scheduler related setup for a newly forked process p.
1764 * p is forked by current. 1811 * p is forked by current.
1765 * 1812 *
@@ -1783,10 +1830,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1783 1830
1784 RB_CLEAR_NODE(&p->dl.rb_node); 1831 RB_CLEAR_NODE(&p->dl.rb_node);
1785 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1832 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1786 p->dl.dl_runtime = p->dl.runtime = 0; 1833 __dl_clear_params(p);
1787 p->dl.dl_deadline = p->dl.deadline = 0;
1788 p->dl.dl_period = 0;
1789 p->dl.flags = 0;
1790 1834
1791 INIT_LIST_HEAD(&p->rt.run_list); 1835 INIT_LIST_HEAD(&p->rt.run_list);
1792 1836
@@ -1961,6 +2005,8 @@ unsigned long to_ratio(u64 period, u64 runtime)
1961#ifdef CONFIG_SMP 2005#ifdef CONFIG_SMP
1962inline struct dl_bw *dl_bw_of(int i) 2006inline struct dl_bw *dl_bw_of(int i)
1963{ 2007{
2008 rcu_lockdep_assert(rcu_read_lock_sched_held(),
2009 "sched RCU must be held");
1964 return &cpu_rq(i)->rd->dl_bw; 2010 return &cpu_rq(i)->rd->dl_bw;
1965} 2011}
1966 2012
@@ -1969,6 +2015,8 @@ static inline int dl_bw_cpus(int i)
1969 struct root_domain *rd = cpu_rq(i)->rd; 2015 struct root_domain *rd = cpu_rq(i)->rd;
1970 int cpus = 0; 2016 int cpus = 0;
1971 2017
2018 rcu_lockdep_assert(rcu_read_lock_sched_held(),
2019 "sched RCU must be held");
1972 for_each_cpu_and(i, rd->span, cpu_active_mask) 2020 for_each_cpu_and(i, rd->span, cpu_active_mask)
1973 cpus++; 2021 cpus++;
1974 2022
@@ -2079,7 +2127,7 @@ void wake_up_new_task(struct task_struct *p)
2079 init_task_runnable_average(p); 2127 init_task_runnable_average(p);
2080 rq = __task_rq_lock(p); 2128 rq = __task_rq_lock(p);
2081 activate_task(rq, p, 0); 2129 activate_task(rq, p, 0);
2082 p->on_rq = 1; 2130 p->on_rq = TASK_ON_RQ_QUEUED;
2083 trace_sched_wakeup_new(p, true); 2131 trace_sched_wakeup_new(p, true);
2084 check_preempt_curr(rq, p, WF_FORK); 2132 check_preempt_curr(rq, p, WF_FORK);
2085#ifdef CONFIG_SMP 2133#ifdef CONFIG_SMP
@@ -2271,10 +2319,6 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
2271 */ 2319 */
2272 post_schedule(rq); 2320 post_schedule(rq);
2273 2321
2274#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2275 /* In this case, finish_task_switch does not reenable preemption */
2276 preempt_enable();
2277#endif
2278 if (current->set_child_tid) 2322 if (current->set_child_tid)
2279 put_user(task_pid_vnr(current), current->set_child_tid); 2323 put_user(task_pid_vnr(current), current->set_child_tid);
2280} 2324}
@@ -2317,9 +2361,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2317 * of the scheduler it's an obvious special-case), so we 2361 * of the scheduler it's an obvious special-case), so we
2318 * do an early lockdep release here: 2362 * do an early lockdep release here:
2319 */ 2363 */
2320#ifndef __ARCH_WANT_UNLOCKED_CTXSW
2321 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 2364 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2322#endif
2323 2365
2324 context_tracking_task_switch(prev, next); 2366 context_tracking_task_switch(prev, next);
2325 /* Here we just switch the register state and the stack. */ 2367 /* Here we just switch the register state and the stack. */
@@ -2447,7 +2489,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2447 * project cycles that may never be accounted to this 2489 * project cycles that may never be accounted to this
2448 * thread, breaking clock_gettime(). 2490 * thread, breaking clock_gettime().
2449 */ 2491 */
2450 if (task_current(rq, p) && p->on_rq) { 2492 if (task_current(rq, p) && task_on_rq_queued(p)) {
2451 update_rq_clock(rq); 2493 update_rq_clock(rq);
2452 ns = rq_clock_task(rq) - p->se.exec_start; 2494 ns = rq_clock_task(rq) - p->se.exec_start;
2453 if ((s64)ns < 0) 2495 if ((s64)ns < 0)
@@ -2493,7 +2535,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
2493 * If we see ->on_cpu without ->on_rq, the task is leaving, and has 2535 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
2494 * been accounted, so we're correct here as well. 2536 * been accounted, so we're correct here as well.
2495 */ 2537 */
2496 if (!p->on_cpu || !p->on_rq) 2538 if (!p->on_cpu || !task_on_rq_queued(p))
2497 return p->se.sum_exec_runtime; 2539 return p->se.sum_exec_runtime;
2498#endif 2540#endif
2499 2541
@@ -2656,6 +2698,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
2656 */ 2698 */
2657static inline void schedule_debug(struct task_struct *prev) 2699static inline void schedule_debug(struct task_struct *prev)
2658{ 2700{
2701#ifdef CONFIG_SCHED_STACK_END_CHECK
2702 BUG_ON(unlikely(task_stack_end_corrupted(prev)));
2703#endif
2659 /* 2704 /*
2660 * Test if we are atomic. Since do_exit() needs to call into 2705 * Test if we are atomic. Since do_exit() needs to call into
2661 * schedule() atomically, we ignore that path. Otherwise whine 2706 * schedule() atomically, we ignore that path. Otherwise whine
@@ -2797,7 +2842,7 @@ need_resched:
2797 switch_count = &prev->nvcsw; 2842 switch_count = &prev->nvcsw;
2798 } 2843 }
2799 2844
2800 if (prev->on_rq || rq->skip_clock_update < 0) 2845 if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
2801 update_rq_clock(rq); 2846 update_rq_clock(rq);
2802 2847
2803 next = pick_next_task(rq, prev); 2848 next = pick_next_task(rq, prev);
@@ -2962,7 +3007,7 @@ EXPORT_SYMBOL(default_wake_function);
2962 */ 3007 */
2963void rt_mutex_setprio(struct task_struct *p, int prio) 3008void rt_mutex_setprio(struct task_struct *p, int prio)
2964{ 3009{
2965 int oldprio, on_rq, running, enqueue_flag = 0; 3010 int oldprio, queued, running, enqueue_flag = 0;
2966 struct rq *rq; 3011 struct rq *rq;
2967 const struct sched_class *prev_class; 3012 const struct sched_class *prev_class;
2968 3013
@@ -2991,12 +3036,12 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2991 trace_sched_pi_setprio(p, prio); 3036 trace_sched_pi_setprio(p, prio);
2992 oldprio = p->prio; 3037 oldprio = p->prio;
2993 prev_class = p->sched_class; 3038 prev_class = p->sched_class;
2994 on_rq = p->on_rq; 3039 queued = task_on_rq_queued(p);
2995 running = task_current(rq, p); 3040 running = task_current(rq, p);
2996 if (on_rq) 3041 if (queued)
2997 dequeue_task(rq, p, 0); 3042 dequeue_task(rq, p, 0);
2998 if (running) 3043 if (running)
2999 p->sched_class->put_prev_task(rq, p); 3044 put_prev_task(rq, p);
3000 3045
3001 /* 3046 /*
3002 * Boosting condition are: 3047 * Boosting condition are:
@@ -3033,7 +3078,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3033 3078
3034 if (running) 3079 if (running)
3035 p->sched_class->set_curr_task(rq); 3080 p->sched_class->set_curr_task(rq);
3036 if (on_rq) 3081 if (queued)
3037 enqueue_task(rq, p, enqueue_flag); 3082 enqueue_task(rq, p, enqueue_flag);
3038 3083
3039 check_class_changed(rq, p, prev_class, oldprio); 3084 check_class_changed(rq, p, prev_class, oldprio);
@@ -3044,7 +3089,7 @@ out_unlock:
3044 3089
3045void set_user_nice(struct task_struct *p, long nice) 3090void set_user_nice(struct task_struct *p, long nice)
3046{ 3091{
3047 int old_prio, delta, on_rq; 3092 int old_prio, delta, queued;
3048 unsigned long flags; 3093 unsigned long flags;
3049 struct rq *rq; 3094 struct rq *rq;
3050 3095
@@ -3065,8 +3110,8 @@ void set_user_nice(struct task_struct *p, long nice)
3065 p->static_prio = NICE_TO_PRIO(nice); 3110 p->static_prio = NICE_TO_PRIO(nice);
3066 goto out_unlock; 3111 goto out_unlock;
3067 } 3112 }
3068 on_rq = p->on_rq; 3113 queued = task_on_rq_queued(p);
3069 if (on_rq) 3114 if (queued)
3070 dequeue_task(rq, p, 0); 3115 dequeue_task(rq, p, 0);
3071 3116
3072 p->static_prio = NICE_TO_PRIO(nice); 3117 p->static_prio = NICE_TO_PRIO(nice);
@@ -3075,7 +3120,7 @@ void set_user_nice(struct task_struct *p, long nice)
3075 p->prio = effective_prio(p); 3120 p->prio = effective_prio(p);
3076 delta = p->prio - old_prio; 3121 delta = p->prio - old_prio;
3077 3122
3078 if (on_rq) { 3123 if (queued) {
3079 enqueue_task(rq, p, 0); 3124 enqueue_task(rq, p, 0);
3080 /* 3125 /*
3081 * If the task increased its priority or is running and 3126 * If the task increased its priority or is running and
@@ -3347,7 +3392,7 @@ static int __sched_setscheduler(struct task_struct *p,
3347{ 3392{
3348 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : 3393 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
3349 MAX_RT_PRIO - 1 - attr->sched_priority; 3394 MAX_RT_PRIO - 1 - attr->sched_priority;
3350 int retval, oldprio, oldpolicy = -1, on_rq, running; 3395 int retval, oldprio, oldpolicy = -1, queued, running;
3351 int policy = attr->sched_policy; 3396 int policy = attr->sched_policy;
3352 unsigned long flags; 3397 unsigned long flags;
3353 const struct sched_class *prev_class; 3398 const struct sched_class *prev_class;
@@ -3544,19 +3589,19 @@ change:
3544 return 0; 3589 return 0;
3545 } 3590 }
3546 3591
3547 on_rq = p->on_rq; 3592 queued = task_on_rq_queued(p);
3548 running = task_current(rq, p); 3593 running = task_current(rq, p);
3549 if (on_rq) 3594 if (queued)
3550 dequeue_task(rq, p, 0); 3595 dequeue_task(rq, p, 0);
3551 if (running) 3596 if (running)
3552 p->sched_class->put_prev_task(rq, p); 3597 put_prev_task(rq, p);
3553 3598
3554 prev_class = p->sched_class; 3599 prev_class = p->sched_class;
3555 __setscheduler(rq, p, attr); 3600 __setscheduler(rq, p, attr);
3556 3601
3557 if (running) 3602 if (running)
3558 p->sched_class->set_curr_task(rq); 3603 p->sched_class->set_curr_task(rq);
3559 if (on_rq) { 3604 if (queued) {
3560 /* 3605 /*
3561 * We enqueue to tail when the priority of a task is 3606 * We enqueue to tail when the priority of a task is
3562 * increased (user space view). 3607 * increased (user space view).
@@ -3980,14 +4025,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3980 rcu_read_lock(); 4025 rcu_read_lock();
3981 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { 4026 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
3982 rcu_read_unlock(); 4027 rcu_read_unlock();
3983 goto out_unlock; 4028 goto out_free_new_mask;
3984 } 4029 }
3985 rcu_read_unlock(); 4030 rcu_read_unlock();
3986 } 4031 }
3987 4032
3988 retval = security_task_setscheduler(p); 4033 retval = security_task_setscheduler(p);
3989 if (retval) 4034 if (retval)
3990 goto out_unlock; 4035 goto out_free_new_mask;
3991 4036
3992 4037
3993 cpuset_cpus_allowed(p, cpus_allowed); 4038 cpuset_cpus_allowed(p, cpus_allowed);
@@ -4000,13 +4045,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4000 * root_domain. 4045 * root_domain.
4001 */ 4046 */
4002#ifdef CONFIG_SMP 4047#ifdef CONFIG_SMP
4003 if (task_has_dl_policy(p)) { 4048 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
4004 const struct cpumask *span = task_rq(p)->rd->span; 4049 rcu_read_lock();
4005 4050 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
4006 if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
4007 retval = -EBUSY; 4051 retval = -EBUSY;
4008 goto out_unlock; 4052 rcu_read_unlock();
4053 goto out_free_new_mask;
4009 } 4054 }
4055 rcu_read_unlock();
4010 } 4056 }
4011#endif 4057#endif
4012again: 4058again:
@@ -4024,7 +4070,7 @@ again:
4024 goto again; 4070 goto again;
4025 } 4071 }
4026 } 4072 }
4027out_unlock: 4073out_free_new_mask:
4028 free_cpumask_var(new_mask); 4074 free_cpumask_var(new_mask);
4029out_free_cpus_allowed: 4075out_free_cpus_allowed:
4030 free_cpumask_var(cpus_allowed); 4076 free_cpumask_var(cpus_allowed);
@@ -4508,7 +4554,7 @@ void show_state_filter(unsigned long state_filter)
4508 " task PC stack pid father\n"); 4554 " task PC stack pid father\n");
4509#endif 4555#endif
4510 rcu_read_lock(); 4556 rcu_read_lock();
4511 do_each_thread(g, p) { 4557 for_each_process_thread(g, p) {
4512 /* 4558 /*
4513 * reset the NMI-timeout, listing all files on a slow 4559 * reset the NMI-timeout, listing all files on a slow
4514 * console might take a lot of time: 4560 * console might take a lot of time:
@@ -4516,7 +4562,7 @@ void show_state_filter(unsigned long state_filter)
4516 touch_nmi_watchdog(); 4562 touch_nmi_watchdog();
4517 if (!state_filter || (p->state & state_filter)) 4563 if (!state_filter || (p->state & state_filter))
4518 sched_show_task(p); 4564 sched_show_task(p);
4519 } while_each_thread(g, p); 4565 }
4520 4566
4521 touch_all_softlockup_watchdogs(); 4567 touch_all_softlockup_watchdogs();
4522 4568
@@ -4571,7 +4617,7 @@ void init_idle(struct task_struct *idle, int cpu)
4571 rcu_read_unlock(); 4617 rcu_read_unlock();
4572 4618
4573 rq->curr = rq->idle = idle; 4619 rq->curr = rq->idle = idle;
4574 idle->on_rq = 1; 4620 idle->on_rq = TASK_ON_RQ_QUEUED;
4575#if defined(CONFIG_SMP) 4621#if defined(CONFIG_SMP)
4576 idle->on_cpu = 1; 4622 idle->on_cpu = 1;
4577#endif 4623#endif
@@ -4592,6 +4638,33 @@ void init_idle(struct task_struct *idle, int cpu)
4592} 4638}
4593 4639
4594#ifdef CONFIG_SMP 4640#ifdef CONFIG_SMP
4641/*
4642 * move_queued_task - move a queued task to new rq.
4643 *
4644 * Returns (locked) new rq. Old rq's lock is released.
4645 */
4646static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
4647{
4648 struct rq *rq = task_rq(p);
4649
4650 lockdep_assert_held(&rq->lock);
4651
4652 dequeue_task(rq, p, 0);
4653 p->on_rq = TASK_ON_RQ_MIGRATING;
4654 set_task_cpu(p, new_cpu);
4655 raw_spin_unlock(&rq->lock);
4656
4657 rq = cpu_rq(new_cpu);
4658
4659 raw_spin_lock(&rq->lock);
4660 BUG_ON(task_cpu(p) != new_cpu);
4661 p->on_rq = TASK_ON_RQ_QUEUED;
4662 enqueue_task(rq, p, 0);
4663 check_preempt_curr(rq, p, 0);
4664
4665 return rq;
4666}
4667
4595void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4668void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4596{ 4669{
4597 if (p->sched_class && p->sched_class->set_cpus_allowed) 4670 if (p->sched_class && p->sched_class->set_cpus_allowed)
@@ -4648,14 +4721,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
4648 goto out; 4721 goto out;
4649 4722
4650 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 4723 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
4651 if (p->on_rq) { 4724 if (task_running(rq, p) || p->state == TASK_WAKING) {
4652 struct migration_arg arg = { p, dest_cpu }; 4725 struct migration_arg arg = { p, dest_cpu };
4653 /* Need help from migration thread: drop lock and wait. */ 4726 /* Need help from migration thread: drop lock and wait. */
4654 task_rq_unlock(rq, p, &flags); 4727 task_rq_unlock(rq, p, &flags);
4655 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 4728 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
4656 tlb_migrate_finish(p->mm); 4729 tlb_migrate_finish(p->mm);
4657 return 0; 4730 return 0;
4658 } 4731 } else if (task_on_rq_queued(p))
4732 rq = move_queued_task(p, dest_cpu);
4659out: 4733out:
4660 task_rq_unlock(rq, p, &flags); 4734 task_rq_unlock(rq, p, &flags);
4661 4735
@@ -4676,20 +4750,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
4676 */ 4750 */
4677static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4751static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4678{ 4752{
4679 struct rq *rq_dest, *rq_src; 4753 struct rq *rq;
4680 int ret = 0; 4754 int ret = 0;
4681 4755
4682 if (unlikely(!cpu_active(dest_cpu))) 4756 if (unlikely(!cpu_active(dest_cpu)))
4683 return ret; 4757 return ret;
4684 4758
4685 rq_src = cpu_rq(src_cpu); 4759 rq = cpu_rq(src_cpu);
4686 rq_dest = cpu_rq(dest_cpu);
4687 4760
4688 raw_spin_lock(&p->pi_lock); 4761 raw_spin_lock(&p->pi_lock);
4689 double_rq_lock(rq_src, rq_dest); 4762 raw_spin_lock(&rq->lock);
4690 /* Already moved. */ 4763 /* Already moved. */
4691 if (task_cpu(p) != src_cpu) 4764 if (task_cpu(p) != src_cpu)
4692 goto done; 4765 goto done;
4766
4693 /* Affinity changed (again). */ 4767 /* Affinity changed (again). */
4694 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 4768 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
4695 goto fail; 4769 goto fail;
@@ -4698,16 +4772,12 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4698 * If we're not on a rq, the next wake-up will ensure we're 4772 * If we're not on a rq, the next wake-up will ensure we're
4699 * placed properly. 4773 * placed properly.
4700 */ 4774 */
4701 if (p->on_rq) { 4775 if (task_on_rq_queued(p))
4702 dequeue_task(rq_src, p, 0); 4776 rq = move_queued_task(p, dest_cpu);
4703 set_task_cpu(p, dest_cpu);
4704 enqueue_task(rq_dest, p, 0);
4705 check_preempt_curr(rq_dest, p, 0);
4706 }
4707done: 4777done:
4708 ret = 1; 4778 ret = 1;
4709fail: 4779fail:
4710 double_rq_unlock(rq_src, rq_dest); 4780 raw_spin_unlock(&rq->lock);
4711 raw_spin_unlock(&p->pi_lock); 4781 raw_spin_unlock(&p->pi_lock);
4712 return ret; 4782 return ret;
4713} 4783}
@@ -4739,22 +4809,22 @@ void sched_setnuma(struct task_struct *p, int nid)
4739{ 4809{
4740 struct rq *rq; 4810 struct rq *rq;
4741 unsigned long flags; 4811 unsigned long flags;
4742 bool on_rq, running; 4812 bool queued, running;
4743 4813
4744 rq = task_rq_lock(p, &flags); 4814 rq = task_rq_lock(p, &flags);
4745 on_rq = p->on_rq; 4815 queued = task_on_rq_queued(p);
4746 running = task_current(rq, p); 4816 running = task_current(rq, p);
4747 4817
4748 if (on_rq) 4818 if (queued)
4749 dequeue_task(rq, p, 0); 4819 dequeue_task(rq, p, 0);
4750 if (running) 4820 if (running)
4751 p->sched_class->put_prev_task(rq, p); 4821 put_prev_task(rq, p);
4752 4822
4753 p->numa_preferred_nid = nid; 4823 p->numa_preferred_nid = nid;
4754 4824
4755 if (running) 4825 if (running)
4756 p->sched_class->set_curr_task(rq); 4826 p->sched_class->set_curr_task(rq);
4757 if (on_rq) 4827 if (queued)
4758 enqueue_task(rq, p, 0); 4828 enqueue_task(rq, p, 0);
4759 task_rq_unlock(rq, p, &flags); 4829 task_rq_unlock(rq, p, &flags);
4760} 4830}
@@ -4774,6 +4844,12 @@ static int migration_cpu_stop(void *data)
4774 * be on another cpu but it doesn't matter. 4844 * be on another cpu but it doesn't matter.
4775 */ 4845 */
4776 local_irq_disable(); 4846 local_irq_disable();
4847 /*
4848 * We need to explicitly wake pending tasks before running
4849 * __migrate_task() such that we will not miss enforcing cpus_allowed
4850 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
4851 */
4852 sched_ttwu_pending();
4777 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); 4853 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
4778 local_irq_enable(); 4854 local_irq_enable();
4779 return 0; 4855 return 0;
@@ -5184,6 +5260,7 @@ static int sched_cpu_inactive(struct notifier_block *nfb,
5184{ 5260{
5185 unsigned long flags; 5261 unsigned long flags;
5186 long cpu = (long)hcpu; 5262 long cpu = (long)hcpu;
5263 struct dl_bw *dl_b;
5187 5264
5188 switch (action & ~CPU_TASKS_FROZEN) { 5265 switch (action & ~CPU_TASKS_FROZEN) {
5189 case CPU_DOWN_PREPARE: 5266 case CPU_DOWN_PREPARE:
@@ -5191,15 +5268,19 @@ static int sched_cpu_inactive(struct notifier_block *nfb,
5191 5268
5192 /* explicitly allow suspend */ 5269 /* explicitly allow suspend */
5193 if (!(action & CPU_TASKS_FROZEN)) { 5270 if (!(action & CPU_TASKS_FROZEN)) {
5194 struct dl_bw *dl_b = dl_bw_of(cpu);
5195 bool overflow; 5271 bool overflow;
5196 int cpus; 5272 int cpus;
5197 5273
5274 rcu_read_lock_sched();
5275 dl_b = dl_bw_of(cpu);
5276
5198 raw_spin_lock_irqsave(&dl_b->lock, flags); 5277 raw_spin_lock_irqsave(&dl_b->lock, flags);
5199 cpus = dl_bw_cpus(cpu); 5278 cpus = dl_bw_cpus(cpu);
5200 overflow = __dl_overflow(dl_b, cpus, 0, 0); 5279 overflow = __dl_overflow(dl_b, cpus, 0, 0);
5201 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 5280 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5202 5281
5282 rcu_read_unlock_sched();
5283
5203 if (overflow) 5284 if (overflow)
5204 return notifier_from_errno(-EBUSY); 5285 return notifier_from_errno(-EBUSY);
5205 } 5286 }
@@ -5742,7 +5823,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5742 const struct cpumask *span = sched_domain_span(sd); 5823 const struct cpumask *span = sched_domain_span(sd);
5743 struct cpumask *covered = sched_domains_tmpmask; 5824 struct cpumask *covered = sched_domains_tmpmask;
5744 struct sd_data *sdd = sd->private; 5825 struct sd_data *sdd = sd->private;
5745 struct sched_domain *child; 5826 struct sched_domain *sibling;
5746 int i; 5827 int i;
5747 5828
5748 cpumask_clear(covered); 5829 cpumask_clear(covered);
@@ -5753,10 +5834,10 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5753 if (cpumask_test_cpu(i, covered)) 5834 if (cpumask_test_cpu(i, covered))
5754 continue; 5835 continue;
5755 5836
5756 child = *per_cpu_ptr(sdd->sd, i); 5837 sibling = *per_cpu_ptr(sdd->sd, i);
5757 5838
5758 /* See the comment near build_group_mask(). */ 5839 /* See the comment near build_group_mask(). */
5759 if (!cpumask_test_cpu(i, sched_domain_span(child))) 5840 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
5760 continue; 5841 continue;
5761 5842
5762 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5843 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
@@ -5766,10 +5847,9 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5766 goto fail; 5847 goto fail;
5767 5848
5768 sg_span = sched_group_cpus(sg); 5849 sg_span = sched_group_cpus(sg);
5769 if (child->child) { 5850 if (sibling->child)
5770 child = child->child; 5851 cpumask_copy(sg_span, sched_domain_span(sibling->child));
5771 cpumask_copy(sg_span, sched_domain_span(child)); 5852 else
5772 } else
5773 cpumask_set_cpu(i, sg_span); 5853 cpumask_set_cpu(i, sg_span);
5774 5854
5775 cpumask_or(covered, covered, sg_span); 5855 cpumask_or(covered, covered, sg_span);
@@ -7120,13 +7200,13 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
7120 .sched_policy = SCHED_NORMAL, 7200 .sched_policy = SCHED_NORMAL,
7121 }; 7201 };
7122 int old_prio = p->prio; 7202 int old_prio = p->prio;
7123 int on_rq; 7203 int queued;
7124 7204
7125 on_rq = p->on_rq; 7205 queued = task_on_rq_queued(p);
7126 if (on_rq) 7206 if (queued)
7127 dequeue_task(rq, p, 0); 7207 dequeue_task(rq, p, 0);
7128 __setscheduler(rq, p, &attr); 7208 __setscheduler(rq, p, &attr);
7129 if (on_rq) { 7209 if (queued) {
7130 enqueue_task(rq, p, 0); 7210 enqueue_task(rq, p, 0);
7131 resched_curr(rq); 7211 resched_curr(rq);
7132 } 7212 }
@@ -7140,12 +7220,12 @@ void normalize_rt_tasks(void)
7140 unsigned long flags; 7220 unsigned long flags;
7141 struct rq *rq; 7221 struct rq *rq;
7142 7222
7143 read_lock_irqsave(&tasklist_lock, flags); 7223 read_lock(&tasklist_lock);
7144 do_each_thread(g, p) { 7224 for_each_process_thread(g, p) {
7145 /* 7225 /*
7146 * Only normalize user tasks: 7226 * Only normalize user tasks:
7147 */ 7227 */
7148 if (!p->mm) 7228 if (p->flags & PF_KTHREAD)
7149 continue; 7229 continue;
7150 7230
7151 p->se.exec_start = 0; 7231 p->se.exec_start = 0;
@@ -7160,21 +7240,16 @@ void normalize_rt_tasks(void)
7160 * Renice negative nice level userspace 7240 * Renice negative nice level userspace
7161 * tasks back to 0: 7241 * tasks back to 0:
7162 */ 7242 */
7163 if (task_nice(p) < 0 && p->mm) 7243 if (task_nice(p) < 0)
7164 set_user_nice(p, 0); 7244 set_user_nice(p, 0);
7165 continue; 7245 continue;
7166 } 7246 }
7167 7247
7168 raw_spin_lock(&p->pi_lock); 7248 rq = task_rq_lock(p, &flags);
7169 rq = __task_rq_lock(p);
7170
7171 normalize_task(rq, p); 7249 normalize_task(rq, p);
7172 7250 task_rq_unlock(rq, p, &flags);
7173 __task_rq_unlock(rq); 7251 }
7174 raw_spin_unlock(&p->pi_lock); 7252 read_unlock(&tasklist_lock);
7175 } while_each_thread(g, p);
7176
7177 read_unlock_irqrestore(&tasklist_lock, flags);
7178} 7253}
7179 7254
7180#endif /* CONFIG_MAGIC_SYSRQ */ 7255#endif /* CONFIG_MAGIC_SYSRQ */
@@ -7314,19 +7389,19 @@ void sched_offline_group(struct task_group *tg)
7314void sched_move_task(struct task_struct *tsk) 7389void sched_move_task(struct task_struct *tsk)
7315{ 7390{
7316 struct task_group *tg; 7391 struct task_group *tg;
7317 int on_rq, running; 7392 int queued, running;
7318 unsigned long flags; 7393 unsigned long flags;
7319 struct rq *rq; 7394 struct rq *rq;
7320 7395
7321 rq = task_rq_lock(tsk, &flags); 7396 rq = task_rq_lock(tsk, &flags);
7322 7397
7323 running = task_current(rq, tsk); 7398 running = task_current(rq, tsk);
7324 on_rq = tsk->on_rq; 7399 queued = task_on_rq_queued(tsk);
7325 7400
7326 if (on_rq) 7401 if (queued)
7327 dequeue_task(rq, tsk, 0); 7402 dequeue_task(rq, tsk, 0);
7328 if (unlikely(running)) 7403 if (unlikely(running))
7329 tsk->sched_class->put_prev_task(rq, tsk); 7404 put_prev_task(rq, tsk);
7330 7405
7331 tg = container_of(task_css_check(tsk, cpu_cgrp_id, 7406 tg = container_of(task_css_check(tsk, cpu_cgrp_id,
7332 lockdep_is_held(&tsk->sighand->siglock)), 7407 lockdep_is_held(&tsk->sighand->siglock)),
@@ -7336,14 +7411,14 @@ void sched_move_task(struct task_struct *tsk)
7336 7411
7337#ifdef CONFIG_FAIR_GROUP_SCHED 7412#ifdef CONFIG_FAIR_GROUP_SCHED
7338 if (tsk->sched_class->task_move_group) 7413 if (tsk->sched_class->task_move_group)
7339 tsk->sched_class->task_move_group(tsk, on_rq); 7414 tsk->sched_class->task_move_group(tsk, queued);
7340 else 7415 else
7341#endif 7416#endif
7342 set_task_rq(tsk, task_cpu(tsk)); 7417 set_task_rq(tsk, task_cpu(tsk));
7343 7418
7344 if (unlikely(running)) 7419 if (unlikely(running))
7345 tsk->sched_class->set_curr_task(rq); 7420 tsk->sched_class->set_curr_task(rq);
7346 if (on_rq) 7421 if (queued)
7347 enqueue_task(rq, tsk, 0); 7422 enqueue_task(rq, tsk, 0);
7348 7423
7349 task_rq_unlock(rq, tsk, &flags); 7424 task_rq_unlock(rq, tsk, &flags);
@@ -7361,10 +7436,10 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
7361{ 7436{
7362 struct task_struct *g, *p; 7437 struct task_struct *g, *p;
7363 7438
7364 do_each_thread(g, p) { 7439 for_each_process_thread(g, p) {
7365 if (rt_task(p) && task_rq(p)->rt.tg == tg) 7440 if (rt_task(p) && task_group(p) == tg)
7366 return 1; 7441 return 1;
7367 } while_each_thread(g, p); 7442 }
7368 7443
7369 return 0; 7444 return 0;
7370} 7445}
@@ -7573,6 +7648,7 @@ static int sched_dl_global_constraints(void)
7573 u64 runtime = global_rt_runtime(); 7648 u64 runtime = global_rt_runtime();
7574 u64 period = global_rt_period(); 7649 u64 period = global_rt_period();
7575 u64 new_bw = to_ratio(period, runtime); 7650 u64 new_bw = to_ratio(period, runtime);
7651 struct dl_bw *dl_b;
7576 int cpu, ret = 0; 7652 int cpu, ret = 0;
7577 unsigned long flags; 7653 unsigned long flags;
7578 7654
@@ -7586,13 +7662,16 @@ static int sched_dl_global_constraints(void)
7586 * solutions is welcome! 7662 * solutions is welcome!
7587 */ 7663 */
7588 for_each_possible_cpu(cpu) { 7664 for_each_possible_cpu(cpu) {
7589 struct dl_bw *dl_b = dl_bw_of(cpu); 7665 rcu_read_lock_sched();
7666 dl_b = dl_bw_of(cpu);
7590 7667
7591 raw_spin_lock_irqsave(&dl_b->lock, flags); 7668 raw_spin_lock_irqsave(&dl_b->lock, flags);
7592 if (new_bw < dl_b->total_bw) 7669 if (new_bw < dl_b->total_bw)
7593 ret = -EBUSY; 7670 ret = -EBUSY;
7594 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7671 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7595 7672
7673 rcu_read_unlock_sched();
7674
7596 if (ret) 7675 if (ret)
7597 break; 7676 break;
7598 } 7677 }
@@ -7603,6 +7682,7 @@ static int sched_dl_global_constraints(void)
7603static void sched_dl_do_global(void) 7682static void sched_dl_do_global(void)
7604{ 7683{
7605 u64 new_bw = -1; 7684 u64 new_bw = -1;
7685 struct dl_bw *dl_b;
7606 int cpu; 7686 int cpu;
7607 unsigned long flags; 7687 unsigned long flags;
7608 7688
@@ -7616,11 +7696,14 @@ static void sched_dl_do_global(void)
7616 * FIXME: As above... 7696 * FIXME: As above...
7617 */ 7697 */
7618 for_each_possible_cpu(cpu) { 7698 for_each_possible_cpu(cpu) {
7619 struct dl_bw *dl_b = dl_bw_of(cpu); 7699 rcu_read_lock_sched();
7700 dl_b = dl_bw_of(cpu);
7620 7701
7621 raw_spin_lock_irqsave(&dl_b->lock, flags); 7702 raw_spin_lock_irqsave(&dl_b->lock, flags);
7622 dl_b->bw = new_bw; 7703 dl_b->bw = new_bw;
7623 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7704 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7705
7706 rcu_read_unlock_sched();
7624 } 7707 }
7625} 7708}
7626 7709
@@ -8001,7 +8084,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
8001 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; 8084 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
8002 8085
8003 quota = normalize_cfs_quota(tg, d); 8086 quota = normalize_cfs_quota(tg, d);
8004 parent_quota = parent_b->hierarchal_quota; 8087 parent_quota = parent_b->hierarchical_quota;
8005 8088
8006 /* 8089 /*
8007 * ensure max(child_quota) <= parent_quota, inherit when no 8090 * ensure max(child_quota) <= parent_quota, inherit when no
@@ -8012,7 +8095,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
8012 else if (parent_quota != RUNTIME_INF && quota > parent_quota) 8095 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
8013 return -EINVAL; 8096 return -EINVAL;
8014 } 8097 }
8015 cfs_b->hierarchal_quota = quota; 8098 cfs_b->hierarchical_quota = quota;
8016 8099
8017 return 0; 8100 return 0;
8018} 8101}
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index bd95963dae80..539ca3ce071b 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -107,9 +107,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
107 int best_cpu = -1; 107 int best_cpu = -1;
108 const struct sched_dl_entity *dl_se = &p->dl; 108 const struct sched_dl_entity *dl_se = &p->dl;
109 109
110 if (later_mask && cpumask_and(later_mask, cp->free_cpus, 110 if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) {
111 &p->cpus_allowed) && cpumask_and(later_mask,
112 later_mask, cpu_active_mask)) {
113 best_cpu = cpumask_any(later_mask); 111 best_cpu = cpumask_any(later_mask);
114 goto out; 112 goto out;
115 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && 113 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 72fdf06ef865..8394b1ee600c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -288,24 +288,29 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
288 struct signal_struct *sig = tsk->signal; 288 struct signal_struct *sig = tsk->signal;
289 cputime_t utime, stime; 289 cputime_t utime, stime;
290 struct task_struct *t; 290 struct task_struct *t;
291 291 unsigned int seq, nextseq;
292 times->utime = sig->utime; 292 unsigned long flags;
293 times->stime = sig->stime;
294 times->sum_exec_runtime = sig->sum_sched_runtime;
295 293
296 rcu_read_lock(); 294 rcu_read_lock();
297 /* make sure we can trust tsk->thread_group list */ 295 /* Attempt a lockless read on the first round. */
298 if (!likely(pid_alive(tsk))) 296 nextseq = 0;
299 goto out;
300
301 t = tsk;
302 do { 297 do {
303 task_cputime(t, &utime, &stime); 298 seq = nextseq;
304 times->utime += utime; 299 flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
305 times->stime += stime; 300 times->utime = sig->utime;
306 times->sum_exec_runtime += task_sched_runtime(t); 301 times->stime = sig->stime;
307 } while_each_thread(tsk, t); 302 times->sum_exec_runtime = sig->sum_sched_runtime;
308out: 303
304 for_each_thread(tsk, t) {
305 task_cputime(t, &utime, &stime);
306 times->utime += utime;
307 times->stime += stime;
308 times->sum_exec_runtime += task_sched_runtime(t);
309 }
310 /* If lockless access failed, take the lock. */
311 nextseq = 1;
312 } while (need_seqretry(&sig->stats_lock, seq));
313 done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
309 rcu_read_unlock(); 314 rcu_read_unlock();
310} 315}
311 316
@@ -550,6 +555,23 @@ drop_precision:
550} 555}
551 556
552/* 557/*
558 * Atomically advance counter to the new value. Interrupts, vcpu
559 * scheduling, and scaling inaccuracies can cause cputime_advance
560 * to be occasionally called with a new value smaller than counter.
561 * Let's enforce atomicity.
562 *
563 * Normally a caller will only go through this loop once, or not
564 * at all in case a previous caller updated counter the same jiffy.
565 */
566static void cputime_advance(cputime_t *counter, cputime_t new)
567{
568 cputime_t old;
569
570 while (new > (old = ACCESS_ONCE(*counter)))
571 cmpxchg_cputime(counter, old, new);
572}
573
574/*
553 * Adjust tick based cputime random precision against scheduler 575 * Adjust tick based cputime random precision against scheduler
554 * runtime accounting. 576 * runtime accounting.
555 */ 577 */
@@ -594,13 +616,8 @@ static void cputime_adjust(struct task_cputime *curr,
594 utime = rtime - stime; 616 utime = rtime - stime;
595 } 617 }
596 618
597 /* 619 cputime_advance(&prev->stime, stime);
598 * If the tick based count grows faster than the scheduler one, 620 cputime_advance(&prev->utime, utime);
599 * the result of the scaling may go backward.
600 * Let's enforce monotonicity.
601 */
602 prev->stime = max(prev->stime, stime);
603 prev->utime = max(prev->utime, utime);
604 621
605out: 622out:
606 *ut = prev->utime; 623 *ut = prev->utime;
@@ -617,9 +634,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
617 cputime_adjust(&cputime, &p->prev_cputime, ut, st); 634 cputime_adjust(&cputime, &p->prev_cputime, ut, st);
618} 635}
619 636
620/*
621 * Must be called with siglock held.
622 */
623void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) 637void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
624{ 638{
625 struct task_cputime cputime; 639 struct task_cputime cputime;
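Besides reworking thread_group_cputime() around the seqlock reader loop (read_seqbegin_or_lock_irqsave() with a lockless first pass), the cputime.c hunk above introduces cputime_advance(), which replaces the two max() updates in cputime_adjust() with a compare-and-swap loop so concurrent callers can only ever move prev->utime/prev->stime forward. A minimal userspace rendering of that loop, with a C11 atomic in place of cmpxchg_cputime() and 64-bit counters assumed:

#include <stdatomic.h>
#include <stdint.h>

/* Advance *counter to 'new' only if that moves it forward; a lost race just
 * reloads 'old' and re-checks, so the counter never goes backwards. */
static void demo_cputime_advance(_Atomic uint64_t *counter, uint64_t new)
{
	uint64_t old = atomic_load(counter);

	while (new > old &&
	       !atomic_compare_exchange_weak(counter, &old, new))
		;	/* failed CAS updated 'old'; the loop re-checks new > old */
}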
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 255ce138b652..abfaf3d9a29f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -530,7 +530,7 @@ again:
530 update_rq_clock(rq); 530 update_rq_clock(rq);
531 dl_se->dl_throttled = 0; 531 dl_se->dl_throttled = 0;
532 dl_se->dl_yielded = 0; 532 dl_se->dl_yielded = 0;
533 if (p->on_rq) { 533 if (task_on_rq_queued(p)) {
534 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); 534 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
535 if (task_has_dl_policy(rq->curr)) 535 if (task_has_dl_policy(rq->curr))
536 check_preempt_curr_dl(rq, p, 0); 536 check_preempt_curr_dl(rq, p, 0);
@@ -997,10 +997,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
997#ifdef CONFIG_SCHED_HRTICK 997#ifdef CONFIG_SCHED_HRTICK
998static void start_hrtick_dl(struct rq *rq, struct task_struct *p) 998static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
999{ 999{
1000 s64 delta = p->dl.dl_runtime - p->dl.runtime; 1000 hrtick_start(rq, p->dl.runtime);
1001
1002 if (delta > 10000)
1003 hrtick_start(rq, p->dl.runtime);
1004} 1001}
1005#endif 1002#endif
1006 1003
@@ -1030,7 +1027,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
1030 * means a stop task can slip in, in which case we need to 1027 * means a stop task can slip in, in which case we need to
1031 * re-start task selection. 1028 * re-start task selection.
1032 */ 1029 */
1033 if (rq->stop && rq->stop->on_rq) 1030 if (rq->stop && task_on_rq_queued(rq->stop))
1034 return RETRY_TASK; 1031 return RETRY_TASK;
1035 } 1032 }
1036 1033
@@ -1124,10 +1121,8 @@ static void set_curr_task_dl(struct rq *rq)
1124static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) 1121static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
1125{ 1122{
1126 if (!task_running(rq, p) && 1123 if (!task_running(rq, p) &&
1127 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && 1124 cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
1128 (p->nr_cpus_allowed > 1))
1129 return 1; 1125 return 1;
1130
1131 return 0; 1126 return 0;
1132} 1127}
1133 1128
@@ -1169,6 +1164,13 @@ static int find_later_rq(struct task_struct *task)
1169 if (task->nr_cpus_allowed == 1) 1164 if (task->nr_cpus_allowed == 1)
1170 return -1; 1165 return -1;
1171 1166
1167 /*
1168 * We have to consider system topology and task affinity
1169 * first, then we can look for a suitable cpu.
1170 */
1171 cpumask_copy(later_mask, task_rq(task)->rd->span);
1172 cpumask_and(later_mask, later_mask, cpu_active_mask);
1173 cpumask_and(later_mask, later_mask, &task->cpus_allowed);
1172 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, 1174 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
1173 task, later_mask); 1175 task, later_mask);
1174 if (best_cpu == -1) 1176 if (best_cpu == -1)
@@ -1257,7 +1259,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
1257 if (unlikely(task_rq(task) != rq || 1259 if (unlikely(task_rq(task) != rq ||
1258 !cpumask_test_cpu(later_rq->cpu, 1260 !cpumask_test_cpu(later_rq->cpu,
1259 &task->cpus_allowed) || 1261 &task->cpus_allowed) ||
1260 task_running(rq, task) || !task->on_rq)) { 1262 task_running(rq, task) ||
1263 !task_on_rq_queued(task))) {
1261 double_unlock_balance(rq, later_rq); 1264 double_unlock_balance(rq, later_rq);
1262 later_rq = NULL; 1265 later_rq = NULL;
1263 break; 1266 break;
@@ -1296,7 +1299,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
1296 BUG_ON(task_current(rq, p)); 1299 BUG_ON(task_current(rq, p));
1297 BUG_ON(p->nr_cpus_allowed <= 1); 1300 BUG_ON(p->nr_cpus_allowed <= 1);
1298 1301
1299 BUG_ON(!p->on_rq); 1302 BUG_ON(!task_on_rq_queued(p));
1300 BUG_ON(!dl_task(p)); 1303 BUG_ON(!dl_task(p));
1301 1304
1302 return p; 1305 return p;
@@ -1443,7 +1446,7 @@ static int pull_dl_task(struct rq *this_rq)
1443 dl_time_before(p->dl.deadline, 1446 dl_time_before(p->dl.deadline,
1444 this_rq->dl.earliest_dl.curr))) { 1447 this_rq->dl.earliest_dl.curr))) {
1445 WARN_ON(p == src_rq->curr); 1448 WARN_ON(p == src_rq->curr);
1446 WARN_ON(!p->on_rq); 1449 WARN_ON(!task_on_rq_queued(p));
1447 1450
1448 /* 1451 /*
1449 * Then we pull iff p has actually an earlier 1452 * Then we pull iff p has actually an earlier
@@ -1569,6 +1572,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
1569 if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) 1572 if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy))
1570 hrtimer_try_to_cancel(&p->dl.dl_timer); 1573 hrtimer_try_to_cancel(&p->dl.dl_timer);
1571 1574
1575 __dl_clear_params(p);
1576
1572#ifdef CONFIG_SMP 1577#ifdef CONFIG_SMP
1573 /* 1578 /*
1574 * Since this might be the only -deadline task on the rq, 1579 * Since this might be the only -deadline task on the rq,
@@ -1596,7 +1601,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1596 if (unlikely(p->dl.dl_throttled)) 1601 if (unlikely(p->dl.dl_throttled))
1597 return; 1602 return;
1598 1603
1599 if (p->on_rq && rq->curr != p) { 1604 if (task_on_rq_queued(p) && rq->curr != p) {
1600#ifdef CONFIG_SMP 1605#ifdef CONFIG_SMP
1601 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) 1606 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
1602 /* Only reschedule if pushing failed */ 1607 /* Only reschedule if pushing failed */
@@ -1614,7 +1619,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1614static void prio_changed_dl(struct rq *rq, struct task_struct *p, 1619static void prio_changed_dl(struct rq *rq, struct task_struct *p,
1615 int oldprio) 1620 int oldprio)
1616{ 1621{
1617 if (p->on_rq || rq->curr == p) { 1622 if (task_on_rq_queued(p) || rq->curr == p) {
1618#ifdef CONFIG_SMP 1623#ifdef CONFIG_SMP
1619 /* 1624 /*
1620 * This might be too much, but unfortunately 1625 * This might be too much, but unfortunately
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 627b3c34b821..ce33780d8f20 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -150,7 +150,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
150static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) 150static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
151{ 151{
152 struct task_struct *g, *p; 152 struct task_struct *g, *p;
153 unsigned long flags;
154 153
155 SEQ_printf(m, 154 SEQ_printf(m,
156 "\nrunnable tasks:\n" 155 "\nrunnable tasks:\n"
@@ -159,16 +158,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
159 "------------------------------------------------------" 158 "------------------------------------------------------"
160 "----------------------------------------------------\n"); 159 "----------------------------------------------------\n");
161 160
162 read_lock_irqsave(&tasklist_lock, flags); 161 rcu_read_lock();
163 162 for_each_process_thread(g, p) {
164 do_each_thread(g, p) {
165 if (task_cpu(p) != rq_cpu) 163 if (task_cpu(p) != rq_cpu)
166 continue; 164 continue;
167 165
168 print_task(m, rq, p); 166 print_task(m, rq, p);
169 } while_each_thread(g, p); 167 }
170 168 rcu_read_unlock();
171 read_unlock_irqrestore(&tasklist_lock, flags);
172} 169}
173 170
174void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 171void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
@@ -333,9 +330,7 @@ do { \
333 print_cfs_stats(m, cpu); 330 print_cfs_stats(m, cpu);
334 print_rt_stats(m, cpu); 331 print_rt_stats(m, cpu);
335 332
336 rcu_read_lock();
337 print_rq(m, rq, cpu); 333 print_rq(m, rq, cpu);
338 rcu_read_unlock();
339 spin_unlock_irqrestore(&sched_debug_lock, flags); 334 spin_unlock_irqrestore(&sched_debug_lock, flags);
340 SEQ_printf(m, "\n"); 335 SEQ_printf(m, "\n");
341} 336}
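Stripped of the printing details, the replacement above is the usual RCU-protected thread walk, and because print_rq() now takes the read-side lock itself, the rcu_read_lock()/rcu_read_unlock() pair around the call site is dropped in the second hunk. A compact sketch of the pattern (names as in the hunk):

	rcu_read_lock();
	for_each_process_thread(g, p) {
		if (task_cpu(p) != rq_cpu)
			continue;
		print_task(m, rq, p);	/* p stays valid inside the read section */
	}
	rcu_read_unlock();

The dump may now race with fork/exit, which is acceptable for a debug listing and avoids holding tasklist_lock with interrupts disabled.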
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 82088b29704e..b78280c59b46 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,7 @@
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/cpumask.h> 25#include <linux/cpumask.h>
26#include <linux/cpuidle.h>
26#include <linux/slab.h> 27#include <linux/slab.h>
27#include <linux/profile.h> 28#include <linux/profile.h>
28#include <linux/interrupt.h> 29#include <linux/interrupt.h>
@@ -665,6 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
665} 666}
666 667
667#ifdef CONFIG_SMP 668#ifdef CONFIG_SMP
669static int select_idle_sibling(struct task_struct *p, int cpu);
668static unsigned long task_h_load(struct task_struct *p); 670static unsigned long task_h_load(struct task_struct *p);
669 671
670static inline void __update_task_entity_contrib(struct sched_entity *se); 672static inline void __update_task_entity_contrib(struct sched_entity *se);
@@ -1038,7 +1040,8 @@ struct numa_stats {
1038 */ 1040 */
1039static void update_numa_stats(struct numa_stats *ns, int nid) 1041static void update_numa_stats(struct numa_stats *ns, int nid)
1040{ 1042{
1041 int cpu, cpus = 0; 1043 int smt, cpu, cpus = 0;
1044 unsigned long capacity;
1042 1045
1043 memset(ns, 0, sizeof(*ns)); 1046 memset(ns, 0, sizeof(*ns));
1044 for_each_cpu(cpu, cpumask_of_node(nid)) { 1047 for_each_cpu(cpu, cpumask_of_node(nid)) {
@@ -1062,8 +1065,12 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
1062 if (!cpus) 1065 if (!cpus)
1063 return; 1066 return;
1064 1067
1065 ns->task_capacity = 1068 /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1066 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); 1069 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1070 capacity = cpus / smt; /* cores */
1071
1072 ns->task_capacity = min_t(unsigned, capacity,
1073 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1067 ns->has_free_capacity = (ns->nr_running < ns->task_capacity); 1074 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1068} 1075}
1069 1076
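To see what the SMT correction buys, here is the arithmetic with invented numbers for a hypothetical SMT-2 node: 8 logical CPUs whose per-CPU capacity works out to roughly 589. A standalone sketch, not kernel code:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(n, d)	(((n) + (d) / 2) / (d))

int main(void)
{
	unsigned long cpus = 8, compute_capacity = 8 * 589;	/* 4712 */
	unsigned long smt, capacity, old_cap, new_cap;

	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, compute_capacity); /* ceil(8192/4712) = 2 */
	capacity = cpus / smt;						   /* 4 cores */

	old_cap = DIV_ROUND_CLOSEST(compute_capacity, SCHED_CAPACITY_SCALE); /* 5 */
	new_cap = capacity < old_cap ? capacity : old_cap;		      /* 4 */

	printf("old task_capacity=%lu, new task_capacity=%lu\n", old_cap, new_cap);
	return 0;
}

The old formula counted SMT siblings as extra task slots (5 here); the new one caps the estimate at the number of cores, so NUMA balancing stops piling a fifth task onto a four-core node.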
@@ -1206,7 +1213,7 @@ static void task_numa_compare(struct task_numa_env *env,
1206 1213
1207 if (!cur) { 1214 if (!cur) {
1208 /* Is there capacity at our destination? */ 1215 /* Is there capacity at our destination? */
1209 if (env->src_stats.has_free_capacity && 1216 if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1210 !env->dst_stats.has_free_capacity) 1217 !env->dst_stats.has_free_capacity)
1211 goto unlock; 1218 goto unlock;
1212 1219
@@ -1252,6 +1259,13 @@ balance:
1252 if (load_too_imbalanced(src_load, dst_load, env)) 1259 if (load_too_imbalanced(src_load, dst_load, env))
1253 goto unlock; 1260 goto unlock;
1254 1261
1262 /*
1263 * One idle CPU per node is evaluated for a task numa move.
1264 * Call select_idle_sibling to maybe find a better one.
1265 */
1266 if (!cur)
1267 env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
1268
1255assign: 1269assign:
1256 task_numa_assign(env, cur, imp); 1270 task_numa_assign(env, cur, imp);
1257unlock: 1271unlock:
@@ -1775,7 +1789,7 @@ void task_numa_free(struct task_struct *p)
1775 list_del(&p->numa_entry); 1789 list_del(&p->numa_entry);
1776 grp->nr_tasks--; 1790 grp->nr_tasks--;
1777 spin_unlock_irqrestore(&grp->lock, flags); 1791 spin_unlock_irqrestore(&grp->lock, flags);
1778 rcu_assign_pointer(p->numa_group, NULL); 1792 RCU_INIT_POINTER(p->numa_group, NULL);
1779 put_numa_group(grp); 1793 put_numa_group(grp);
1780 } 1794 }
1781 1795
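The rcu_assign_pointer() to RCU_INIT_POINTER() change above is purely about memory ordering; the general rule, as a reminder rather than code from this patch:

	/*
	 * Publishing a structure needs release ordering so readers that see
	 * the pointer also see its initialized contents:
	 */
	rcu_assign_pointer(p->numa_group, grp);

	/*
	 * Storing NULL publishes nothing, so the barrier is unnecessary and
	 * RCU_INIT_POINTER() (a plain assignment) suffices:
	 */
	RCU_INIT_POINTER(p->numa_group, NULL);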
@@ -1804,10 +1818,6 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1804 if (!p->mm) 1818 if (!p->mm)
1805 return; 1819 return;
1806 1820
1807 /* Do not worry about placement if exiting */
1808 if (p->state == TASK_DEAD)
1809 return;
1810
1811 /* Allocate buffer to track faults on a per-node basis */ 1821 /* Allocate buffer to track faults on a per-node basis */
1812 if (unlikely(!p->numa_faults_memory)) { 1822 if (unlikely(!p->numa_faults_memory)) {
1813 int size = sizeof(*p->numa_faults_memory) * 1823 int size = sizeof(*p->numa_faults_memory) *
@@ -2211,8 +2221,8 @@ static __always_inline u64 decay_load(u64 val, u64 n)
2211 2221
2212 /* 2222 /*
2213 * As y^PERIOD = 1/2, we can combine 2223 * As y^PERIOD = 1/2, we can combine
2214 * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) 2224 * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
2215 * With a look-up table which covers k^n (n<PERIOD) 2225 * With a look-up table which covers y^n (n<PERIOD)
2216 * 2226 *
2217 * To achieve constant time decay_load. 2227 * To achieve constant time decay_load.
2218 */ 2228 */
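The corrected comment describes the decomposition decay_load() has always used. A condensed sketch of it, assuming the usual LOAD_AVG_PERIOD of 32 and the runnable_avg_yN_inv[] table of y^k values scaled by 2^32; the real function additionally clamps very large n:

static u64 decay_load_sketch(u64 val, u64 n)
{
	/* y^PERIOD = 1/2, so whole periods are plain right shifts ... */
	val >>= n / LOAD_AVG_PERIOD;				/* 1/2^(n/PERIOD) */
	/* ... and the remainder comes from the precomputed table. */
	return (val * runnable_avg_yN_inv[n % LOAD_AVG_PERIOD]) >> 32;	/* y^(n%PERIOD) */
}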
@@ -2377,6 +2387,9 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2377 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; 2387 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
2378 tg_contrib -= cfs_rq->tg_load_contrib; 2388 tg_contrib -= cfs_rq->tg_load_contrib;
2379 2389
2390 if (!tg_contrib)
2391 return;
2392
2380 if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { 2393 if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
2381 atomic_long_add(tg_contrib, &tg->load_avg); 2394 atomic_long_add(tg_contrib, &tg->load_avg);
2382 cfs_rq->tg_load_contrib += tg_contrib; 2395 cfs_rq->tg_load_contrib += tg_contrib;
@@ -3892,14 +3905,6 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
3892 resched_curr(rq); 3905 resched_curr(rq);
3893 return; 3906 return;
3894 } 3907 }
3895
3896 /*
3897 * Don't schedule slices shorter than 10000ns, that just
3898 * doesn't make sense. Rely on vruntime for fairness.
3899 */
3900 if (rq->curr != p)
3901 delta = max_t(s64, 10000LL, delta);
3902
3903 hrtick_start(rq, delta); 3908 hrtick_start(rq, delta);
3904 } 3909 }
3905} 3910}
@@ -4087,7 +4092,7 @@ static unsigned long capacity_of(int cpu)
4087static unsigned long cpu_avg_load_per_task(int cpu) 4092static unsigned long cpu_avg_load_per_task(int cpu)
4088{ 4093{
4089 struct rq *rq = cpu_rq(cpu); 4094 struct rq *rq = cpu_rq(cpu);
4090 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 4095 unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
4091 unsigned long load_avg = rq->cfs.runnable_load_avg; 4096 unsigned long load_avg = rq->cfs.runnable_load_avg;
4092 4097
4093 if (nr_running) 4098 if (nr_running)
@@ -4276,8 +4281,8 @@ static int wake_wide(struct task_struct *p)
4276static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 4281static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4277{ 4282{
4278 s64 this_load, load; 4283 s64 this_load, load;
4284 s64 this_eff_load, prev_eff_load;
4279 int idx, this_cpu, prev_cpu; 4285 int idx, this_cpu, prev_cpu;
4280 unsigned long tl_per_task;
4281 struct task_group *tg; 4286 struct task_group *tg;
4282 unsigned long weight; 4287 unsigned long weight;
4283 int balanced; 4288 int balanced;
@@ -4320,47 +4325,30 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4320 * Otherwise check if either cpus are near enough in load to allow this 4325 * Otherwise check if either cpus are near enough in load to allow this
4321 * task to be woken on this_cpu. 4326 * task to be woken on this_cpu.
4322 */ 4327 */
4323 if (this_load > 0) { 4328 this_eff_load = 100;
4324 s64 this_eff_load, prev_eff_load; 4329 this_eff_load *= capacity_of(prev_cpu);
4325 4330
4326 this_eff_load = 100; 4331 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4327 this_eff_load *= capacity_of(prev_cpu); 4332 prev_eff_load *= capacity_of(this_cpu);
4333
4334 if (this_load > 0) {
4328 this_eff_load *= this_load + 4335 this_eff_load *= this_load +
4329 effective_load(tg, this_cpu, weight, weight); 4336 effective_load(tg, this_cpu, weight, weight);
4330 4337
4331 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4332 prev_eff_load *= capacity_of(this_cpu);
4333 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); 4338 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
4339 }
4334 4340
4335 balanced = this_eff_load <= prev_eff_load; 4341 balanced = this_eff_load <= prev_eff_load;
4336 } else
4337 balanced = true;
4338
4339 /*
4340 * If the currently running task will sleep within
4341 * a reasonable amount of time then attract this newly
4342 * woken task:
4343 */
4344 if (sync && balanced)
4345 return 1;
4346 4342
4347 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); 4343 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
4348 tl_per_task = cpu_avg_load_per_task(this_cpu);
4349 4344
4350 if (balanced || 4345 if (!balanced)
4351 (this_load <= load && 4346 return 0;
4352 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
4353 /*
4354 * This domain has SD_WAKE_AFFINE and
4355 * p is cache cold in this domain, and
4356 * there is no bad imbalance.
4357 */
4358 schedstat_inc(sd, ttwu_move_affine);
4359 schedstat_inc(p, se.statistics.nr_wakeups_affine);
4360 4347
4361 return 1; 4348 schedstat_inc(sd, ttwu_move_affine);
4362 } 4349 schedstat_inc(p, se.statistics.nr_wakeups_affine);
4363 return 0; 4350
4351 return 1;
4364} 4352}
4365 4353
4366/* 4354/*
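Plugging invented numbers into the reworked comparison above makes the effect visible. Assume imbalance_pct of 125, equal CPU capacities of 1024, this_load > 0, and arbitrary load figures; a standalone sketch:

	s64 this_eff_load = 100;			/* weight of the waking side */
	s64 prev_eff_load = 100 + (125 - 100) / 2;	/* 112: imbalance tolerance */

	this_eff_load *= 1024;		/* capacity_of(prev_cpu) */
	prev_eff_load *= 1024;		/* capacity_of(this_cpu) */

	this_eff_load *= 2048;		/* this_load + effective_load(tg, this_cpu, ...) */
	prev_eff_load *= 1900;		/* load + effective_load(tg, prev_cpu, ...) */

	/*
	 * 100 * 1024 * 2048 <= 112 * 1024 * 1900, so balanced == 1 and the
	 * wakeup may be pulled to the waking cpu even though it carries more
	 * load, because the difference stays within the domain's tolerance.
	 */

Compared with the old code, the sync-wakeup shortcut and the tl_per_task fallback are gone; the balanced test alone now decides.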
@@ -4428,20 +4416,46 @@ static int
4428find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 4416find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
4429{ 4417{
4430 unsigned long load, min_load = ULONG_MAX; 4418 unsigned long load, min_load = ULONG_MAX;
4431 int idlest = -1; 4419 unsigned int min_exit_latency = UINT_MAX;
4420 u64 latest_idle_timestamp = 0;
4421 int least_loaded_cpu = this_cpu;
4422 int shallowest_idle_cpu = -1;
4432 int i; 4423 int i;
4433 4424
4434 /* Traverse only the allowed CPUs */ 4425 /* Traverse only the allowed CPUs */
4435 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { 4426 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
4436 load = weighted_cpuload(i); 4427 if (idle_cpu(i)) {
4437 4428 struct rq *rq = cpu_rq(i);
4438 if (load < min_load || (load == min_load && i == this_cpu)) { 4429 struct cpuidle_state *idle = idle_get_state(rq);
4439 min_load = load; 4430 if (idle && idle->exit_latency < min_exit_latency) {
4440 idlest = i; 4431 /*
4432 * We give priority to a CPU whose idle state
4433 * has the smallest exit latency irrespective
4434 * of any idle timestamp.
4435 */
4436 min_exit_latency = idle->exit_latency;
4437 latest_idle_timestamp = rq->idle_stamp;
4438 shallowest_idle_cpu = i;
4439 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
4440 rq->idle_stamp > latest_idle_timestamp) {
4441 /*
4442 * If equal or no active idle state, then
4443 * the most recently idled CPU might have
4444 * a warmer cache.
4445 */
4446 latest_idle_timestamp = rq->idle_stamp;
4447 shallowest_idle_cpu = i;
4448 }
4449 } else {
4450 load = weighted_cpuload(i);
4451 if (load < min_load || (load == min_load && i == this_cpu)) {
4452 min_load = load;
4453 least_loaded_cpu = i;
4454 }
4441 } 4455 }
4442 } 4456 }
4443 4457
4444 return idlest; 4458 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
4445} 4459}
4446 4460
4447/* 4461/*
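A toy scenario (all numbers invented) showing the new preference order in the loop above:

	/*
	 *   cpu0: idle, exit_latency   2us, idle_stamp 1000   <- picked
	 *   cpu1: idle, exit_latency 133us, idle_stamp 2000
	 *   cpu2: busy, weighted_cpuload() == 512
	 *
	 * Old code: only load is compared, so the two idle cpus tie at 0 and
	 * iteration order decides.
	 * New code: cpu0 wins because its idle state is the cheapest to leave;
	 * cpu1 would only win on idle_stamp if its exit latency matched; cpu2's
	 * load is consulted only when no allowed cpu is idle at all.
	 */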
@@ -4513,11 +4527,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4513 if (p->nr_cpus_allowed == 1) 4527 if (p->nr_cpus_allowed == 1)
4514 return prev_cpu; 4528 return prev_cpu;
4515 4529
4516 if (sd_flag & SD_BALANCE_WAKE) { 4530 if (sd_flag & SD_BALANCE_WAKE)
4517 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) 4531 want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
4518 want_affine = 1;
4519 new_cpu = prev_cpu;
4520 }
4521 4532
4522 rcu_read_lock(); 4533 rcu_read_lock();
4523 for_each_domain(cpu, tmp) { 4534 for_each_domain(cpu, tmp) {
@@ -4704,7 +4715,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
4704 return; 4715 return;
4705 4716
4706 /* 4717 /*
4707 * This is possible from callers such as move_task(), in which we 4718 * This is possible from callers such as attach_tasks(), in which we
4708 * unconditionally check_prempt_curr() after an enqueue (which may have 4719 * unconditionally check_prempt_curr() after an enqueue (which may have
4709 * lead to a throttle). This both saves work and prevents false 4720 * lead to a throttle). This both saves work and prevents false
4710 * next-buddy nomination below. 4721 * next-buddy nomination below.
@@ -5112,27 +5123,18 @@ struct lb_env {
5112 unsigned int loop_max; 5123 unsigned int loop_max;
5113 5124
5114 enum fbq_type fbq_type; 5125 enum fbq_type fbq_type;
5126 struct list_head tasks;
5115}; 5127};
5116 5128
5117/* 5129/*
5118 * move_task - move a task from one runqueue to another runqueue.
5119 * Both runqueues must be locked.
5120 */
5121static void move_task(struct task_struct *p, struct lb_env *env)
5122{
5123 deactivate_task(env->src_rq, p, 0);
5124 set_task_cpu(p, env->dst_cpu);
5125 activate_task(env->dst_rq, p, 0);
5126 check_preempt_curr(env->dst_rq, p, 0);
5127}
5128
5129/*
5130 * Is this task likely cache-hot: 5130 * Is this task likely cache-hot:
5131 */ 5131 */
5132static int task_hot(struct task_struct *p, struct lb_env *env) 5132static int task_hot(struct task_struct *p, struct lb_env *env)
5133{ 5133{
5134 s64 delta; 5134 s64 delta;
5135 5135
5136 lockdep_assert_held(&env->src_rq->lock);
5137
5136 if (p->sched_class != &fair_sched_class) 5138 if (p->sched_class != &fair_sched_class)
5137 return 0; 5139 return 0;
5138 5140
@@ -5252,6 +5254,9 @@ static
5252int can_migrate_task(struct task_struct *p, struct lb_env *env) 5254int can_migrate_task(struct task_struct *p, struct lb_env *env)
5253{ 5255{
5254 int tsk_cache_hot = 0; 5256 int tsk_cache_hot = 0;
5257
5258 lockdep_assert_held(&env->src_rq->lock);
5259
5255 /* 5260 /*
5256 * We do not migrate tasks that are: 5261 * We do not migrate tasks that are:
5257 * 1) throttled_lb_pair, or 5262 * 1) throttled_lb_pair, or
@@ -5310,24 +5315,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
5310 if (!tsk_cache_hot) 5315 if (!tsk_cache_hot)
5311 tsk_cache_hot = migrate_degrades_locality(p, env); 5316 tsk_cache_hot = migrate_degrades_locality(p, env);
5312 5317
5313 if (migrate_improves_locality(p, env)) { 5318 if (migrate_improves_locality(p, env) || !tsk_cache_hot ||
5314#ifdef CONFIG_SCHEDSTATS 5319 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
5315 if (tsk_cache_hot) { 5320 if (tsk_cache_hot) {
5316 schedstat_inc(env->sd, lb_hot_gained[env->idle]); 5321 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5317 schedstat_inc(p, se.statistics.nr_forced_migrations); 5322 schedstat_inc(p, se.statistics.nr_forced_migrations);
5318 } 5323 }
5319#endif
5320 return 1;
5321 }
5322
5323 if (!tsk_cache_hot ||
5324 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
5325
5326 if (tsk_cache_hot) {
5327 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5328 schedstat_inc(p, se.statistics.nr_forced_migrations);
5329 }
5330
5331 return 1; 5324 return 1;
5332 } 5325 }
5333 5326
@@ -5336,47 +5329,63 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
5336} 5329}
5337 5330
5338/* 5331/*
5339 * move_one_task tries to move exactly one task from busiest to this_rq, as 5332 * detach_task() -- detach the task for the migration specified in env
5333 */
5334static void detach_task(struct task_struct *p, struct lb_env *env)
5335{
5336 lockdep_assert_held(&env->src_rq->lock);
5337
5338 deactivate_task(env->src_rq, p, 0);
5339 p->on_rq = TASK_ON_RQ_MIGRATING;
5340 set_task_cpu(p, env->dst_cpu);
5341}
5342
5343/*
5344 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
5340 * part of active balancing operations within "domain". 5345 * part of active balancing operations within "domain".
5341 * Returns 1 if successful and 0 otherwise.
5342 * 5346 *
5343 * Called with both runqueues locked. 5347 * Returns a task if successful and NULL otherwise.
5344 */ 5348 */
5345static int move_one_task(struct lb_env *env) 5349static struct task_struct *detach_one_task(struct lb_env *env)
5346{ 5350{
5347 struct task_struct *p, *n; 5351 struct task_struct *p, *n;
5348 5352
5353 lockdep_assert_held(&env->src_rq->lock);
5354
5349 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { 5355 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
5350 if (!can_migrate_task(p, env)) 5356 if (!can_migrate_task(p, env))
5351 continue; 5357 continue;
5352 5358
5353 move_task(p, env); 5359 detach_task(p, env);
5360
5354 /* 5361 /*
5355 * Right now, this is only the second place move_task() 5362 * Right now, this is only the second place where
5356 * is called, so we can safely collect move_task() 5363 * lb_gained[env->idle] is updated (other is detach_tasks)
5357 * stats here rather than inside move_task(). 5364 * so we can safely collect stats here rather than
5365 * inside detach_tasks().
5358 */ 5366 */
5359 schedstat_inc(env->sd, lb_gained[env->idle]); 5367 schedstat_inc(env->sd, lb_gained[env->idle]);
5360 return 1; 5368 return p;
5361 } 5369 }
5362 return 0; 5370 return NULL;
5363} 5371}
5364 5372
5365static const unsigned int sched_nr_migrate_break = 32; 5373static const unsigned int sched_nr_migrate_break = 32;
5366 5374
5367/* 5375/*
5368 * move_tasks tries to move up to imbalance weighted load from busiest to 5376 * detach_tasks() -- tries to detach up to imbalance weighted load from
5369 * this_rq, as part of a balancing operation within domain "sd". 5377 * busiest_rq, as part of a balancing operation within domain "sd".
5370 * Returns 1 if successful and 0 otherwise.
5371 * 5378 *
5372 * Called with both runqueues locked. 5379 * Returns number of detached tasks if successful and 0 otherwise.
5373 */ 5380 */
5374static int move_tasks(struct lb_env *env) 5381static int detach_tasks(struct lb_env *env)
5375{ 5382{
5376 struct list_head *tasks = &env->src_rq->cfs_tasks; 5383 struct list_head *tasks = &env->src_rq->cfs_tasks;
5377 struct task_struct *p; 5384 struct task_struct *p;
5378 unsigned long load; 5385 unsigned long load;
5379 int pulled = 0; 5386 int detached = 0;
5387
5388 lockdep_assert_held(&env->src_rq->lock);
5380 5389
5381 if (env->imbalance <= 0) 5390 if (env->imbalance <= 0)
5382 return 0; 5391 return 0;
@@ -5407,14 +5416,16 @@ static int move_tasks(struct lb_env *env)
5407 if ((load / 2) > env->imbalance) 5416 if ((load / 2) > env->imbalance)
5408 goto next; 5417 goto next;
5409 5418
5410 move_task(p, env); 5419 detach_task(p, env);
5411 pulled++; 5420 list_add(&p->se.group_node, &env->tasks);
5421
5422 detached++;
5412 env->imbalance -= load; 5423 env->imbalance -= load;
5413 5424
5414#ifdef CONFIG_PREEMPT 5425#ifdef CONFIG_PREEMPT
5415 /* 5426 /*
5416 * NEWIDLE balancing is a source of latency, so preemptible 5427 * NEWIDLE balancing is a source of latency, so preemptible
5417 * kernels will stop after the first task is pulled to minimize 5428 * kernels will stop after the first task is detached to minimize
5418 * the critical section. 5429 * the critical section.
5419 */ 5430 */
5420 if (env->idle == CPU_NEWLY_IDLE) 5431 if (env->idle == CPU_NEWLY_IDLE)
@@ -5434,13 +5445,58 @@ next:
5434 } 5445 }
5435 5446
5436 /* 5447 /*
5437 * Right now, this is one of only two places move_task() is called, 5448 * Right now, this is one of only two places we collect this stat
5438 * so we can safely collect move_task() stats here rather than 5449 * so we can safely collect detach_one_task() stats here rather
5439 * inside move_task(). 5450 * than inside detach_one_task().
5440 */ 5451 */
5441 schedstat_add(env->sd, lb_gained[env->idle], pulled); 5452 schedstat_add(env->sd, lb_gained[env->idle], detached);
5453
5454 return detached;
5455}
5456
5457/*
5458 * attach_task() -- attach the task detached by detach_task() to its new rq.
5459 */
5460static void attach_task(struct rq *rq, struct task_struct *p)
5461{
5462 lockdep_assert_held(&rq->lock);
5463
5464 BUG_ON(task_rq(p) != rq);
5465 p->on_rq = TASK_ON_RQ_QUEUED;
5466 activate_task(rq, p, 0);
5467 check_preempt_curr(rq, p, 0);
5468}
5469
5470/*
5471 * attach_one_task() -- attaches the task returned from detach_one_task() to
5472 * its new rq.
5473 */
5474static void attach_one_task(struct rq *rq, struct task_struct *p)
5475{
5476 raw_spin_lock(&rq->lock);
5477 attach_task(rq, p);
5478 raw_spin_unlock(&rq->lock);
5479}
5480
5481/*
5482 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
5483 * new rq.
5484 */
5485static void attach_tasks(struct lb_env *env)
5486{
5487 struct list_head *tasks = &env->tasks;
5488 struct task_struct *p;
5489
5490 raw_spin_lock(&env->dst_rq->lock);
5491
5492 while (!list_empty(tasks)) {
5493 p = list_first_entry(tasks, struct task_struct, se.group_node);
5494 list_del_init(&p->se.group_node);
5442 5495
5443 return pulled; 5496 attach_task(env->dst_rq, p);
5497 }
5498
5499 raw_spin_unlock(&env->dst_rq->lock);
5444} 5500}
5445 5501
5446#ifdef CONFIG_FAIR_GROUP_SCHED 5502#ifdef CONFIG_FAIR_GROUP_SCHED
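The load_balance() hunk further down wires these helpers together; reduced to its locking skeleton it looks like this (a sketch of that hunk, not additional code):

	raw_spin_lock_irqsave(&busiest->lock, flags);

	cur_ld_moved = detach_tasks(&env);	/* src lock only; tasks parked on
						 * env.tasks as TASK_ON_RQ_MIGRATING */
	raw_spin_unlock(&busiest->lock);

	if (cur_ld_moved) {
		attach_tasks(&env);		/* dst lock only */
		ld_moved += cur_ld_moved;
	}

	local_irq_restore(flags);

The point of the split is that the two runqueue locks are never held at the same time, which is what lets the double_rq_lock()/move_tasks() pair disappear.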
@@ -5559,6 +5615,13 @@ static unsigned long task_h_load(struct task_struct *p)
5559#endif 5615#endif
5560 5616
5561/********** Helpers for find_busiest_group ************************/ 5617/********** Helpers for find_busiest_group ************************/
5618
5619enum group_type {
5620 group_other = 0,
5621 group_imbalanced,
5622 group_overloaded,
5623};
5624
5562/* 5625/*
5563 * sg_lb_stats - stats of a sched_group required for load_balancing 5626 * sg_lb_stats - stats of a sched_group required for load_balancing
5564 */ 5627 */
@@ -5572,7 +5635,7 @@ struct sg_lb_stats {
5572 unsigned int group_capacity_factor; 5635 unsigned int group_capacity_factor;
5573 unsigned int idle_cpus; 5636 unsigned int idle_cpus;
5574 unsigned int group_weight; 5637 unsigned int group_weight;
5575 int group_imb; /* Is there an imbalance in the group ? */ 5638 enum group_type group_type;
5576 int group_has_free_capacity; 5639 int group_has_free_capacity;
5577#ifdef CONFIG_NUMA_BALANCING 5640#ifdef CONFIG_NUMA_BALANCING
5578 unsigned int nr_numa_running; 5641 unsigned int nr_numa_running;
@@ -5610,6 +5673,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
5610 .total_capacity = 0UL, 5673 .total_capacity = 0UL,
5611 .busiest_stat = { 5674 .busiest_stat = {
5612 .avg_load = 0UL, 5675 .avg_load = 0UL,
5676 .sum_nr_running = 0,
5677 .group_type = group_other,
5613 }, 5678 },
5614 }; 5679 };
5615} 5680}
@@ -5652,19 +5717,17 @@ unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
5652 return default_scale_capacity(sd, cpu); 5717 return default_scale_capacity(sd, cpu);
5653} 5718}
5654 5719
5655static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu) 5720static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
5656{ 5721{
5657 unsigned long weight = sd->span_weight; 5722 if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
5658 unsigned long smt_gain = sd->smt_gain; 5723 return sd->smt_gain / sd->span_weight;
5659 5724
5660 smt_gain /= weight; 5725 return SCHED_CAPACITY_SCALE;
5661
5662 return smt_gain;
5663} 5726}
5664 5727
5665unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) 5728unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
5666{ 5729{
5667 return default_scale_smt_capacity(sd, cpu); 5730 return default_scale_cpu_capacity(sd, cpu);
5668} 5731}
5669 5732
5670static unsigned long scale_rt_capacity(int cpu) 5733static unsigned long scale_rt_capacity(int cpu)
@@ -5703,18 +5766,15 @@ static unsigned long scale_rt_capacity(int cpu)
5703 5766
5704static void update_cpu_capacity(struct sched_domain *sd, int cpu) 5767static void update_cpu_capacity(struct sched_domain *sd, int cpu)
5705{ 5768{
5706 unsigned long weight = sd->span_weight;
5707 unsigned long capacity = SCHED_CAPACITY_SCALE; 5769 unsigned long capacity = SCHED_CAPACITY_SCALE;
5708 struct sched_group *sdg = sd->groups; 5770 struct sched_group *sdg = sd->groups;
5709 5771
5710 if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { 5772 if (sched_feat(ARCH_CAPACITY))
5711 if (sched_feat(ARCH_CAPACITY)) 5773 capacity *= arch_scale_cpu_capacity(sd, cpu);
5712 capacity *= arch_scale_smt_capacity(sd, cpu); 5774 else
5713 else 5775 capacity *= default_scale_cpu_capacity(sd, cpu);
5714 capacity *= default_scale_smt_capacity(sd, cpu);
5715 5776
5716 capacity >>= SCHED_CAPACITY_SHIFT; 5777 capacity >>= SCHED_CAPACITY_SHIFT;
5717 }
5718 5778
5719 sdg->sgc->capacity_orig = capacity; 5779 sdg->sgc->capacity_orig = capacity;
5720 5780
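With the rename, every topology level goes through one capacity hook instead of an SMT-only one. Illustrative values below; smt_gain is commonly 1178 on x86, but treat the numbers as assumptions rather than part of the patch:

	/*
	 * SMT level, SD_SHARE_CPUCAPACITY set, span_weight == 2:
	 *	default_scale_cpu_capacity() = 1178 / 2 = 589 per thread
	 *
	 * Core/package levels (flag clear or span_weight == 1):
	 *	default_scale_cpu_capacity() = SCHED_CAPACITY_SCALE = 1024
	 *
	 * update_cpu_capacity() multiplies SCHED_CAPACITY_SCALE by that value
	 * and shifts by SCHED_CAPACITY_SHIFT, matching the old SMT-only special
	 * case while letting arch_scale_cpu_capacity() cover non-SMT asymmetry.
	 */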
@@ -5891,6 +5951,18 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro
5891 return capacity_factor; 5951 return capacity_factor;
5892} 5952}
5893 5953
5954static enum group_type
5955group_classify(struct sched_group *group, struct sg_lb_stats *sgs)
5956{
5957 if (sgs->sum_nr_running > sgs->group_capacity_factor)
5958 return group_overloaded;
5959
5960 if (sg_imbalanced(group))
5961 return group_imbalanced;
5962
5963 return group_other;
5964}
5965
5894/** 5966/**
5895 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 5967 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
5896 * @env: The load balancing environment. 5968 * @env: The load balancing environment.
@@ -5920,7 +5992,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5920 load = source_load(i, load_idx); 5992 load = source_load(i, load_idx);
5921 5993
5922 sgs->group_load += load; 5994 sgs->group_load += load;
5923 sgs->sum_nr_running += rq->nr_running; 5995 sgs->sum_nr_running += rq->cfs.h_nr_running;
5924 5996
5925 if (rq->nr_running > 1) 5997 if (rq->nr_running > 1)
5926 *overload = true; 5998 *overload = true;
@@ -5942,9 +6014,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5942 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 6014 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
5943 6015
5944 sgs->group_weight = group->group_weight; 6016 sgs->group_weight = group->group_weight;
5945
5946 sgs->group_imb = sg_imbalanced(group);
5947 sgs->group_capacity_factor = sg_capacity_factor(env, group); 6017 sgs->group_capacity_factor = sg_capacity_factor(env, group);
6018 sgs->group_type = group_classify(group, sgs);
5948 6019
5949 if (sgs->group_capacity_factor > sgs->sum_nr_running) 6020 if (sgs->group_capacity_factor > sgs->sum_nr_running)
5950 sgs->group_has_free_capacity = 1; 6021 sgs->group_has_free_capacity = 1;
@@ -5968,13 +6039,19 @@ static bool update_sd_pick_busiest(struct lb_env *env,
5968 struct sched_group *sg, 6039 struct sched_group *sg,
5969 struct sg_lb_stats *sgs) 6040 struct sg_lb_stats *sgs)
5970{ 6041{
5971 if (sgs->avg_load <= sds->busiest_stat.avg_load) 6042 struct sg_lb_stats *busiest = &sds->busiest_stat;
5972 return false;
5973 6043
5974 if (sgs->sum_nr_running > sgs->group_capacity_factor) 6044 if (sgs->group_type > busiest->group_type)
5975 return true; 6045 return true;
5976 6046
5977 if (sgs->group_imb) 6047 if (sgs->group_type < busiest->group_type)
6048 return false;
6049
6050 if (sgs->avg_load <= busiest->avg_load)
6051 return false;
6052
6053 /* This is the busiest node in its class. */
6054 if (!(env->sd->flags & SD_ASYM_PACKING))
5978 return true; 6055 return true;
5979 6056
5980 /* 6057 /*
@@ -5982,8 +6059,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
5982 * numbered CPUs in the group, therefore mark all groups 6059 * numbered CPUs in the group, therefore mark all groups
5983 * higher than ourself as busy. 6060 * higher than ourself as busy.
5984 */ 6061 */
5985 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && 6062 if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
5986 env->dst_cpu < group_first_cpu(sg)) {
5987 if (!sds->busiest) 6063 if (!sds->busiest)
5988 return true; 6064 return true;
5989 6065
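Ignoring the SD_ASYM_PACKING tail, the busiest-group choice above becomes a two-key comparison: group_type first (group_other < group_imbalanced < group_overloaded, per the enum added earlier in this file), average load as the tie breaker. A sketch of just that ordering; the function name is made up:

static bool pick_busiest_sketch(struct sg_lb_stats *busiest,
				struct sg_lb_stats *sgs)
{
	if (sgs->group_type != busiest->group_type)
		return sgs->group_type > busiest->group_type;

	/* Same class: fall back to the old average-load comparison. */
	return sgs->avg_load > busiest->avg_load;
}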
@@ -6228,7 +6304,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
6228 local = &sds->local_stat; 6304 local = &sds->local_stat;
6229 busiest = &sds->busiest_stat; 6305 busiest = &sds->busiest_stat;
6230 6306
6231 if (busiest->group_imb) { 6307 if (busiest->group_type == group_imbalanced) {
6232 /* 6308 /*
6233 * In the group_imb case we cannot rely on group-wide averages 6309 * In the group_imb case we cannot rely on group-wide averages
6234 * to ensure cpu-load equilibrium, look at wider averages. XXX 6310 * to ensure cpu-load equilibrium, look at wider averages. XXX
@@ -6248,12 +6324,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
6248 return fix_small_imbalance(env, sds); 6324 return fix_small_imbalance(env, sds);
6249 } 6325 }
6250 6326
6251 if (!busiest->group_imb) { 6327 /*
6252 /* 6328 * If there aren't any idle cpus, avoid creating some.
6253 * Don't want to pull so many tasks that a group would go idle. 6329 */
6254 * Except of course for the group_imb case, since then we might 6330 if (busiest->group_type == group_overloaded &&
6255 * have to drop below capacity to reach cpu-load equilibrium. 6331 local->group_type == group_overloaded) {
6256 */
6257 load_above_capacity = 6332 load_above_capacity =
6258 (busiest->sum_nr_running - busiest->group_capacity_factor); 6333 (busiest->sum_nr_running - busiest->group_capacity_factor);
6259 6334
@@ -6337,7 +6412,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6337 * work because they assume all things are equal, which typically 6412 * work because they assume all things are equal, which typically
6338 * isn't true due to cpus_allowed constraints and the like. 6413 * isn't true due to cpus_allowed constraints and the like.
6339 */ 6414 */
6340 if (busiest->group_imb) 6415 if (busiest->group_type == group_imbalanced)
6341 goto force_balance; 6416 goto force_balance;
6342 6417
6343 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 6418 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
@@ -6346,7 +6421,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6346 goto force_balance; 6421 goto force_balance;
6347 6422
6348 /* 6423 /*
6349 * If the local group is more busy than the selected busiest group 6424 * If the local group is busier than the selected busiest group
6350 * don't try and pull any tasks. 6425 * don't try and pull any tasks.
6351 */ 6426 */
6352 if (local->avg_load >= busiest->avg_load) 6427 if (local->avg_load >= busiest->avg_load)
@@ -6361,13 +6436,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6361 6436
6362 if (env->idle == CPU_IDLE) { 6437 if (env->idle == CPU_IDLE) {
6363 /* 6438 /*
6364 * This cpu is idle. If the busiest group load doesn't 6439 * This cpu is idle. If the busiest group is not overloaded
6365 * have more tasks than the number of available cpu's and 6440 * and there is no imbalance between this and busiest group
6366 * there is no imbalance between this and busiest group 6441 * wrt idle cpus, it is balanced. The imbalance becomes
6367 * wrt to idle cpu's, it is balanced. 6442 * significant if the diff is greater than 1 otherwise we
 6443 * might end up just moving the imbalance to another group
6368 */ 6444 */
6369 if ((local->idle_cpus < busiest->idle_cpus) && 6445 if ((busiest->group_type != group_overloaded) &&
6370 busiest->sum_nr_running <= busiest->group_weight) 6446 (local->idle_cpus <= (busiest->idle_cpus + 1)))
6371 goto out_balanced; 6447 goto out_balanced;
6372 } else { 6448 } else {
6373 /* 6449 /*
@@ -6550,6 +6626,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
6550 .loop_break = sched_nr_migrate_break, 6626 .loop_break = sched_nr_migrate_break,
6551 .cpus = cpus, 6627 .cpus = cpus,
6552 .fbq_type = all, 6628 .fbq_type = all,
6629 .tasks = LIST_HEAD_INIT(env.tasks),
6553 }; 6630 };
6554 6631
6555 /* 6632 /*
@@ -6599,23 +6676,30 @@ redo:
6599 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); 6676 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
6600 6677
6601more_balance: 6678more_balance:
6602 local_irq_save(flags); 6679 raw_spin_lock_irqsave(&busiest->lock, flags);
6603 double_rq_lock(env.dst_rq, busiest);
6604 6680
6605 /* 6681 /*
6606 * cur_ld_moved - load moved in current iteration 6682 * cur_ld_moved - load moved in current iteration
6607 * ld_moved - cumulative load moved across iterations 6683 * ld_moved - cumulative load moved across iterations
6608 */ 6684 */
6609 cur_ld_moved = move_tasks(&env); 6685 cur_ld_moved = detach_tasks(&env);
6610 ld_moved += cur_ld_moved;
6611 double_rq_unlock(env.dst_rq, busiest);
6612 local_irq_restore(flags);
6613 6686
6614 /* 6687 /*
6615 * some other cpu did the load balance for us. 6688 * We've detached some tasks from busiest_rq. Every
6689 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
6690 * unlock busiest->lock, and we are able to be sure
6691 * that nobody can manipulate the tasks in parallel.
6692 * See task_rq_lock() family for the details.
6616 */ 6693 */
6617 if (cur_ld_moved && env.dst_cpu != smp_processor_id()) 6694
6618 resched_cpu(env.dst_cpu); 6695 raw_spin_unlock(&busiest->lock);
6696
6697 if (cur_ld_moved) {
6698 attach_tasks(&env);
6699 ld_moved += cur_ld_moved;
6700 }
6701
6702 local_irq_restore(flags);
6619 6703
6620 if (env.flags & LBF_NEED_BREAK) { 6704 if (env.flags & LBF_NEED_BREAK) {
6621 env.flags &= ~LBF_NEED_BREAK; 6705 env.flags &= ~LBF_NEED_BREAK;
@@ -6665,10 +6749,8 @@ more_balance:
6665 if (sd_parent) { 6749 if (sd_parent) {
6666 int *group_imbalance = &sd_parent->groups->sgc->imbalance; 6750 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
6667 6751
6668 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { 6752 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
6669 *group_imbalance = 1; 6753 *group_imbalance = 1;
6670 } else if (*group_imbalance)
6671 *group_imbalance = 0;
6672 } 6754 }
6673 6755
6674 /* All tasks on this runqueue were pinned by CPU affinity */ 6756 /* All tasks on this runqueue were pinned by CPU affinity */
@@ -6679,7 +6761,7 @@ more_balance:
6679 env.loop_break = sched_nr_migrate_break; 6761 env.loop_break = sched_nr_migrate_break;
6680 goto redo; 6762 goto redo;
6681 } 6763 }
6682 goto out_balanced; 6764 goto out_all_pinned;
6683 } 6765 }
6684 } 6766 }
6685 6767
@@ -6744,7 +6826,7 @@ more_balance:
6744 * If we've begun active balancing, start to back off. This 6826 * If we've begun active balancing, start to back off. This
6745 * case may not be covered by the all_pinned logic if there 6827 * case may not be covered by the all_pinned logic if there
6746 * is only 1 task on the busy runqueue (because we don't call 6828 * is only 1 task on the busy runqueue (because we don't call
6747 * move_tasks). 6829 * detach_tasks).
6748 */ 6830 */
6749 if (sd->balance_interval < sd->max_interval) 6831 if (sd->balance_interval < sd->max_interval)
6750 sd->balance_interval *= 2; 6832 sd->balance_interval *= 2;
@@ -6753,6 +6835,23 @@ more_balance:
6753 goto out; 6835 goto out;
6754 6836
6755out_balanced: 6837out_balanced:
6838 /*
6839 * We reach balance although we may have faced some affinity
6840 * constraints. Clear the imbalance flag if it was set.
6841 */
6842 if (sd_parent) {
6843 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
6844
6845 if (*group_imbalance)
6846 *group_imbalance = 0;
6847 }
6848
6849out_all_pinned:
6850 /*
6851 * We reach balance because all tasks are pinned at this level so
 6852 * we can't migrate them. Leave the imbalance flag set so the parent level
6853 * can try to migrate them.
6854 */
6756 schedstat_inc(sd, lb_balanced[idle]); 6855 schedstat_inc(sd, lb_balanced[idle]);
6757 6856
6758 sd->nr_balance_failed = 0; 6857 sd->nr_balance_failed = 0;
@@ -6914,6 +7013,7 @@ static int active_load_balance_cpu_stop(void *data)
6914 int target_cpu = busiest_rq->push_cpu; 7013 int target_cpu = busiest_rq->push_cpu;
6915 struct rq *target_rq = cpu_rq(target_cpu); 7014 struct rq *target_rq = cpu_rq(target_cpu);
6916 struct sched_domain *sd; 7015 struct sched_domain *sd;
7016 struct task_struct *p = NULL;
6917 7017
6918 raw_spin_lock_irq(&busiest_rq->lock); 7018 raw_spin_lock_irq(&busiest_rq->lock);
6919 7019
@@ -6933,9 +7033,6 @@ static int active_load_balance_cpu_stop(void *data)
6933 */ 7033 */
6934 BUG_ON(busiest_rq == target_rq); 7034 BUG_ON(busiest_rq == target_rq);
6935 7035
6936 /* move a task from busiest_rq to target_rq */
6937 double_lock_balance(busiest_rq, target_rq);
6938
6939 /* Search for an sd spanning us and the target CPU. */ 7036 /* Search for an sd spanning us and the target CPU. */
6940 rcu_read_lock(); 7037 rcu_read_lock();
6941 for_each_domain(target_cpu, sd) { 7038 for_each_domain(target_cpu, sd) {
@@ -6956,16 +7053,22 @@ static int active_load_balance_cpu_stop(void *data)
6956 7053
6957 schedstat_inc(sd, alb_count); 7054 schedstat_inc(sd, alb_count);
6958 7055
6959 if (move_one_task(&env)) 7056 p = detach_one_task(&env);
7057 if (p)
6960 schedstat_inc(sd, alb_pushed); 7058 schedstat_inc(sd, alb_pushed);
6961 else 7059 else
6962 schedstat_inc(sd, alb_failed); 7060 schedstat_inc(sd, alb_failed);
6963 } 7061 }
6964 rcu_read_unlock(); 7062 rcu_read_unlock();
6965 double_unlock_balance(busiest_rq, target_rq);
6966out_unlock: 7063out_unlock:
6967 busiest_rq->active_balance = 0; 7064 busiest_rq->active_balance = 0;
6968 raw_spin_unlock_irq(&busiest_rq->lock); 7065 raw_spin_unlock(&busiest_rq->lock);
7066
7067 if (p)
7068 attach_one_task(target_rq, p);
7069
7070 local_irq_enable();
7071
6969 return 0; 7072 return 0;
6970} 7073}
6971 7074
@@ -7465,7 +7568,7 @@ static void task_fork_fair(struct task_struct *p)
7465static void 7568static void
7466prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) 7569prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
7467{ 7570{
7468 if (!p->se.on_rq) 7571 if (!task_on_rq_queued(p))
7469 return; 7572 return;
7470 7573
7471 /* 7574 /*
@@ -7490,11 +7593,11 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
7490 * switched back to the fair class the enqueue_entity(.flags=0) will 7593 * switched back to the fair class the enqueue_entity(.flags=0) will
7491 * do the right thing. 7594 * do the right thing.
7492 * 7595 *
7493 * If it's on_rq, then the dequeue_entity(.flags=0) will already 7596 * If it's queued, then the dequeue_entity(.flags=0) will already
7494 * have normalized the vruntime, if it's !on_rq, then only when 7597 * have normalized the vruntime, if it's !queued, then only when
7495 * the task is sleeping will it still have non-normalized vruntime. 7598 * the task is sleeping will it still have non-normalized vruntime.
7496 */ 7599 */
7497 if (!p->on_rq && p->state != TASK_RUNNING) { 7600 if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) {
7498 /* 7601 /*
7499 * Fix up our vruntime so that the current sleep doesn't 7602 * Fix up our vruntime so that the current sleep doesn't
7500 * cause 'unlimited' sleep bonus. 7603 * cause 'unlimited' sleep bonus.
@@ -7521,15 +7624,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
7521 */ 7624 */
7522static void switched_to_fair(struct rq *rq, struct task_struct *p) 7625static void switched_to_fair(struct rq *rq, struct task_struct *p)
7523{ 7626{
7524 struct sched_entity *se = &p->se;
7525#ifdef CONFIG_FAIR_GROUP_SCHED 7627#ifdef CONFIG_FAIR_GROUP_SCHED
7628 struct sched_entity *se = &p->se;
7526 /* 7629 /*
7527 * Since the real-depth could have been changed (only FAIR 7630 * Since the real-depth could have been changed (only FAIR
7528 * class maintain depth value), reset depth properly. 7631 * class maintain depth value), reset depth properly.
7529 */ 7632 */
7530 se->depth = se->parent ? se->parent->depth + 1 : 0; 7633 se->depth = se->parent ? se->parent->depth + 1 : 0;
7531#endif 7634#endif
7532 if (!se->on_rq) 7635 if (!task_on_rq_queued(p))
7533 return; 7636 return;
7534 7637
7535 /* 7638 /*
@@ -7575,7 +7678,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
7575} 7678}
7576 7679
7577#ifdef CONFIG_FAIR_GROUP_SCHED 7680#ifdef CONFIG_FAIR_GROUP_SCHED
7578static void task_move_group_fair(struct task_struct *p, int on_rq) 7681static void task_move_group_fair(struct task_struct *p, int queued)
7579{ 7682{
7580 struct sched_entity *se = &p->se; 7683 struct sched_entity *se = &p->se;
7581 struct cfs_rq *cfs_rq; 7684 struct cfs_rq *cfs_rq;
@@ -7594,7 +7697,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
7594 * fair sleeper stuff for the first placement, but who cares. 7697 * fair sleeper stuff for the first placement, but who cares.
7595 */ 7698 */
7596 /* 7699 /*
7597 * When !on_rq, vruntime of the task has usually NOT been normalized. 7700 * When !queued, vruntime of the task has usually NOT been normalized.
7598 * But there are some cases where it has already been normalized: 7701 * But there are some cases where it has already been normalized:
7599 * 7702 *
7600 * - Moving a forked child which is waiting for being woken up by 7703 * - Moving a forked child which is waiting for being woken up by
@@ -7605,14 +7708,14 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
7605 * To prevent boost or penalty in the new cfs_rq caused by delta 7708 * To prevent boost or penalty in the new cfs_rq caused by delta
7606 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. 7709 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
7607 */ 7710 */
7608 if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) 7711 if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
7609 on_rq = 1; 7712 queued = 1;
7610 7713
7611 if (!on_rq) 7714 if (!queued)
7612 se->vruntime -= cfs_rq_of(se)->min_vruntime; 7715 se->vruntime -= cfs_rq_of(se)->min_vruntime;
7613 set_task_rq(p, task_cpu(p)); 7716 set_task_rq(p, task_cpu(p));
7614 se->depth = se->parent ? se->parent->depth + 1 : 0; 7717 se->depth = se->parent ? se->parent->depth + 1 : 0;
7615 if (!on_rq) { 7718 if (!queued) {
7616 cfs_rq = cfs_rq_of(se); 7719 cfs_rq = cfs_rq_of(se);
7617 se->vruntime += cfs_rq->min_vruntime; 7720 se->vruntime += cfs_rq->min_vruntime;
7618#ifdef CONFIG_SMP 7721#ifdef CONFIG_SMP
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 11e7bc434f43..c47fce75e666 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -147,6 +147,9 @@ use_default:
147 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) 147 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
148 goto use_default; 148 goto use_default;
149 149
150 /* Take note of the planned idle state. */
151 idle_set_state(this_rq(), &drv->states[next_state]);
152
150 /* 153 /*
151 * Enter the idle state previously returned by the governor decision. 154 * Enter the idle state previously returned by the governor decision.
152 * This function will block until an interrupt occurs and will take 155 * This function will block until an interrupt occurs and will take
@@ -154,6 +157,9 @@ use_default:
154 */ 157 */
155 entered_state = cpuidle_enter(drv, dev, next_state); 158 entered_state = cpuidle_enter(drv, dev, next_state);
156 159
160 /* The cpu is no longer idle or about to enter idle. */
161 idle_set_state(this_rq(), NULL);
162
157 if (broadcast) 163 if (broadcast)
158 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); 164 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
159 165
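The state recorded here is what the reworked find_idlest_cpu() reads back through idle_get_state(), which the sched.h hunk below guards with WARN_ON(!rcu_read_lock_held()). A sketch of the consumer side under that constraint:

	struct cpuidle_state *state;
	unsigned int latency = UINT_MAX;

	rcu_read_lock();
	state = idle_get_state(cpu_rq(cpu));
	if (state)
		latency = state->exit_latency;	/* cost of leaving this idle state */
	rcu_read_unlock();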
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 5f6edca4fafd..87ea5bf1b87f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1448,7 +1448,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1448 * means a dl or stop task can slip in, in which case we need 1448 * means a dl or stop task can slip in, in which case we need
1449 * to re-start task selection. 1449 * to re-start task selection.
1450 */ 1450 */
1451 if (unlikely((rq->stop && rq->stop->on_rq) || 1451 if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
1452 rq->dl.dl_nr_running)) 1452 rq->dl.dl_nr_running))
1453 return RETRY_TASK; 1453 return RETRY_TASK;
1454 } 1454 }
@@ -1468,8 +1468,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1468 p = _pick_next_task_rt(rq); 1468 p = _pick_next_task_rt(rq);
1469 1469
1470 /* The running task is never eligible for pushing */ 1470 /* The running task is never eligible for pushing */
1471 if (p) 1471 dequeue_pushable_task(rq, p);
1472 dequeue_pushable_task(rq, p);
1473 1472
1474 set_post_schedule(rq); 1473 set_post_schedule(rq);
1475 1474
@@ -1624,7 +1623,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1624 !cpumask_test_cpu(lowest_rq->cpu, 1623 !cpumask_test_cpu(lowest_rq->cpu,
1625 tsk_cpus_allowed(task)) || 1624 tsk_cpus_allowed(task)) ||
1626 task_running(rq, task) || 1625 task_running(rq, task) ||
1627 !task->on_rq)) { 1626 !task_on_rq_queued(task))) {
1628 1627
1629 double_unlock_balance(rq, lowest_rq); 1628 double_unlock_balance(rq, lowest_rq);
1630 lowest_rq = NULL; 1629 lowest_rq = NULL;
@@ -1658,7 +1657,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
1658 BUG_ON(task_current(rq, p)); 1657 BUG_ON(task_current(rq, p));
1659 BUG_ON(p->nr_cpus_allowed <= 1); 1658 BUG_ON(p->nr_cpus_allowed <= 1);
1660 1659
1661 BUG_ON(!p->on_rq); 1660 BUG_ON(!task_on_rq_queued(p));
1662 BUG_ON(!rt_task(p)); 1661 BUG_ON(!rt_task(p));
1663 1662
1664 return p; 1663 return p;
@@ -1809,7 +1808,7 @@ static int pull_rt_task(struct rq *this_rq)
1809 */ 1808 */
1810 if (p && (p->prio < this_rq->rt.highest_prio.curr)) { 1809 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1811 WARN_ON(p == src_rq->curr); 1810 WARN_ON(p == src_rq->curr);
1812 WARN_ON(!p->on_rq); 1811 WARN_ON(!task_on_rq_queued(p));
1813 1812
1814 /* 1813 /*
1815 * There's a chance that p is higher in priority 1814 * There's a chance that p is higher in priority
@@ -1870,7 +1869,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1870 1869
1871 BUG_ON(!rt_task(p)); 1870 BUG_ON(!rt_task(p));
1872 1871
1873 if (!p->on_rq) 1872 if (!task_on_rq_queued(p))
1874 return; 1873 return;
1875 1874
1876 weight = cpumask_weight(new_mask); 1875 weight = cpumask_weight(new_mask);
@@ -1936,7 +1935,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1936 * we may need to handle the pulling of RT tasks 1935 * we may need to handle the pulling of RT tasks
1937 * now. 1936 * now.
1938 */ 1937 */
1939 if (!p->on_rq || rq->rt.rt_nr_running) 1938 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
1940 return; 1939 return;
1941 1940
1942 if (pull_rt_task(rq)) 1941 if (pull_rt_task(rq))
@@ -1970,7 +1969,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1970 * If that current running task is also an RT task 1969 * If that current running task is also an RT task
1971 * then see if we can move to another run queue. 1970 * then see if we can move to another run queue.
1972 */ 1971 */
1973 if (p->on_rq && rq->curr != p) { 1972 if (task_on_rq_queued(p) && rq->curr != p) {
1974#ifdef CONFIG_SMP 1973#ifdef CONFIG_SMP
1975 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && 1974 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
1976 /* Don't resched if we changed runqueues */ 1975 /* Don't resched if we changed runqueues */
@@ -1989,7 +1988,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1989static void 1988static void
1990prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) 1989prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1991{ 1990{
1992 if (!p->on_rq) 1991 if (!task_on_rq_queued(p))
1993 return; 1992 return;
1994 1993
1995 if (rq->curr == p) { 1994 if (rq->curr == p) {
@@ -2073,7 +2072,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2073 for_each_sched_rt_entity(rt_se) { 2072 for_each_sched_rt_entity(rt_se) {
2074 if (rt_se->run_list.prev != rt_se->run_list.next) { 2073 if (rt_se->run_list.prev != rt_se->run_list.next) {
2075 requeue_task_rt(rq, p, 0); 2074 requeue_task_rt(rq, p, 0);
2076 set_tsk_need_resched(p); 2075 resched_curr(rq);
2077 return; 2076 return;
2078 } 2077 }
2079 } 2078 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 579712f4e9d5..6130251de280 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -14,6 +14,11 @@
14#include "cpuacct.h" 14#include "cpuacct.h"
15 15
16struct rq; 16struct rq;
17struct cpuidle_state;
18
19/* task_struct::on_rq states: */
20#define TASK_ON_RQ_QUEUED 1
21#define TASK_ON_RQ_MIGRATING 2
17 22
18extern __read_mostly int scheduler_running; 23extern __read_mostly int scheduler_running;
19 24
@@ -126,6 +131,9 @@ struct rt_bandwidth {
126 u64 rt_runtime; 131 u64 rt_runtime;
127 struct hrtimer rt_period_timer; 132 struct hrtimer rt_period_timer;
128}; 133};
134
135void __dl_clear_params(struct task_struct *p);
136
129/* 137/*
130 * To keep the bandwidth of -deadline tasks and groups under control 138 * To keep the bandwidth of -deadline tasks and groups under control
131 * we need some place where: 139 * we need some place where:
@@ -184,7 +192,7 @@ struct cfs_bandwidth {
184 raw_spinlock_t lock; 192 raw_spinlock_t lock;
185 ktime_t period; 193 ktime_t period;
186 u64 quota, runtime; 194 u64 quota, runtime;
187 s64 hierarchal_quota; 195 s64 hierarchical_quota;
188 u64 runtime_expires; 196 u64 runtime_expires;
189 197
190 int idle, timer_active; 198 int idle, timer_active;
@@ -636,6 +644,11 @@ struct rq {
636#ifdef CONFIG_SMP 644#ifdef CONFIG_SMP
637 struct llist_head wake_list; 645 struct llist_head wake_list;
638#endif 646#endif
647
648#ifdef CONFIG_CPU_IDLE
649 /* Must be inspected within a rcu lock section */
650 struct cpuidle_state *idle_state;
651#endif
639}; 652};
640 653
641static inline int cpu_of(struct rq *rq) 654static inline int cpu_of(struct rq *rq)
@@ -647,7 +660,7 @@ static inline int cpu_of(struct rq *rq)
647#endif 660#endif
648} 661}
649 662
650DECLARE_PER_CPU(struct rq, runqueues); 663DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
651 664
652#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 665#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
653#define this_rq() (&__get_cpu_var(runqueues)) 666#define this_rq() (&__get_cpu_var(runqueues))
@@ -942,6 +955,15 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
942#endif 955#endif
943} 956}
944 957
958static inline int task_on_rq_queued(struct task_struct *p)
959{
960 return p->on_rq == TASK_ON_RQ_QUEUED;
961}
962
963static inline int task_on_rq_migrating(struct task_struct *p)
964{
965 return p->on_rq == TASK_ON_RQ_MIGRATING;
966}
945 967
946#ifndef prepare_arch_switch 968#ifndef prepare_arch_switch
947# define prepare_arch_switch(next) do { } while (0) 969# define prepare_arch_switch(next) do { } while (0)
@@ -953,7 +975,6 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
953# define finish_arch_post_lock_switch() do { } while (0) 975# define finish_arch_post_lock_switch() do { } while (0)
954#endif 976#endif
955 977
956#ifndef __ARCH_WANT_UNLOCKED_CTXSW
957static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 978static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
958{ 979{
959#ifdef CONFIG_SMP 980#ifdef CONFIG_SMP
@@ -991,35 +1012,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
991 raw_spin_unlock_irq(&rq->lock); 1012 raw_spin_unlock_irq(&rq->lock);
992} 1013}
993 1014
994#else /* __ARCH_WANT_UNLOCKED_CTXSW */
995static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
996{
997#ifdef CONFIG_SMP
998 /*
999 * We can optimise this out completely for !SMP, because the
1000 * SMP rebalancing from interrupt is the only thing that cares
1001 * here.
1002 */
1003 next->on_cpu = 1;
1004#endif
1005 raw_spin_unlock(&rq->lock);
1006}
1007
1008static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1009{
1010#ifdef CONFIG_SMP
1011 /*
1012 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1013 * We must ensure this doesn't happen until the switch is completely
1014 * finished.
1015 */
1016 smp_wmb();
1017 prev->on_cpu = 0;
1018#endif
1019 local_irq_enable();
1020}
1021#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
1022
1023/* 1015/*
1024 * wake flags 1016 * wake flags
1025 */ 1017 */
@@ -1180,6 +1172,30 @@ static inline void idle_exit_fair(struct rq *rq) { }
1180 1172
1181#endif 1173#endif
1182 1174
1175#ifdef CONFIG_CPU_IDLE
1176static inline void idle_set_state(struct rq *rq,
1177 struct cpuidle_state *idle_state)
1178{
1179 rq->idle_state = idle_state;
1180}
1181
1182static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1183{
1184 WARN_ON(!rcu_read_lock_held());
1185 return rq->idle_state;
1186}
1187#else
1188static inline void idle_set_state(struct rq *rq,
1189 struct cpuidle_state *idle_state)
1190{
1191}
1192
1193static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1194{
1195 return NULL;
1196}
1197#endif
1198
1183extern void sysrq_sched_debug_show(void); 1199extern void sysrq_sched_debug_show(void);
1184extern void sched_init_granularity(void); 1200extern void sched_init_granularity(void);
1185extern void update_max_interval(void); 1201extern void update_max_interval(void);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index bfe0edadbfbb..67426e529f59 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -28,7 +28,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
28{ 28{
29 struct task_struct *stop = rq->stop; 29 struct task_struct *stop = rq->stop;
30 30
31 if (!stop || !stop->on_rq) 31 if (!stop || !task_on_rq_queued(stop))
32 return NULL; 32 return NULL;
33 33
34 put_prev_task(rq, prev); 34 put_prev_task(rq, prev);
diff --git a/kernel/smp.c b/kernel/smp.c
index aff8aa14f547..9e0d0b289118 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,7 @@
13#include <linux/gfp.h> 13#include <linux/gfp.h>
14#include <linux/smp.h> 14#include <linux/smp.h>
15#include <linux/cpu.h> 15#include <linux/cpu.h>
16#include <linux/sched.h>
16 17
17#include "smpboot.h" 18#include "smpboot.h"
18 19
@@ -699,3 +700,24 @@ void kick_all_cpus_sync(void)
699 smp_call_function(do_nothing, NULL, 1); 700 smp_call_function(do_nothing, NULL, 1);
700} 701}
701EXPORT_SYMBOL_GPL(kick_all_cpus_sync); 702EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
703
704/**
705 * wake_up_all_idle_cpus - break all cpus out of idle
706 * wake_up_all_idle_cpus tries to break every cpu out of idle, including
707 * cpus that are idle-polling; for cpus that are not idle, nothing is
708 * done.
709 */
710void wake_up_all_idle_cpus(void)
711{
712 int cpu;
713
714 preempt_disable();
715 for_each_online_cpu(cpu) {
716 if (cpu == smp_processor_id())
717 continue;
718
719 wake_up_if_idle(cpu);
720 }
721 preempt_enable();
722}
723EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
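A hypothetical caller, only to show the intended shape of use (the function, not this caller, is what the patch adds): code that has just changed a system-wide constraint and wants every idle cpu to pass through the idle path again.

/* Hypothetical example -- not part of this series. */
static void example_constraint_changed(void)
{
	/*
	 * Wake every idle cpu (including ones polling in idle) so its next
	 * trip through the idle path sees the new constraint; busy cpus are
	 * left untouched.
	 */
	wake_up_all_idle_cpus();
}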
diff --git a/kernel/sys.c b/kernel/sys.c
index dfce4debd138..1eaa2f0b0246 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -869,11 +869,9 @@ void do_sys_times(struct tms *tms)
869{ 869{
870 cputime_t tgutime, tgstime, cutime, cstime; 870 cputime_t tgutime, tgstime, cutime, cstime;
871 871
872 spin_lock_irq(&current->sighand->siglock);
873 thread_group_cputime_adjusted(current, &tgutime, &tgstime); 872 thread_group_cputime_adjusted(current, &tgutime, &tgstime);
874 cutime = current->signal->cutime; 873 cutime = current->signal->cutime;
875 cstime = current->signal->cstime; 874 cstime = current->signal->cstime;
876 spin_unlock_irq(&current->sighand->siglock);
877 tms->tms_utime = cputime_to_clock_t(tgutime); 875 tms->tms_utime = cputime_to_clock_t(tgutime);
878 tms->tms_stime = cputime_to_clock_t(tgstime); 876 tms->tms_stime = cputime_to_clock_t(tgstime);
879 tms->tms_cutime = cputime_to_clock_t(cutime); 877 tms->tms_cutime = cputime_to_clock_t(cutime);
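Dropping siglock here relies on thread_group_cputime_adjusted() handling its own consistency internally (elsewhere in this pull the group totals appear to be guarded by a dedicated seqlock, judging by the cputime.c changes in the diffstat). The generic seqlock read-side pattern that lets a caller take a consistent snapshot without an external lock looks like this; illustrative only, not the cputime.c code:

#include <linux/seqlock.h>
#include <linux/types.h>

/* Illustrative: a writer updates both fields under write_seqlock(); readers
 * retry until they observe an unchanged, even sequence count, so they never
 * see a half-updated pair and never block the writer.
 */
struct times_snapshot {
	seqlock_t lock;
	u64 utime;
	u64 stime;
};

static void times_read(struct times_snapshot *t, u64 *ut, u64 *st)
{
	unsigned int seq;

	do {
		seq = read_seqbegin(&t->lock);
		*ut = t->utime;
		*st = t->stime;
	} while (read_seqretry(&t->lock, seq));
}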
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 1c2fe7de2842..ab370ffffd53 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1776,7 +1776,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
1776 */ 1776 */
1777 if (!expires) { 1777 if (!expires) {
1778 schedule(); 1778 schedule();
1779 __set_current_state(TASK_RUNNING);
1780 return -EINTR; 1779 return -EINTR;
1781 } 1780 }
1782 1781
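This removal (and the matching ones in the ring_buffer_benchmark.c hunks further down) drops a __set_current_state(TASK_RUNNING) issued right after schedule(). It is redundant: by the time schedule() returns, either the waker has already set the task back to TASK_RUNNING via try_to_wake_up(), or schedule() itself did so on seeing a pending signal. The usual pattern, with the dead store marked (sketch; helper name is illustrative):

#include <linux/sched.h>
#include <linux/wait.h>

/* Typical sleep/wake loop. Once schedule() returns, current->state is
 * already TASK_RUNNING again, so resetting it explicitly is a no-op.
 */
static void wait_for_flag(wait_queue_head_t *wq, bool *flag)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(wq, &wait, TASK_INTERRUPTIBLE);
		if (*flag)
			break;
		schedule();
		/* __set_current_state(TASK_RUNNING);   <-- redundant */
	}
	finish_wait(wq, &wait);
}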
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 3b8946416a5f..492b986195d5 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk,
272 if (same_thread_group(tsk, current)) 272 if (same_thread_group(tsk, current))
273 err = cpu_clock_sample(which_clock, tsk, &rtn); 273 err = cpu_clock_sample(which_clock, tsk, &rtn);
274 } else { 274 } else {
275 unsigned long flags;
276 struct sighand_struct *sighand;
277
278 /*
279 * while_each_thread() is not yet entirely RCU safe,
280 * keep locking the group while sampling process
281 * clock for now.
282 */
283 sighand = lock_task_sighand(tsk, &flags);
284 if (!sighand)
285 return err;
286
287 if (tsk == current || thread_group_leader(tsk)) 275 if (tsk == current || thread_group_leader(tsk))
288 err = cpu_clock_sample_group(which_clock, tsk, &rtn); 276 err = cpu_clock_sample_group(which_clock, tsk, &rtn);
289
290 unlock_task_sighand(tsk, &flags);
291 } 277 }
292 278
293 if (!err) 279 if (!err)
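The removed comment explains why the sighand lock was there: while_each_thread() was not RCU-safe. With the group clock sampled by walking the thread list via the RCU-safe for_each_thread(), pinning the group with lock_task_sighand() is no longer required. A simplified sketch of that kind of summation (not the literal thread_group_cputime() body):

#include <linux/rcupdate.h>
#include <linux/sched.h>

/* Illustrative: walk the thread group under rcu_read_lock() using the
 * RCU-safe for_each_thread() iterator and sum per-thread user time.
 * Threads exiting during the walk are handled by RCU; no sighand locking
 * is needed just to keep the list stable.
 */
static cputime_t group_utime_sketch(struct task_struct *tsk)
{
	struct task_struct *t;
	cputime_t sum = 0;

	rcu_read_lock();
	for_each_thread(tsk, t)
		sum += t->utime;
	rcu_read_unlock();

	return sum;
}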
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 0434ff1b808e..3f9e328c30b5 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -205,7 +205,6 @@ static void ring_buffer_consumer(void)
205 break; 205 break;
206 206
207 schedule(); 207 schedule();
208 __set_current_state(TASK_RUNNING);
209 } 208 }
210 reader_finish = 0; 209 reader_finish = 0;
211 complete(&read_done); 210 complete(&read_done);
@@ -379,7 +378,6 @@ static int ring_buffer_consumer_thread(void *arg)
379 break; 378 break;
380 379
381 schedule(); 380 schedule();
382 __set_current_state(TASK_RUNNING);
383 } 381 }
384 __set_current_state(TASK_RUNNING); 382 __set_current_state(TASK_RUNNING);
385 383
@@ -407,7 +405,6 @@ static int ring_buffer_producer_thread(void *arg)
407 trace_printk("Sleeping for 10 secs\n"); 405 trace_printk("Sleeping for 10 secs\n");
408 set_current_state(TASK_INTERRUPTIBLE); 406 set_current_state(TASK_INTERRUPTIBLE);
409 schedule_timeout(HZ * SLEEP_TIME); 407 schedule_timeout(HZ * SLEEP_TIME);
410 __set_current_state(TASK_RUNNING);
411 } 408 }
412 409
413 if (kill_test) 410 if (kill_test)
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 8a4e5cb66a4c..16eddb308c33 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -13,7 +13,6 @@
13#include <linux/sysctl.h> 13#include <linux/sysctl.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/magic.h>
17 16
18#include <asm/setup.h> 17#include <asm/setup.h>
19 18
@@ -171,8 +170,7 @@ check_stack(unsigned long ip, unsigned long *stack)
171 i++; 170 i++;
172 } 171 }
173 172
174 if ((current != &init_task && 173 if (task_stack_end_corrupted(current)) {
175 *(end_of_stack(current)) != STACK_END_MAGIC)) {
176 print_max_stack(); 174 print_max_stack();
177 BUG(); 175 BUG();
178 } 176 }
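task_stack_end_corrupted() folds the open-coded canary check into one helper; the dropped init_task exclusion suggests the init task's stack end now carries the canary as well. Assuming the helper has the obvious shape, it boils down to roughly:

#include <linux/magic.h>	/* STACK_END_MAGIC */
#include <linux/sched.h>	/* end_of_stack() */

/* Presumed shape of the helper: the word at the end of the task's stack is
 * initialised to STACK_END_MAGIC, so any other value there means the stack
 * has overflowed into the canary.
 */
#ifndef task_stack_end_corrupted
#define task_stack_end_corrupted(task) \
	(*(end_of_stack(task)) != STACK_END_MAGIC)
#endif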