| author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-10-13 10:23:15 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-10-13 10:23:15 -0400 |
| commit | faafcba3b5e15999cf75d5c5a513ac8e47e2545f (patch) | |
| tree | 47d58d1c00e650e820506c91eb9a41268756bdda /kernel | |
| parent | 13ead805c5a14b0e7ecd34f61404a5bfba655895 (diff) | |
| parent | f10e00f4bf360c36edbe6bf18a6c75b171cbe012 (diff) | |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"The main changes in this cycle were:
- Optimized support for Intel "Cluster-on-Die" (CoD) topologies (Dave
Hansen)
- Various sched/idle refinements for better idle handling (Nicolas
Pitre, Daniel Lezcano, Chuansheng Liu, Vincent Guittot)
- sched/numa updates and optimizations (Rik van Riel)
- sysbench speedup (Vincent Guittot)
- capacity calculation cleanups/refactoring (Vincent Guittot)
- Various cleanups to thread group iteration (Oleg Nesterov)
- Double-rq-lock removal optimization and various refactorings
(Kirill Tkhai)
- Various sched/deadline fixes
... and lots of other changes"
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (72 commits)
sched/dl: Use dl_bw_of() under rcu_read_lock_sched()
sched/fair: Delete resched_cpu() from idle_balance()
sched, time: Fix build error with 64 bit cputime_t on 32 bit systems
sched: Improve sysbench performance by fixing spurious active migration
sched/x86: Fix up typo in topology detection
x86, sched: Add new topology for multi-NUMA-node CPUs
sched/rt: Use resched_curr() in task_tick_rt()
sched: Use rq->rd in sched_setaffinity() under RCU read lock
sched: cleanup: Rename 'out_unlock' to 'out_free_new_mask'
sched: Use dl_bw_of() under RCU read lock
sched/fair: Remove duplicate code from can_migrate_task()
sched, mips, ia64: Remove __ARCH_WANT_UNLOCKED_CTXSW
sched: print_rq(): Don't use tasklist_lock
sched: normalize_rt_tasks(): Don't use _irqsave for tasklist_lock, use task_rq_lock()
sched: Fix the task-group check in tg_has_rt_tasks()
sched/fair: Leverage the idle state info when choosing the "idlest" cpu
sched: Let the scheduler see CPU idle states
sched/deadline: Fix inter- exclusive cpusets migrations
sched/deadline: Clear dl_entity params when setscheduling to different class
sched/numa: Kill the wrong/dead TASK_DEAD check in task_numa_fault()
...
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/exit.c | 47 |
| -rw-r--r-- | kernel/fork.c | 13 |
| -rw-r--r-- | kernel/sched/auto_group.c | 5 |
| -rw-r--r-- | kernel/sched/core.c | 295 |
| -rw-r--r-- | kernel/sched/cpudeadline.c | 4 |
| -rw-r--r-- | kernel/sched/cputime.c | 64 |
| -rw-r--r-- | kernel/sched/deadline.c | 33 |
| -rw-r--r-- | kernel/sched/debug.c | 13 |
| -rw-r--r-- | kernel/sched/fair.c | 479 |
| -rw-r--r-- | kernel/sched/idle.c | 6 |
| -rw-r--r-- | kernel/sched/rt.c | 21 |
| -rw-r--r-- | kernel/sched/sched.h | 80 |
| -rw-r--r-- | kernel/sched/stop_task.c | 2 |
| -rw-r--r-- | kernel/smp.c | 22 |
| -rw-r--r-- | kernel/sys.c | 2 |
| -rw-r--r-- | kernel/time/hrtimer.c | 1 |
| -rw-r--r-- | kernel/time/posix-cpu-timers.c | 14 |
| -rw-r--r-- | kernel/trace/ring_buffer_benchmark.c | 3 |
| -rw-r--r-- | kernel/trace/trace_stack.c | 4 |
19 files changed, 667 insertions, 441 deletions
diff --git a/kernel/exit.c b/kernel/exit.c
index d13f2eec4bb8..5d30019ff953 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
| @@ -115,32 +115,33 @@ static void __exit_signal(struct task_struct *tsk) | |||
| 115 | 115 | ||
| 116 | if (tsk == sig->curr_target) | 116 | if (tsk == sig->curr_target) |
| 117 | sig->curr_target = next_thread(tsk); | 117 | sig->curr_target = next_thread(tsk); |
| 118 | /* | ||
| 119 | * Accumulate here the counters for all threads but the | ||
| 120 | * group leader as they die, so they can be added into | ||
| 121 | * the process-wide totals when those are taken. | ||
| 122 | * The group leader stays around as a zombie as long | ||
| 123 | * as there are other threads. When it gets reaped, | ||
| 124 | * the exit.c code will add its counts into these totals. | ||
| 125 | * We won't ever get here for the group leader, since it | ||
| 126 | * will have been the last reference on the signal_struct. | ||
| 127 | */ | ||
| 128 | task_cputime(tsk, &utime, &stime); | ||
| 129 | sig->utime += utime; | ||
| 130 | sig->stime += stime; | ||
| 131 | sig->gtime += task_gtime(tsk); | ||
| 132 | sig->min_flt += tsk->min_flt; | ||
| 133 | sig->maj_flt += tsk->maj_flt; | ||
| 134 | sig->nvcsw += tsk->nvcsw; | ||
| 135 | sig->nivcsw += tsk->nivcsw; | ||
| 136 | sig->inblock += task_io_get_inblock(tsk); | ||
| 137 | sig->oublock += task_io_get_oublock(tsk); | ||
| 138 | task_io_accounting_add(&sig->ioac, &tsk->ioac); | ||
| 139 | sig->sum_sched_runtime += tsk->se.sum_exec_runtime; | ||
| 140 | } | 118 | } |
| 141 | 119 | ||
| 120 | /* | ||
| 121 | * Accumulate here the counters for all threads but the group leader | ||
| 122 | * as they die, so they can be added into the process-wide totals | ||
| 123 | * when those are taken. The group leader stays around as a zombie as | ||
| 124 | * long as there are other threads. When it gets reaped, the exit.c | ||
| 125 | * code will add its counts into these totals. We won't ever get here | ||
| 126 | * for the group leader, since it will have been the last reference on | ||
| 127 | * the signal_struct. | ||
| 128 | */ | ||
| 129 | task_cputime(tsk, &utime, &stime); | ||
| 130 | write_seqlock(&sig->stats_lock); | ||
| 131 | sig->utime += utime; | ||
| 132 | sig->stime += stime; | ||
| 133 | sig->gtime += task_gtime(tsk); | ||
| 134 | sig->min_flt += tsk->min_flt; | ||
| 135 | sig->maj_flt += tsk->maj_flt; | ||
| 136 | sig->nvcsw += tsk->nvcsw; | ||
| 137 | sig->nivcsw += tsk->nivcsw; | ||
| 138 | sig->inblock += task_io_get_inblock(tsk); | ||
| 139 | sig->oublock += task_io_get_oublock(tsk); | ||
| 140 | task_io_accounting_add(&sig->ioac, &tsk->ioac); | ||
| 141 | sig->sum_sched_runtime += tsk->se.sum_exec_runtime; | ||
| 142 | sig->nr_threads--; | 142 | sig->nr_threads--; |
| 143 | __unhash_process(tsk, group_dead); | 143 | __unhash_process(tsk, group_dead); |
| 144 | write_sequnlock(&sig->stats_lock); | ||
| 144 | 145 | ||
| 145 | /* | 146 | /* |
| 146 | * Do this under ->siglock, we can race with another thread | 147 | * Do this under ->siglock, we can race with another thread |
| @@ -1046,6 +1047,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
| 1046 | spin_lock_irq(&p->real_parent->sighand->siglock); | 1047 | spin_lock_irq(&p->real_parent->sighand->siglock); |
| 1047 | psig = p->real_parent->signal; | 1048 | psig = p->real_parent->signal; |
| 1048 | sig = p->signal; | 1049 | sig = p->signal; |
| 1050 | write_seqlock(&psig->stats_lock); | ||
| 1049 | psig->cutime += tgutime + sig->cutime; | 1051 | psig->cutime += tgutime + sig->cutime; |
| 1050 | psig->cstime += tgstime + sig->cstime; | 1052 | psig->cstime += tgstime + sig->cstime; |
| 1051 | psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; | 1053 | psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; |
| @@ -1068,6 +1070,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
| 1068 | psig->cmaxrss = maxrss; | 1070 | psig->cmaxrss = maxrss; |
| 1069 | task_io_accounting_add(&psig->ioac, &p->ioac); | 1071 | task_io_accounting_add(&psig->ioac, &p->ioac); |
| 1070 | task_io_accounting_add(&psig->ioac, &sig->ioac); | 1072 | task_io_accounting_add(&psig->ioac, &sig->ioac); |
| 1073 | write_sequnlock(&psig->stats_lock); | ||
| 1071 | spin_unlock_irq(&p->real_parent->sighand->siglock); | 1074 | spin_unlock_irq(&p->real_parent->sighand->siglock); |
| 1072 | } | 1075 | } |
| 1073 | 1076 | ||
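The __exit_signal() and wait_task_zombie() hunks above are part of the series that guards the process-wide resource statistics in signal_struct with the new stats_lock seqlock (initialized in the copy_signal() hunk in fork.c below, and read with a retry loop in the kernel/sched/cputime.c hunks further down). As a rough, self-contained sketch of that writer/reader pattern, with invented names and a reduced field set:

```c
/*
 * Hypothetical sketch of the seqlock pattern this series applies to
 * signal_struct::stats_lock; struct grp_stats and both helpers are
 * invented for illustration. The lock is assumed to have been
 * seqlock_init()'d at setup, as copy_signal() does below.
 */
#include <linux/seqlock.h>
#include <linux/types.h>

struct grp_stats {
	seqlock_t lock;			/* plays the role of sig->stats_lock */
	u64 utime, stime;		/* accumulated thread times */
};

static void grp_stats_add(struct grp_stats *gs, u64 ut, u64 st)
{
	write_seqlock(&gs->lock);	/* serialize writers, bump the sequence */
	gs->utime += ut;
	gs->stime += st;
	write_sequnlock(&gs->lock);
}

static void grp_stats_read(struct grp_stats *gs, u64 *ut, u64 *st)
{
	unsigned int seq;

	do {
		seq = read_seqbegin(&gs->lock);	/* lockless snapshot attempt */
		*ut = gs->utime;
		*st = gs->stime;
	} while (read_seqretry(&gs->lock, seq));	/* retry if a writer raced */
}
```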
diff --git a/kernel/fork.c b/kernel/fork.c
index 8c162d102740..9b7d746d6d62 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
| @@ -294,11 +294,18 @@ int __weak arch_dup_task_struct(struct task_struct *dst, | |||
| 294 | return 0; | 294 | return 0; |
| 295 | } | 295 | } |
| 296 | 296 | ||
| 297 | void set_task_stack_end_magic(struct task_struct *tsk) | ||
| 298 | { | ||
| 299 | unsigned long *stackend; | ||
| 300 | |||
| 301 | stackend = end_of_stack(tsk); | ||
| 302 | *stackend = STACK_END_MAGIC; /* for overflow detection */ | ||
| 303 | } | ||
| 304 | |||
| 297 | static struct task_struct *dup_task_struct(struct task_struct *orig) | 305 | static struct task_struct *dup_task_struct(struct task_struct *orig) |
| 298 | { | 306 | { |
| 299 | struct task_struct *tsk; | 307 | struct task_struct *tsk; |
| 300 | struct thread_info *ti; | 308 | struct thread_info *ti; |
| 301 | unsigned long *stackend; | ||
| 302 | int node = tsk_fork_get_node(orig); | 309 | int node = tsk_fork_get_node(orig); |
| 303 | int err; | 310 | int err; |
| 304 | 311 | ||
| @@ -328,8 +335,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
| 328 | setup_thread_stack(tsk, orig); | 335 | setup_thread_stack(tsk, orig); |
| 329 | clear_user_return_notifier(tsk); | 336 | clear_user_return_notifier(tsk); |
| 330 | clear_tsk_need_resched(tsk); | 337 | clear_tsk_need_resched(tsk); |
| 331 | stackend = end_of_stack(tsk); | 338 | set_task_stack_end_magic(tsk); |
| 332 | *stackend = STACK_END_MAGIC; /* for overflow detection */ | ||
| 333 | 339 | ||
| 334 | #ifdef CONFIG_CC_STACKPROTECTOR | 340 | #ifdef CONFIG_CC_STACKPROTECTOR |
| 335 | tsk->stack_canary = get_random_int(); | 341 | tsk->stack_canary = get_random_int(); |
| @@ -1067,6 +1073,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
| 1067 | sig->curr_target = tsk; | 1073 | sig->curr_target = tsk; |
| 1068 | init_sigpending(&sig->shared_pending); | 1074 | init_sigpending(&sig->shared_pending); |
| 1069 | INIT_LIST_HEAD(&sig->posix_timers); | 1075 | INIT_LIST_HEAD(&sig->posix_timers); |
| 1076 | seqlock_init(&sig->stats_lock); | ||
| 1070 | 1077 | ||
| 1071 | hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 1078 | hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
| 1072 | sig->real_timer.function = it_real_fn; | 1079 | sig->real_timer.function = it_real_fn; |
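The dup_task_struct() change factors the stack-end canary write out into set_task_stack_end_magic(), which the new CONFIG_SCHED_STACK_END_CHECK test in schedule_debug() (see the core.c hunk further down) can then verify on every schedule. A stand-alone, simplified model of the canary idea, not the kernel implementation:

```c
/*
 * Simplified userspace model of the stack-end canary behind
 * set_task_stack_end_magic()/task_stack_end_corrupted(); the helpers and
 * the demo in main() are illustrative only.
 */
#include <stdio.h>

#define STACK_END_MAGIC	0x57AC6E9DUL	/* same canary value the kernel uses */

static void stack_set_end_magic(unsigned long *stack_end)
{
	*stack_end = STACK_END_MAGIC;	/* plant the canary at the stack end */
}

static int stack_end_corrupted(const unsigned long *stack_end)
{
	return *stack_end != STACK_END_MAGIC;	/* canary overwritten? */
}

int main(void)
{
	unsigned long stack[64];	/* stands in for a task's kernel stack */

	stack_set_end_magic(&stack[0]);	/* lowest usable word == end of stack */
	printf("corrupted before overflow: %d\n", stack_end_corrupted(&stack[0]));

	stack[0] = 0;			/* simulate an overflow clobbering it */
	printf("corrupted after overflow:  %d\n", stack_end_corrupted(&stack[0]));
	return 0;
}
```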
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index e73efba98301..8a2e230fb86a 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
| @@ -148,11 +148,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) | |||
| 148 | if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) | 148 | if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) |
| 149 | goto out; | 149 | goto out; |
| 150 | 150 | ||
| 151 | t = p; | 151 | for_each_thread(p, t) |
| 152 | do { | ||
| 153 | sched_move_task(t); | 152 | sched_move_task(t); |
| 154 | } while_each_thread(p, t); | ||
| 155 | |||
| 156 | out: | 153 | out: |
| 157 | unlock_task_sighand(p, &flags); | 154 | unlock_task_sighand(p, &flags); |
| 158 | autogroup_kref_put(prev); | 155 | autogroup_kref_put(prev); |
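This is one instance of the thread-group iteration cleanup (Oleg Nesterov) called out in the pull request: open-coded do/while_each_thread() loops are converted to the for_each_thread()/for_each_process_thread() helpers throughout. A side-by-side sketch of the two idioms; visit() is a placeholder callback, not a kernel API, and locking (tasklist_lock, RCU or siglock) remains the caller's responsibility exactly as at the real call sites:

```c
/* Illustrative comparison only; visit() is a stand-in defined elsewhere. */
#include <linux/sched.h>

extern void visit(struct task_struct *t);

static void walk_group_old(struct task_struct *p)
{
	struct task_struct *t = p;

	do {				/* legacy open-coded loop */
		visit(t);
	} while_each_thread(p, t);
}

static void walk_group_new(struct task_struct *p)
{
	struct task_struct *t;

	for_each_thread(p, t)		/* preferred helper */
		visit(t);
}
```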
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f235c41a3532..44999505e1bf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
| @@ -317,9 +317,12 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) | |||
| 317 | for (;;) { | 317 | for (;;) { |
| 318 | rq = task_rq(p); | 318 | rq = task_rq(p); |
| 319 | raw_spin_lock(&rq->lock); | 319 | raw_spin_lock(&rq->lock); |
| 320 | if (likely(rq == task_rq(p))) | 320 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) |
| 321 | return rq; | 321 | return rq; |
| 322 | raw_spin_unlock(&rq->lock); | 322 | raw_spin_unlock(&rq->lock); |
| 323 | |||
| 324 | while (unlikely(task_on_rq_migrating(p))) | ||
| 325 | cpu_relax(); | ||
| 323 | } | 326 | } |
| 324 | } | 327 | } |
| 325 | 328 | ||
| @@ -336,10 +339,13 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | |||
| 336 | raw_spin_lock_irqsave(&p->pi_lock, *flags); | 339 | raw_spin_lock_irqsave(&p->pi_lock, *flags); |
| 337 | rq = task_rq(p); | 340 | rq = task_rq(p); |
| 338 | raw_spin_lock(&rq->lock); | 341 | raw_spin_lock(&rq->lock); |
| 339 | if (likely(rq == task_rq(p))) | 342 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) |
| 340 | return rq; | 343 | return rq; |
| 341 | raw_spin_unlock(&rq->lock); | 344 | raw_spin_unlock(&rq->lock); |
| 342 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | 345 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); |
| 346 | |||
| 347 | while (unlikely(task_on_rq_migrating(p))) | ||
| 348 | cpu_relax(); | ||
| 343 | } | 349 | } |
| 344 | } | 350 | } |
| 345 | 351 | ||
| @@ -433,7 +439,15 @@ static void __hrtick_start(void *arg) | |||
| 433 | void hrtick_start(struct rq *rq, u64 delay) | 439 | void hrtick_start(struct rq *rq, u64 delay) |
| 434 | { | 440 | { |
| 435 | struct hrtimer *timer = &rq->hrtick_timer; | 441 | struct hrtimer *timer = &rq->hrtick_timer; |
| 436 | ktime_t time = ktime_add_ns(timer->base->get_time(), delay); | 442 | ktime_t time; |
| 443 | s64 delta; | ||
| 444 | |||
| 445 | /* | ||
| 446 | * Don't schedule slices shorter than 10000ns, that just | ||
| 447 | * doesn't make sense and can cause timer DoS. | ||
| 448 | */ | ||
| 449 | delta = max_t(s64, delay, 10000LL); | ||
| 450 | time = ktime_add_ns(timer->base->get_time(), delta); | ||
| 437 | 451 | ||
| 438 | hrtimer_set_expires(timer, time); | 452 | hrtimer_set_expires(timer, time); |
| 439 | 453 | ||
| @@ -1027,7 +1041,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
| 1027 | * A queue event has occurred, and we're going to schedule. In | 1041 | * A queue event has occurred, and we're going to schedule. In |
| 1028 | * this case, we can save a useless back to back clock update. | 1042 | * this case, we can save a useless back to back clock update. |
| 1029 | */ | 1043 | */ |
| 1030 | if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) | 1044 | if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) |
| 1031 | rq->skip_clock_update = 1; | 1045 | rq->skip_clock_update = 1; |
| 1032 | } | 1046 | } |
| 1033 | 1047 | ||
| @@ -1072,7 +1086,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
| 1072 | 1086 | ||
| 1073 | static void __migrate_swap_task(struct task_struct *p, int cpu) | 1087 | static void __migrate_swap_task(struct task_struct *p, int cpu) |
| 1074 | { | 1088 | { |
| 1075 | if (p->on_rq) { | 1089 | if (task_on_rq_queued(p)) { |
| 1076 | struct rq *src_rq, *dst_rq; | 1090 | struct rq *src_rq, *dst_rq; |
| 1077 | 1091 | ||
| 1078 | src_rq = task_rq(p); | 1092 | src_rq = task_rq(p); |
| @@ -1198,7 +1212,7 @@ static int migration_cpu_stop(void *data); | |||
| 1198 | unsigned long wait_task_inactive(struct task_struct *p, long match_state) | 1212 | unsigned long wait_task_inactive(struct task_struct *p, long match_state) |
| 1199 | { | 1213 | { |
| 1200 | unsigned long flags; | 1214 | unsigned long flags; |
| 1201 | int running, on_rq; | 1215 | int running, queued; |
| 1202 | unsigned long ncsw; | 1216 | unsigned long ncsw; |
| 1203 | struct rq *rq; | 1217 | struct rq *rq; |
| 1204 | 1218 | ||
| @@ -1236,7 +1250,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
| 1236 | rq = task_rq_lock(p, &flags); | 1250 | rq = task_rq_lock(p, &flags); |
| 1237 | trace_sched_wait_task(p); | 1251 | trace_sched_wait_task(p); |
| 1238 | running = task_running(rq, p); | 1252 | running = task_running(rq, p); |
| 1239 | on_rq = p->on_rq; | 1253 | queued = task_on_rq_queued(p); |
| 1240 | ncsw = 0; | 1254 | ncsw = 0; |
| 1241 | if (!match_state || p->state == match_state) | 1255 | if (!match_state || p->state == match_state) |
| 1242 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ | 1256 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ |
| @@ -1268,7 +1282,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
| 1268 | * running right now), it's preempted, and we should | 1282 | * running right now), it's preempted, and we should |
| 1269 | * yield - it could be a while. | 1283 | * yield - it could be a while. |
| 1270 | */ | 1284 | */ |
| 1271 | if (unlikely(on_rq)) { | 1285 | if (unlikely(queued)) { |
| 1272 | ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); | 1286 | ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); |
| 1273 | 1287 | ||
| 1274 | set_current_state(TASK_UNINTERRUPTIBLE); | 1288 | set_current_state(TASK_UNINTERRUPTIBLE); |
| @@ -1462,7 +1476,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) | |||
| 1462 | static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) | 1476 | static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) |
| 1463 | { | 1477 | { |
| 1464 | activate_task(rq, p, en_flags); | 1478 | activate_task(rq, p, en_flags); |
| 1465 | p->on_rq = 1; | 1479 | p->on_rq = TASK_ON_RQ_QUEUED; |
| 1466 | 1480 | ||
| 1467 | /* if a worker is waking up, notify workqueue */ | 1481 | /* if a worker is waking up, notify workqueue */ |
| 1468 | if (p->flags & PF_WQ_WORKER) | 1482 | if (p->flags & PF_WQ_WORKER) |
| @@ -1521,7 +1535,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) | |||
| 1521 | int ret = 0; | 1535 | int ret = 0; |
| 1522 | 1536 | ||
| 1523 | rq = __task_rq_lock(p); | 1537 | rq = __task_rq_lock(p); |
| 1524 | if (p->on_rq) { | 1538 | if (task_on_rq_queued(p)) { |
| 1525 | /* check_preempt_curr() may use rq clock */ | 1539 | /* check_preempt_curr() may use rq clock */ |
| 1526 | update_rq_clock(rq); | 1540 | update_rq_clock(rq); |
| 1527 | ttwu_do_wakeup(rq, p, wake_flags); | 1541 | ttwu_do_wakeup(rq, p, wake_flags); |
| @@ -1604,6 +1618,25 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) | |||
| 1604 | } | 1618 | } |
| 1605 | } | 1619 | } |
| 1606 | 1620 | ||
| 1621 | void wake_up_if_idle(int cpu) | ||
| 1622 | { | ||
| 1623 | struct rq *rq = cpu_rq(cpu); | ||
| 1624 | unsigned long flags; | ||
| 1625 | |||
| 1626 | if (!is_idle_task(rq->curr)) | ||
| 1627 | return; | ||
| 1628 | |||
| 1629 | if (set_nr_if_polling(rq->idle)) { | ||
| 1630 | trace_sched_wake_idle_without_ipi(cpu); | ||
| 1631 | } else { | ||
| 1632 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
| 1633 | if (is_idle_task(rq->curr)) | ||
| 1634 | smp_send_reschedule(cpu); | ||
| 1635 | /* Else cpu is not in idle, do nothing here */ | ||
| 1636 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
| 1637 | } | ||
| 1638 | } | ||
| 1639 | |||
| 1607 | bool cpus_share_cache(int this_cpu, int that_cpu) | 1640 | bool cpus_share_cache(int this_cpu, int that_cpu) |
| 1608 | { | 1641 | { |
| 1609 | return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); | 1642 | return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); |
| @@ -1726,7 +1759,7 @@ static void try_to_wake_up_local(struct task_struct *p) | |||
| 1726 | if (!(p->state & TASK_NORMAL)) | 1759 | if (!(p->state & TASK_NORMAL)) |
| 1727 | goto out; | 1760 | goto out; |
| 1728 | 1761 | ||
| 1729 | if (!p->on_rq) | 1762 | if (!task_on_rq_queued(p)) |
| 1730 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | 1763 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); |
| 1731 | 1764 | ||
| 1732 | ttwu_do_wakeup(rq, p, 0); | 1765 | ttwu_do_wakeup(rq, p, 0); |
| @@ -1760,6 +1793,20 @@ int wake_up_state(struct task_struct *p, unsigned int state) | |||
| 1760 | } | 1793 | } |
| 1761 | 1794 | ||
| 1762 | /* | 1795 | /* |
| 1796 | * This function clears the sched_dl_entity static params. | ||
| 1797 | */ | ||
| 1798 | void __dl_clear_params(struct task_struct *p) | ||
| 1799 | { | ||
| 1800 | struct sched_dl_entity *dl_se = &p->dl; | ||
| 1801 | |||
| 1802 | dl_se->dl_runtime = 0; | ||
| 1803 | dl_se->dl_deadline = 0; | ||
| 1804 | dl_se->dl_period = 0; | ||
| 1805 | dl_se->flags = 0; | ||
| 1806 | dl_se->dl_bw = 0; | ||
| 1807 | } | ||
| 1808 | |||
| 1809 | /* | ||
| 1763 | * Perform scheduler related setup for a newly forked process p. | 1810 | * Perform scheduler related setup for a newly forked process p. |
| 1764 | * p is forked by current. | 1811 | * p is forked by current. |
| 1765 | * | 1812 | * |
| @@ -1783,10 +1830,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
| 1783 | 1830 | ||
| 1784 | RB_CLEAR_NODE(&p->dl.rb_node); | 1831 | RB_CLEAR_NODE(&p->dl.rb_node); |
| 1785 | hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 1832 | hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
| 1786 | p->dl.dl_runtime = p->dl.runtime = 0; | 1833 | __dl_clear_params(p); |
| 1787 | p->dl.dl_deadline = p->dl.deadline = 0; | ||
| 1788 | p->dl.dl_period = 0; | ||
| 1789 | p->dl.flags = 0; | ||
| 1790 | 1834 | ||
| 1791 | INIT_LIST_HEAD(&p->rt.run_list); | 1835 | INIT_LIST_HEAD(&p->rt.run_list); |
| 1792 | 1836 | ||
| @@ -1961,6 +2005,8 @@ unsigned long to_ratio(u64 period, u64 runtime) | |||
| 1961 | #ifdef CONFIG_SMP | 2005 | #ifdef CONFIG_SMP |
| 1962 | inline struct dl_bw *dl_bw_of(int i) | 2006 | inline struct dl_bw *dl_bw_of(int i) |
| 1963 | { | 2007 | { |
| 2008 | rcu_lockdep_assert(rcu_read_lock_sched_held(), | ||
| 2009 | "sched RCU must be held"); | ||
| 1964 | return &cpu_rq(i)->rd->dl_bw; | 2010 | return &cpu_rq(i)->rd->dl_bw; |
| 1965 | } | 2011 | } |
| 1966 | 2012 | ||
| @@ -1969,6 +2015,8 @@ static inline int dl_bw_cpus(int i) | |||
| 1969 | struct root_domain *rd = cpu_rq(i)->rd; | 2015 | struct root_domain *rd = cpu_rq(i)->rd; |
| 1970 | int cpus = 0; | 2016 | int cpus = 0; |
| 1971 | 2017 | ||
| 2018 | rcu_lockdep_assert(rcu_read_lock_sched_held(), | ||
| 2019 | "sched RCU must be held"); | ||
| 1972 | for_each_cpu_and(i, rd->span, cpu_active_mask) | 2020 | for_each_cpu_and(i, rd->span, cpu_active_mask) |
| 1973 | cpus++; | 2021 | cpus++; |
| 1974 | 2022 | ||
| @@ -2079,7 +2127,7 @@ void wake_up_new_task(struct task_struct *p) | |||
| 2079 | init_task_runnable_average(p); | 2127 | init_task_runnable_average(p); |
| 2080 | rq = __task_rq_lock(p); | 2128 | rq = __task_rq_lock(p); |
| 2081 | activate_task(rq, p, 0); | 2129 | activate_task(rq, p, 0); |
| 2082 | p->on_rq = 1; | 2130 | p->on_rq = TASK_ON_RQ_QUEUED; |
| 2083 | trace_sched_wakeup_new(p, true); | 2131 | trace_sched_wakeup_new(p, true); |
| 2084 | check_preempt_curr(rq, p, WF_FORK); | 2132 | check_preempt_curr(rq, p, WF_FORK); |
| 2085 | #ifdef CONFIG_SMP | 2133 | #ifdef CONFIG_SMP |
| @@ -2271,10 +2319,6 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) | |||
| 2271 | */ | 2319 | */ |
| 2272 | post_schedule(rq); | 2320 | post_schedule(rq); |
| 2273 | 2321 | ||
| 2274 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | ||
| 2275 | /* In this case, finish_task_switch does not reenable preemption */ | ||
| 2276 | preempt_enable(); | ||
| 2277 | #endif | ||
| 2278 | if (current->set_child_tid) | 2322 | if (current->set_child_tid) |
| 2279 | put_user(task_pid_vnr(current), current->set_child_tid); | 2323 | put_user(task_pid_vnr(current), current->set_child_tid); |
| 2280 | } | 2324 | } |
| @@ -2317,9 +2361,7 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
| 2317 | * of the scheduler it's an obvious special-case), so we | 2361 | * of the scheduler it's an obvious special-case), so we |
| 2318 | * do an early lockdep release here: | 2362 | * do an early lockdep release here: |
| 2319 | */ | 2363 | */ |
| 2320 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
| 2321 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | 2364 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); |
| 2322 | #endif | ||
| 2323 | 2365 | ||
| 2324 | context_tracking_task_switch(prev, next); | 2366 | context_tracking_task_switch(prev, next); |
| 2325 | /* Here we just switch the register state and the stack. */ | 2367 | /* Here we just switch the register state and the stack. */ |
| @@ -2447,7 +2489,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) | |||
| 2447 | * project cycles that may never be accounted to this | 2489 | * project cycles that may never be accounted to this |
| 2448 | * thread, breaking clock_gettime(). | 2490 | * thread, breaking clock_gettime(). |
| 2449 | */ | 2491 | */ |
| 2450 | if (task_current(rq, p) && p->on_rq) { | 2492 | if (task_current(rq, p) && task_on_rq_queued(p)) { |
| 2451 | update_rq_clock(rq); | 2493 | update_rq_clock(rq); |
| 2452 | ns = rq_clock_task(rq) - p->se.exec_start; | 2494 | ns = rq_clock_task(rq) - p->se.exec_start; |
| 2453 | if ((s64)ns < 0) | 2495 | if ((s64)ns < 0) |
| @@ -2493,7 +2535,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
| 2493 | * If we see ->on_cpu without ->on_rq, the task is leaving, and has | 2535 | * If we see ->on_cpu without ->on_rq, the task is leaving, and has |
| 2494 | * been accounted, so we're correct here as well. | 2536 | * been accounted, so we're correct here as well. |
| 2495 | */ | 2537 | */ |
| 2496 | if (!p->on_cpu || !p->on_rq) | 2538 | if (!p->on_cpu || !task_on_rq_queued(p)) |
| 2497 | return p->se.sum_exec_runtime; | 2539 | return p->se.sum_exec_runtime; |
| 2498 | #endif | 2540 | #endif |
| 2499 | 2541 | ||
| @@ -2656,6 +2698,9 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
| 2656 | */ | 2698 | */ |
| 2657 | static inline void schedule_debug(struct task_struct *prev) | 2699 | static inline void schedule_debug(struct task_struct *prev) |
| 2658 | { | 2700 | { |
| 2701 | #ifdef CONFIG_SCHED_STACK_END_CHECK | ||
| 2702 | BUG_ON(unlikely(task_stack_end_corrupted(prev))); | ||
| 2703 | #endif | ||
| 2659 | /* | 2704 | /* |
| 2660 | * Test if we are atomic. Since do_exit() needs to call into | 2705 | * Test if we are atomic. Since do_exit() needs to call into |
| 2661 | * schedule() atomically, we ignore that path. Otherwise whine | 2706 | * schedule() atomically, we ignore that path. Otherwise whine |
| @@ -2797,7 +2842,7 @@ need_resched: | |||
| 2797 | switch_count = &prev->nvcsw; | 2842 | switch_count = &prev->nvcsw; |
| 2798 | } | 2843 | } |
| 2799 | 2844 | ||
| 2800 | if (prev->on_rq || rq->skip_clock_update < 0) | 2845 | if (task_on_rq_queued(prev) || rq->skip_clock_update < 0) |
| 2801 | update_rq_clock(rq); | 2846 | update_rq_clock(rq); |
| 2802 | 2847 | ||
| 2803 | next = pick_next_task(rq, prev); | 2848 | next = pick_next_task(rq, prev); |
| @@ -2962,7 +3007,7 @@ EXPORT_SYMBOL(default_wake_function); | |||
| 2962 | */ | 3007 | */ |
| 2963 | void rt_mutex_setprio(struct task_struct *p, int prio) | 3008 | void rt_mutex_setprio(struct task_struct *p, int prio) |
| 2964 | { | 3009 | { |
| 2965 | int oldprio, on_rq, running, enqueue_flag = 0; | 3010 | int oldprio, queued, running, enqueue_flag = 0; |
| 2966 | struct rq *rq; | 3011 | struct rq *rq; |
| 2967 | const struct sched_class *prev_class; | 3012 | const struct sched_class *prev_class; |
| 2968 | 3013 | ||
| @@ -2991,12 +3036,12 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 2991 | trace_sched_pi_setprio(p, prio); | 3036 | trace_sched_pi_setprio(p, prio); |
| 2992 | oldprio = p->prio; | 3037 | oldprio = p->prio; |
| 2993 | prev_class = p->sched_class; | 3038 | prev_class = p->sched_class; |
| 2994 | on_rq = p->on_rq; | 3039 | queued = task_on_rq_queued(p); |
| 2995 | running = task_current(rq, p); | 3040 | running = task_current(rq, p); |
| 2996 | if (on_rq) | 3041 | if (queued) |
| 2997 | dequeue_task(rq, p, 0); | 3042 | dequeue_task(rq, p, 0); |
| 2998 | if (running) | 3043 | if (running) |
| 2999 | p->sched_class->put_prev_task(rq, p); | 3044 | put_prev_task(rq, p); |
| 3000 | 3045 | ||
| 3001 | /* | 3046 | /* |
| 3002 | * Boosting condition are: | 3047 | * Boosting condition are: |
| @@ -3033,7 +3078,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 3033 | 3078 | ||
| 3034 | if (running) | 3079 | if (running) |
| 3035 | p->sched_class->set_curr_task(rq); | 3080 | p->sched_class->set_curr_task(rq); |
| 3036 | if (on_rq) | 3081 | if (queued) |
| 3037 | enqueue_task(rq, p, enqueue_flag); | 3082 | enqueue_task(rq, p, enqueue_flag); |
| 3038 | 3083 | ||
| 3039 | check_class_changed(rq, p, prev_class, oldprio); | 3084 | check_class_changed(rq, p, prev_class, oldprio); |
| @@ -3044,7 +3089,7 @@ out_unlock: | |||
| 3044 | 3089 | ||
| 3045 | void set_user_nice(struct task_struct *p, long nice) | 3090 | void set_user_nice(struct task_struct *p, long nice) |
| 3046 | { | 3091 | { |
| 3047 | int old_prio, delta, on_rq; | 3092 | int old_prio, delta, queued; |
| 3048 | unsigned long flags; | 3093 | unsigned long flags; |
| 3049 | struct rq *rq; | 3094 | struct rq *rq; |
| 3050 | 3095 | ||
| @@ -3065,8 +3110,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
| 3065 | p->static_prio = NICE_TO_PRIO(nice); | 3110 | p->static_prio = NICE_TO_PRIO(nice); |
| 3066 | goto out_unlock; | 3111 | goto out_unlock; |
| 3067 | } | 3112 | } |
| 3068 | on_rq = p->on_rq; | 3113 | queued = task_on_rq_queued(p); |
| 3069 | if (on_rq) | 3114 | if (queued) |
| 3070 | dequeue_task(rq, p, 0); | 3115 | dequeue_task(rq, p, 0); |
| 3071 | 3116 | ||
| 3072 | p->static_prio = NICE_TO_PRIO(nice); | 3117 | p->static_prio = NICE_TO_PRIO(nice); |
| @@ -3075,7 +3120,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
| 3075 | p->prio = effective_prio(p); | 3120 | p->prio = effective_prio(p); |
| 3076 | delta = p->prio - old_prio; | 3121 | delta = p->prio - old_prio; |
| 3077 | 3122 | ||
| 3078 | if (on_rq) { | 3123 | if (queued) { |
| 3079 | enqueue_task(rq, p, 0); | 3124 | enqueue_task(rq, p, 0); |
| 3080 | /* | 3125 | /* |
| 3081 | * If the task increased its priority or is running and | 3126 | * If the task increased its priority or is running and |
| @@ -3347,7 +3392,7 @@ static int __sched_setscheduler(struct task_struct *p, | |||
| 3347 | { | 3392 | { |
| 3348 | int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : | 3393 | int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : |
| 3349 | MAX_RT_PRIO - 1 - attr->sched_priority; | 3394 | MAX_RT_PRIO - 1 - attr->sched_priority; |
| 3350 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 3395 | int retval, oldprio, oldpolicy = -1, queued, running; |
| 3351 | int policy = attr->sched_policy; | 3396 | int policy = attr->sched_policy; |
| 3352 | unsigned long flags; | 3397 | unsigned long flags; |
| 3353 | const struct sched_class *prev_class; | 3398 | const struct sched_class *prev_class; |
| @@ -3544,19 +3589,19 @@ change: | |||
| 3544 | return 0; | 3589 | return 0; |
| 3545 | } | 3590 | } |
| 3546 | 3591 | ||
| 3547 | on_rq = p->on_rq; | 3592 | queued = task_on_rq_queued(p); |
| 3548 | running = task_current(rq, p); | 3593 | running = task_current(rq, p); |
| 3549 | if (on_rq) | 3594 | if (queued) |
| 3550 | dequeue_task(rq, p, 0); | 3595 | dequeue_task(rq, p, 0); |
| 3551 | if (running) | 3596 | if (running) |
| 3552 | p->sched_class->put_prev_task(rq, p); | 3597 | put_prev_task(rq, p); |
| 3553 | 3598 | ||
| 3554 | prev_class = p->sched_class; | 3599 | prev_class = p->sched_class; |
| 3555 | __setscheduler(rq, p, attr); | 3600 | __setscheduler(rq, p, attr); |
| 3556 | 3601 | ||
| 3557 | if (running) | 3602 | if (running) |
| 3558 | p->sched_class->set_curr_task(rq); | 3603 | p->sched_class->set_curr_task(rq); |
| 3559 | if (on_rq) { | 3604 | if (queued) { |
| 3560 | /* | 3605 | /* |
| 3561 | * We enqueue to tail when the priority of a task is | 3606 | * We enqueue to tail when the priority of a task is |
| 3562 | * increased (user space view). | 3607 | * increased (user space view). |
| @@ -3980,14 +4025,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
| 3980 | rcu_read_lock(); | 4025 | rcu_read_lock(); |
| 3981 | if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { | 4026 | if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { |
| 3982 | rcu_read_unlock(); | 4027 | rcu_read_unlock(); |
| 3983 | goto out_unlock; | 4028 | goto out_free_new_mask; |
| 3984 | } | 4029 | } |
| 3985 | rcu_read_unlock(); | 4030 | rcu_read_unlock(); |
| 3986 | } | 4031 | } |
| 3987 | 4032 | ||
| 3988 | retval = security_task_setscheduler(p); | 4033 | retval = security_task_setscheduler(p); |
| 3989 | if (retval) | 4034 | if (retval) |
| 3990 | goto out_unlock; | 4035 | goto out_free_new_mask; |
| 3991 | 4036 | ||
| 3992 | 4037 | ||
| 3993 | cpuset_cpus_allowed(p, cpus_allowed); | 4038 | cpuset_cpus_allowed(p, cpus_allowed); |
| @@ -4000,13 +4045,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
| 4000 | * root_domain. | 4045 | * root_domain. |
| 4001 | */ | 4046 | */ |
| 4002 | #ifdef CONFIG_SMP | 4047 | #ifdef CONFIG_SMP |
| 4003 | if (task_has_dl_policy(p)) { | 4048 | if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { |
| 4004 | const struct cpumask *span = task_rq(p)->rd->span; | 4049 | rcu_read_lock(); |
| 4005 | 4050 | if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { | |
| 4006 | if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { | ||
| 4007 | retval = -EBUSY; | 4051 | retval = -EBUSY; |
| 4008 | goto out_unlock; | 4052 | rcu_read_unlock(); |
| 4053 | goto out_free_new_mask; | ||
| 4009 | } | 4054 | } |
| 4055 | rcu_read_unlock(); | ||
| 4010 | } | 4056 | } |
| 4011 | #endif | 4057 | #endif |
| 4012 | again: | 4058 | again: |
| @@ -4024,7 +4070,7 @@ again: | |||
| 4024 | goto again; | 4070 | goto again; |
| 4025 | } | 4071 | } |
| 4026 | } | 4072 | } |
| 4027 | out_unlock: | 4073 | out_free_new_mask: |
| 4028 | free_cpumask_var(new_mask); | 4074 | free_cpumask_var(new_mask); |
| 4029 | out_free_cpus_allowed: | 4075 | out_free_cpus_allowed: |
| 4030 | free_cpumask_var(cpus_allowed); | 4076 | free_cpumask_var(cpus_allowed); |
| @@ -4508,7 +4554,7 @@ void show_state_filter(unsigned long state_filter) | |||
| 4508 | " task PC stack pid father\n"); | 4554 | " task PC stack pid father\n"); |
| 4509 | #endif | 4555 | #endif |
| 4510 | rcu_read_lock(); | 4556 | rcu_read_lock(); |
| 4511 | do_each_thread(g, p) { | 4557 | for_each_process_thread(g, p) { |
| 4512 | /* | 4558 | /* |
| 4513 | * reset the NMI-timeout, listing all files on a slow | 4559 | * reset the NMI-timeout, listing all files on a slow |
| 4514 | * console might take a lot of time: | 4560 | * console might take a lot of time: |
| @@ -4516,7 +4562,7 @@ void show_state_filter(unsigned long state_filter) | |||
| 4516 | touch_nmi_watchdog(); | 4562 | touch_nmi_watchdog(); |
| 4517 | if (!state_filter || (p->state & state_filter)) | 4563 | if (!state_filter || (p->state & state_filter)) |
| 4518 | sched_show_task(p); | 4564 | sched_show_task(p); |
| 4519 | } while_each_thread(g, p); | 4565 | } |
| 4520 | 4566 | ||
| 4521 | touch_all_softlockup_watchdogs(); | 4567 | touch_all_softlockup_watchdogs(); |
| 4522 | 4568 | ||
| @@ -4571,7 +4617,7 @@ void init_idle(struct task_struct *idle, int cpu) | |||
| 4571 | rcu_read_unlock(); | 4617 | rcu_read_unlock(); |
| 4572 | 4618 | ||
| 4573 | rq->curr = rq->idle = idle; | 4619 | rq->curr = rq->idle = idle; |
| 4574 | idle->on_rq = 1; | 4620 | idle->on_rq = TASK_ON_RQ_QUEUED; |
| 4575 | #if defined(CONFIG_SMP) | 4621 | #if defined(CONFIG_SMP) |
| 4576 | idle->on_cpu = 1; | 4622 | idle->on_cpu = 1; |
| 4577 | #endif | 4623 | #endif |
| @@ -4592,6 +4638,33 @@ void init_idle(struct task_struct *idle, int cpu) | |||
| 4592 | } | 4638 | } |
| 4593 | 4639 | ||
| 4594 | #ifdef CONFIG_SMP | 4640 | #ifdef CONFIG_SMP |
| 4641 | /* | ||
| 4642 | * move_queued_task - move a queued task to new rq. | ||
| 4643 | * | ||
| 4644 | * Returns (locked) new rq. Old rq's lock is released. | ||
| 4645 | */ | ||
| 4646 | static struct rq *move_queued_task(struct task_struct *p, int new_cpu) | ||
| 4647 | { | ||
| 4648 | struct rq *rq = task_rq(p); | ||
| 4649 | |||
| 4650 | lockdep_assert_held(&rq->lock); | ||
| 4651 | |||
| 4652 | dequeue_task(rq, p, 0); | ||
| 4653 | p->on_rq = TASK_ON_RQ_MIGRATING; | ||
| 4654 | set_task_cpu(p, new_cpu); | ||
| 4655 | raw_spin_unlock(&rq->lock); | ||
| 4656 | |||
| 4657 | rq = cpu_rq(new_cpu); | ||
| 4658 | |||
| 4659 | raw_spin_lock(&rq->lock); | ||
| 4660 | BUG_ON(task_cpu(p) != new_cpu); | ||
| 4661 | p->on_rq = TASK_ON_RQ_QUEUED; | ||
| 4662 | enqueue_task(rq, p, 0); | ||
| 4663 | check_preempt_curr(rq, p, 0); | ||
| 4664 | |||
| 4665 | return rq; | ||
| 4666 | } | ||
| 4667 | |||
| 4595 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | 4668 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) |
| 4596 | { | 4669 | { |
| 4597 | if (p->sched_class && p->sched_class->set_cpus_allowed) | 4670 | if (p->sched_class && p->sched_class->set_cpus_allowed) |
| @@ -4648,14 +4721,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
| 4648 | goto out; | 4721 | goto out; |
| 4649 | 4722 | ||
| 4650 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); | 4723 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); |
| 4651 | if (p->on_rq) { | 4724 | if (task_running(rq, p) || p->state == TASK_WAKING) { |
| 4652 | struct migration_arg arg = { p, dest_cpu }; | 4725 | struct migration_arg arg = { p, dest_cpu }; |
| 4653 | /* Need help from migration thread: drop lock and wait. */ | 4726 | /* Need help from migration thread: drop lock and wait. */ |
| 4654 | task_rq_unlock(rq, p, &flags); | 4727 | task_rq_unlock(rq, p, &flags); |
| 4655 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); | 4728 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); |
| 4656 | tlb_migrate_finish(p->mm); | 4729 | tlb_migrate_finish(p->mm); |
| 4657 | return 0; | 4730 | return 0; |
| 4658 | } | 4731 | } else if (task_on_rq_queued(p)) |
| 4732 | rq = move_queued_task(p, dest_cpu); | ||
| 4659 | out: | 4733 | out: |
| 4660 | task_rq_unlock(rq, p, &flags); | 4734 | task_rq_unlock(rq, p, &flags); |
| 4661 | 4735 | ||
| @@ -4676,20 +4750,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); | |||
| 4676 | */ | 4750 | */ |
| 4677 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | 4751 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) |
| 4678 | { | 4752 | { |
| 4679 | struct rq *rq_dest, *rq_src; | 4753 | struct rq *rq; |
| 4680 | int ret = 0; | 4754 | int ret = 0; |
| 4681 | 4755 | ||
| 4682 | if (unlikely(!cpu_active(dest_cpu))) | 4756 | if (unlikely(!cpu_active(dest_cpu))) |
| 4683 | return ret; | 4757 | return ret; |
| 4684 | 4758 | ||
| 4685 | rq_src = cpu_rq(src_cpu); | 4759 | rq = cpu_rq(src_cpu); |
| 4686 | rq_dest = cpu_rq(dest_cpu); | ||
| 4687 | 4760 | ||
| 4688 | raw_spin_lock(&p->pi_lock); | 4761 | raw_spin_lock(&p->pi_lock); |
| 4689 | double_rq_lock(rq_src, rq_dest); | 4762 | raw_spin_lock(&rq->lock); |
| 4690 | /* Already moved. */ | 4763 | /* Already moved. */ |
| 4691 | if (task_cpu(p) != src_cpu) | 4764 | if (task_cpu(p) != src_cpu) |
| 4692 | goto done; | 4765 | goto done; |
| 4766 | |||
| 4693 | /* Affinity changed (again). */ | 4767 | /* Affinity changed (again). */ |
| 4694 | if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) | 4768 | if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) |
| 4695 | goto fail; | 4769 | goto fail; |
| @@ -4698,16 +4772,12 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
| 4698 | * If we're not on a rq, the next wake-up will ensure we're | 4772 | * If we're not on a rq, the next wake-up will ensure we're |
| 4699 | * placed properly. | 4773 | * placed properly. |
| 4700 | */ | 4774 | */ |
| 4701 | if (p->on_rq) { | 4775 | if (task_on_rq_queued(p)) |
| 4702 | dequeue_task(rq_src, p, 0); | 4776 | rq = move_queued_task(p, dest_cpu); |
| 4703 | set_task_cpu(p, dest_cpu); | ||
| 4704 | enqueue_task(rq_dest, p, 0); | ||
| 4705 | check_preempt_curr(rq_dest, p, 0); | ||
| 4706 | } | ||
| 4707 | done: | 4777 | done: |
| 4708 | ret = 1; | 4778 | ret = 1; |
| 4709 | fail: | 4779 | fail: |
| 4710 | double_rq_unlock(rq_src, rq_dest); | 4780 | raw_spin_unlock(&rq->lock); |
| 4711 | raw_spin_unlock(&p->pi_lock); | 4781 | raw_spin_unlock(&p->pi_lock); |
| 4712 | return ret; | 4782 | return ret; |
| 4713 | } | 4783 | } |
| @@ -4739,22 +4809,22 @@ void sched_setnuma(struct task_struct *p, int nid) | |||
| 4739 | { | 4809 | { |
| 4740 | struct rq *rq; | 4810 | struct rq *rq; |
| 4741 | unsigned long flags; | 4811 | unsigned long flags; |
| 4742 | bool on_rq, running; | 4812 | bool queued, running; |
| 4743 | 4813 | ||
| 4744 | rq = task_rq_lock(p, &flags); | 4814 | rq = task_rq_lock(p, &flags); |
| 4745 | on_rq = p->on_rq; | 4815 | queued = task_on_rq_queued(p); |
| 4746 | running = task_current(rq, p); | 4816 | running = task_current(rq, p); |
| 4747 | 4817 | ||
| 4748 | if (on_rq) | 4818 | if (queued) |
| 4749 | dequeue_task(rq, p, 0); | 4819 | dequeue_task(rq, p, 0); |
| 4750 | if (running) | 4820 | if (running) |
| 4751 | p->sched_class->put_prev_task(rq, p); | 4821 | put_prev_task(rq, p); |
| 4752 | 4822 | ||
| 4753 | p->numa_preferred_nid = nid; | 4823 | p->numa_preferred_nid = nid; |
| 4754 | 4824 | ||
| 4755 | if (running) | 4825 | if (running) |
| 4756 | p->sched_class->set_curr_task(rq); | 4826 | p->sched_class->set_curr_task(rq); |
| 4757 | if (on_rq) | 4827 | if (queued) |
| 4758 | enqueue_task(rq, p, 0); | 4828 | enqueue_task(rq, p, 0); |
| 4759 | task_rq_unlock(rq, p, &flags); | 4829 | task_rq_unlock(rq, p, &flags); |
| 4760 | } | 4830 | } |
| @@ -4774,6 +4844,12 @@ static int migration_cpu_stop(void *data) | |||
| 4774 | * be on another cpu but it doesn't matter. | 4844 | * be on another cpu but it doesn't matter. |
| 4775 | */ | 4845 | */ |
| 4776 | local_irq_disable(); | 4846 | local_irq_disable(); |
| 4847 | /* | ||
| 4848 | * We need to explicitly wake pending tasks before running | ||
| 4849 | * __migrate_task() such that we will not miss enforcing cpus_allowed | ||
| 4850 | * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. | ||
| 4851 | */ | ||
| 4852 | sched_ttwu_pending(); | ||
| 4777 | __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); | 4853 | __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); |
| 4778 | local_irq_enable(); | 4854 | local_irq_enable(); |
| 4779 | return 0; | 4855 | return 0; |
| @@ -5184,6 +5260,7 @@ static int sched_cpu_inactive(struct notifier_block *nfb, | |||
| 5184 | { | 5260 | { |
| 5185 | unsigned long flags; | 5261 | unsigned long flags; |
| 5186 | long cpu = (long)hcpu; | 5262 | long cpu = (long)hcpu; |
| 5263 | struct dl_bw *dl_b; | ||
| 5187 | 5264 | ||
| 5188 | switch (action & ~CPU_TASKS_FROZEN) { | 5265 | switch (action & ~CPU_TASKS_FROZEN) { |
| 5189 | case CPU_DOWN_PREPARE: | 5266 | case CPU_DOWN_PREPARE: |
| @@ -5191,15 +5268,19 @@ static int sched_cpu_inactive(struct notifier_block *nfb, | |||
| 5191 | 5268 | ||
| 5192 | /* explicitly allow suspend */ | 5269 | /* explicitly allow suspend */ |
| 5193 | if (!(action & CPU_TASKS_FROZEN)) { | 5270 | if (!(action & CPU_TASKS_FROZEN)) { |
| 5194 | struct dl_bw *dl_b = dl_bw_of(cpu); | ||
| 5195 | bool overflow; | 5271 | bool overflow; |
| 5196 | int cpus; | 5272 | int cpus; |
| 5197 | 5273 | ||
| 5274 | rcu_read_lock_sched(); | ||
| 5275 | dl_b = dl_bw_of(cpu); | ||
| 5276 | |||
| 5198 | raw_spin_lock_irqsave(&dl_b->lock, flags); | 5277 | raw_spin_lock_irqsave(&dl_b->lock, flags); |
| 5199 | cpus = dl_bw_cpus(cpu); | 5278 | cpus = dl_bw_cpus(cpu); |
| 5200 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | 5279 | overflow = __dl_overflow(dl_b, cpus, 0, 0); |
| 5201 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | 5280 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); |
| 5202 | 5281 | ||
| 5282 | rcu_read_unlock_sched(); | ||
| 5283 | |||
| 5203 | if (overflow) | 5284 | if (overflow) |
| 5204 | return notifier_from_errno(-EBUSY); | 5285 | return notifier_from_errno(-EBUSY); |
| 5205 | } | 5286 | } |
| @@ -5742,7 +5823,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
| 5742 | const struct cpumask *span = sched_domain_span(sd); | 5823 | const struct cpumask *span = sched_domain_span(sd); |
| 5743 | struct cpumask *covered = sched_domains_tmpmask; | 5824 | struct cpumask *covered = sched_domains_tmpmask; |
| 5744 | struct sd_data *sdd = sd->private; | 5825 | struct sd_data *sdd = sd->private; |
| 5745 | struct sched_domain *child; | 5826 | struct sched_domain *sibling; |
| 5746 | int i; | 5827 | int i; |
| 5747 | 5828 | ||
| 5748 | cpumask_clear(covered); | 5829 | cpumask_clear(covered); |
| @@ -5753,10 +5834,10 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
| 5753 | if (cpumask_test_cpu(i, covered)) | 5834 | if (cpumask_test_cpu(i, covered)) |
| 5754 | continue; | 5835 | continue; |
| 5755 | 5836 | ||
| 5756 | child = *per_cpu_ptr(sdd->sd, i); | 5837 | sibling = *per_cpu_ptr(sdd->sd, i); |
| 5757 | 5838 | ||
| 5758 | /* See the comment near build_group_mask(). */ | 5839 | /* See the comment near build_group_mask(). */ |
| 5759 | if (!cpumask_test_cpu(i, sched_domain_span(child))) | 5840 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) |
| 5760 | continue; | 5841 | continue; |
| 5761 | 5842 | ||
| 5762 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | 5843 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), |
| @@ -5766,10 +5847,9 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
| 5766 | goto fail; | 5847 | goto fail; |
| 5767 | 5848 | ||
| 5768 | sg_span = sched_group_cpus(sg); | 5849 | sg_span = sched_group_cpus(sg); |
| 5769 | if (child->child) { | 5850 | if (sibling->child) |
| 5770 | child = child->child; | 5851 | cpumask_copy(sg_span, sched_domain_span(sibling->child)); |
| 5771 | cpumask_copy(sg_span, sched_domain_span(child)); | 5852 | else |
| 5772 | } else | ||
| 5773 | cpumask_set_cpu(i, sg_span); | 5853 | cpumask_set_cpu(i, sg_span); |
| 5774 | 5854 | ||
| 5775 | cpumask_or(covered, covered, sg_span); | 5855 | cpumask_or(covered, covered, sg_span); |
| @@ -7120,13 +7200,13 @@ static void normalize_task(struct rq *rq, struct task_struct *p) | |||
| 7120 | .sched_policy = SCHED_NORMAL, | 7200 | .sched_policy = SCHED_NORMAL, |
| 7121 | }; | 7201 | }; |
| 7122 | int old_prio = p->prio; | 7202 | int old_prio = p->prio; |
| 7123 | int on_rq; | 7203 | int queued; |
| 7124 | 7204 | ||
| 7125 | on_rq = p->on_rq; | 7205 | queued = task_on_rq_queued(p); |
| 7126 | if (on_rq) | 7206 | if (queued) |
| 7127 | dequeue_task(rq, p, 0); | 7207 | dequeue_task(rq, p, 0); |
| 7128 | __setscheduler(rq, p, &attr); | 7208 | __setscheduler(rq, p, &attr); |
| 7129 | if (on_rq) { | 7209 | if (queued) { |
| 7130 | enqueue_task(rq, p, 0); | 7210 | enqueue_task(rq, p, 0); |
| 7131 | resched_curr(rq); | 7211 | resched_curr(rq); |
| 7132 | } | 7212 | } |
| @@ -7140,12 +7220,12 @@ void normalize_rt_tasks(void) | |||
| 7140 | unsigned long flags; | 7220 | unsigned long flags; |
| 7141 | struct rq *rq; | 7221 | struct rq *rq; |
| 7142 | 7222 | ||
| 7143 | read_lock_irqsave(&tasklist_lock, flags); | 7223 | read_lock(&tasklist_lock); |
| 7144 | do_each_thread(g, p) { | 7224 | for_each_process_thread(g, p) { |
| 7145 | /* | 7225 | /* |
| 7146 | * Only normalize user tasks: | 7226 | * Only normalize user tasks: |
| 7147 | */ | 7227 | */ |
| 7148 | if (!p->mm) | 7228 | if (p->flags & PF_KTHREAD) |
| 7149 | continue; | 7229 | continue; |
| 7150 | 7230 | ||
| 7151 | p->se.exec_start = 0; | 7231 | p->se.exec_start = 0; |
| @@ -7160,21 +7240,16 @@ void normalize_rt_tasks(void) | |||
| 7160 | * Renice negative nice level userspace | 7240 | * Renice negative nice level userspace |
| 7161 | * tasks back to 0: | 7241 | * tasks back to 0: |
| 7162 | */ | 7242 | */ |
| 7163 | if (task_nice(p) < 0 && p->mm) | 7243 | if (task_nice(p) < 0) |
| 7164 | set_user_nice(p, 0); | 7244 | set_user_nice(p, 0); |
| 7165 | continue; | 7245 | continue; |
| 7166 | } | 7246 | } |
| 7167 | 7247 | ||
| 7168 | raw_spin_lock(&p->pi_lock); | 7248 | rq = task_rq_lock(p, &flags); |
| 7169 | rq = __task_rq_lock(p); | ||
| 7170 | |||
| 7171 | normalize_task(rq, p); | 7249 | normalize_task(rq, p); |
| 7172 | 7250 | task_rq_unlock(rq, p, &flags); | |
| 7173 | __task_rq_unlock(rq); | 7251 | } |
| 7174 | raw_spin_unlock(&p->pi_lock); | 7252 | read_unlock(&tasklist_lock); |
| 7175 | } while_each_thread(g, p); | ||
| 7176 | |||
| 7177 | read_unlock_irqrestore(&tasklist_lock, flags); | ||
| 7178 | } | 7253 | } |
| 7179 | 7254 | ||
| 7180 | #endif /* CONFIG_MAGIC_SYSRQ */ | 7255 | #endif /* CONFIG_MAGIC_SYSRQ */ |
| @@ -7314,19 +7389,19 @@ void sched_offline_group(struct task_group *tg) | |||
| 7314 | void sched_move_task(struct task_struct *tsk) | 7389 | void sched_move_task(struct task_struct *tsk) |
| 7315 | { | 7390 | { |
| 7316 | struct task_group *tg; | 7391 | struct task_group *tg; |
| 7317 | int on_rq, running; | 7392 | int queued, running; |
| 7318 | unsigned long flags; | 7393 | unsigned long flags; |
| 7319 | struct rq *rq; | 7394 | struct rq *rq; |
| 7320 | 7395 | ||
| 7321 | rq = task_rq_lock(tsk, &flags); | 7396 | rq = task_rq_lock(tsk, &flags); |
| 7322 | 7397 | ||
| 7323 | running = task_current(rq, tsk); | 7398 | running = task_current(rq, tsk); |
| 7324 | on_rq = tsk->on_rq; | 7399 | queued = task_on_rq_queued(tsk); |
| 7325 | 7400 | ||
| 7326 | if (on_rq) | 7401 | if (queued) |
| 7327 | dequeue_task(rq, tsk, 0); | 7402 | dequeue_task(rq, tsk, 0); |
| 7328 | if (unlikely(running)) | 7403 | if (unlikely(running)) |
| 7329 | tsk->sched_class->put_prev_task(rq, tsk); | 7404 | put_prev_task(rq, tsk); |
| 7330 | 7405 | ||
| 7331 | tg = container_of(task_css_check(tsk, cpu_cgrp_id, | 7406 | tg = container_of(task_css_check(tsk, cpu_cgrp_id, |
| 7332 | lockdep_is_held(&tsk->sighand->siglock)), | 7407 | lockdep_is_held(&tsk->sighand->siglock)), |
| @@ -7336,14 +7411,14 @@ void sched_move_task(struct task_struct *tsk) | |||
| 7336 | 7411 | ||
| 7337 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7412 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7338 | if (tsk->sched_class->task_move_group) | 7413 | if (tsk->sched_class->task_move_group) |
| 7339 | tsk->sched_class->task_move_group(tsk, on_rq); | 7414 | tsk->sched_class->task_move_group(tsk, queued); |
| 7340 | else | 7415 | else |
| 7341 | #endif | 7416 | #endif |
| 7342 | set_task_rq(tsk, task_cpu(tsk)); | 7417 | set_task_rq(tsk, task_cpu(tsk)); |
| 7343 | 7418 | ||
| 7344 | if (unlikely(running)) | 7419 | if (unlikely(running)) |
| 7345 | tsk->sched_class->set_curr_task(rq); | 7420 | tsk->sched_class->set_curr_task(rq); |
| 7346 | if (on_rq) | 7421 | if (queued) |
| 7347 | enqueue_task(rq, tsk, 0); | 7422 | enqueue_task(rq, tsk, 0); |
| 7348 | 7423 | ||
| 7349 | task_rq_unlock(rq, tsk, &flags); | 7424 | task_rq_unlock(rq, tsk, &flags); |
| @@ -7361,10 +7436,10 @@ static inline int tg_has_rt_tasks(struct task_group *tg) | |||
| 7361 | { | 7436 | { |
| 7362 | struct task_struct *g, *p; | 7437 | struct task_struct *g, *p; |
| 7363 | 7438 | ||
| 7364 | do_each_thread(g, p) { | 7439 | for_each_process_thread(g, p) { |
| 7365 | if (rt_task(p) && task_rq(p)->rt.tg == tg) | 7440 | if (rt_task(p) && task_group(p) == tg) |
| 7366 | return 1; | 7441 | return 1; |
| 7367 | } while_each_thread(g, p); | 7442 | } |
| 7368 | 7443 | ||
| 7369 | return 0; | 7444 | return 0; |
| 7370 | } | 7445 | } |
| @@ -7573,6 +7648,7 @@ static int sched_dl_global_constraints(void) | |||
| 7573 | u64 runtime = global_rt_runtime(); | 7648 | u64 runtime = global_rt_runtime(); |
| 7574 | u64 period = global_rt_period(); | 7649 | u64 period = global_rt_period(); |
| 7575 | u64 new_bw = to_ratio(period, runtime); | 7650 | u64 new_bw = to_ratio(period, runtime); |
| 7651 | struct dl_bw *dl_b; | ||
| 7576 | int cpu, ret = 0; | 7652 | int cpu, ret = 0; |
| 7577 | unsigned long flags; | 7653 | unsigned long flags; |
| 7578 | 7654 | ||
| @@ -7586,13 +7662,16 @@ static int sched_dl_global_constraints(void) | |||
| 7586 | * solutions is welcome! | 7662 | * solutions is welcome! |
| 7587 | */ | 7663 | */ |
| 7588 | for_each_possible_cpu(cpu) { | 7664 | for_each_possible_cpu(cpu) { |
| 7589 | struct dl_bw *dl_b = dl_bw_of(cpu); | 7665 | rcu_read_lock_sched(); |
| 7666 | dl_b = dl_bw_of(cpu); | ||
| 7590 | 7667 | ||
| 7591 | raw_spin_lock_irqsave(&dl_b->lock, flags); | 7668 | raw_spin_lock_irqsave(&dl_b->lock, flags); |
| 7592 | if (new_bw < dl_b->total_bw) | 7669 | if (new_bw < dl_b->total_bw) |
| 7593 | ret = -EBUSY; | 7670 | ret = -EBUSY; |
| 7594 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | 7671 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); |
| 7595 | 7672 | ||
| 7673 | rcu_read_unlock_sched(); | ||
| 7674 | |||
| 7596 | if (ret) | 7675 | if (ret) |
| 7597 | break; | 7676 | break; |
| 7598 | } | 7677 | } |
| @@ -7603,6 +7682,7 @@ static int sched_dl_global_constraints(void) | |||
| 7603 | static void sched_dl_do_global(void) | 7682 | static void sched_dl_do_global(void) |
| 7604 | { | 7683 | { |
| 7605 | u64 new_bw = -1; | 7684 | u64 new_bw = -1; |
| 7685 | struct dl_bw *dl_b; | ||
| 7606 | int cpu; | 7686 | int cpu; |
| 7607 | unsigned long flags; | 7687 | unsigned long flags; |
| 7608 | 7688 | ||
| @@ -7616,11 +7696,14 @@ static void sched_dl_do_global(void) | |||
| 7616 | * FIXME: As above... | 7696 | * FIXME: As above... |
| 7617 | */ | 7697 | */ |
| 7618 | for_each_possible_cpu(cpu) { | 7698 | for_each_possible_cpu(cpu) { |
| 7619 | struct dl_bw *dl_b = dl_bw_of(cpu); | 7699 | rcu_read_lock_sched(); |
| 7700 | dl_b = dl_bw_of(cpu); | ||
| 7620 | 7701 | ||
| 7621 | raw_spin_lock_irqsave(&dl_b->lock, flags); | 7702 | raw_spin_lock_irqsave(&dl_b->lock, flags); |
| 7622 | dl_b->bw = new_bw; | 7703 | dl_b->bw = new_bw; |
| 7623 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | 7704 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); |
| 7705 | |||
| 7706 | rcu_read_unlock_sched(); | ||
| 7624 | } | 7707 | } |
| 7625 | } | 7708 | } |
| 7626 | 7709 | ||
| @@ -8001,7 +8084,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) | |||
| 8001 | struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; | 8084 | struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; |
| 8002 | 8085 | ||
| 8003 | quota = normalize_cfs_quota(tg, d); | 8086 | quota = normalize_cfs_quota(tg, d); |
| 8004 | parent_quota = parent_b->hierarchal_quota; | 8087 | parent_quota = parent_b->hierarchical_quota; |
| 8005 | 8088 | ||
| 8006 | /* | 8089 | /* |
| 8007 | * ensure max(child_quota) <= parent_quota, inherit when no | 8090 | * ensure max(child_quota) <= parent_quota, inherit when no |
| @@ -8012,7 +8095,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) | |||
| 8012 | else if (parent_quota != RUNTIME_INF && quota > parent_quota) | 8095 | else if (parent_quota != RUNTIME_INF && quota > parent_quota) |
| 8013 | return -EINVAL; | 8096 | return -EINVAL; |
| 8014 | } | 8097 | } |
| 8015 | cfs_b->hierarchal_quota = quota; | 8098 | cfs_b->hierarchical_quota = quota; |
| 8016 | 8099 | ||
| 8017 | return 0; | 8100 | return 0; |
| 8018 | } | 8101 | } |
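The core.c hunks above carry the double-rq-lock removal: a task being moved between runqueues is flagged TASK_ON_RQ_MIGRATING so that only one rq->lock needs to be held at a time, and task_rq_lock()/__task_rq_lock() callers spin until the flag clears. A condensed, annotated sketch of that protocol, simplified from move_queued_task() and __task_rq_lock() in this diff (it leans on scheduler-internal helpers from kernel/sched/sched.h and is not a drop-in implementation):

```c
/* Condensed from the diff above; the sketch_* names are ours. */
static struct rq *sketch_move_queued_task(struct task_struct *p, int new_cpu)
{
	struct rq *rq = task_rq(p);

	lockdep_assert_held(&rq->lock);

	dequeue_task(rq, p, 0);
	p->on_rq = TASK_ON_RQ_MIGRATING;	/* tell rq-lock takers to wait */
	set_task_cpu(p, new_cpu);
	raw_spin_unlock(&rq->lock);		/* never hold two rq locks at once */

	rq = cpu_rq(new_cpu);
	raw_spin_lock(&rq->lock);
	p->on_rq = TASK_ON_RQ_QUEUED;
	enqueue_task(rq, p, 0);
	check_preempt_curr(rq, p, 0);

	return rq;				/* new rq, locked */
}

static struct rq *sketch_task_rq_lock(struct task_struct *p)
{
	struct rq *rq;

	for (;;) {
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		/* stable only if p kept its rq and is not mid-migration */
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
			return rq;
		raw_spin_unlock(&rq->lock);

		while (unlikely(task_on_rq_migrating(p)))
			cpu_relax();		/* wait out the flip */
	}
}
```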
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index bd95963dae80..539ca3ce071b 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
| @@ -107,9 +107,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, | |||
| 107 | int best_cpu = -1; | 107 | int best_cpu = -1; |
| 108 | const struct sched_dl_entity *dl_se = &p->dl; | 108 | const struct sched_dl_entity *dl_se = &p->dl; |
| 109 | 109 | ||
| 110 | if (later_mask && cpumask_and(later_mask, cp->free_cpus, | 110 | if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) { |
| 111 | &p->cpus_allowed) && cpumask_and(later_mask, | ||
| 112 | later_mask, cpu_active_mask)) { | ||
| 113 | best_cpu = cpumask_any(later_mask); | 111 | best_cpu = cpumask_any(later_mask); |
| 114 | goto out; | 112 | goto out; |
| 115 | } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && | 113 | } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 72fdf06ef865..8394b1ee600c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
| @@ -288,24 +288,29 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
| 288 | struct signal_struct *sig = tsk->signal; | 288 | struct signal_struct *sig = tsk->signal; |
| 289 | cputime_t utime, stime; | 289 | cputime_t utime, stime; |
| 290 | struct task_struct *t; | 290 | struct task_struct *t; |
| 291 | 291 | unsigned int seq, nextseq; | |
| 292 | times->utime = sig->utime; | 292 | unsigned long flags; |
| 293 | times->stime = sig->stime; | ||
| 294 | times->sum_exec_runtime = sig->sum_sched_runtime; | ||
| 295 | 293 | ||
| 296 | rcu_read_lock(); | 294 | rcu_read_lock(); |
| 297 | /* make sure we can trust tsk->thread_group list */ | 295 | /* Attempt a lockless read on the first round. */ |
| 298 | if (!likely(pid_alive(tsk))) | 296 | nextseq = 0; |
| 299 | goto out; | ||
| 300 | |||
| 301 | t = tsk; | ||
| 302 | do { | 297 | do { |
| 303 | task_cputime(t, &utime, &stime); | 298 | seq = nextseq; |
| 304 | times->utime += utime; | 299 | flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); |
| 305 | times->stime += stime; | 300 | times->utime = sig->utime; |
| 306 | times->sum_exec_runtime += task_sched_runtime(t); | 301 | times->stime = sig->stime; |
| 307 | } while_each_thread(tsk, t); | 302 | times->sum_exec_runtime = sig->sum_sched_runtime; |
| 308 | out: | 303 | |
| 304 | for_each_thread(tsk, t) { | ||
| 305 | task_cputime(t, &utime, &stime); | ||
| 306 | times->utime += utime; | ||
| 307 | times->stime += stime; | ||
| 308 | times->sum_exec_runtime += task_sched_runtime(t); | ||
| 309 | } | ||
| 310 | /* If lockless access failed, take the lock. */ | ||
| 311 | nextseq = 1; | ||
| 312 | } while (need_seqretry(&sig->stats_lock, seq)); | ||
| 313 | done_seqretry_irqrestore(&sig->stats_lock, seq, flags); | ||
| 309 | rcu_read_unlock(); | 314 | rcu_read_unlock(); |
| 310 | } | 315 | } |
| 311 | 316 | ||
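The thread_group_cputime() rewrite above drops the need for callers to hold siglock: the totals are read under sig->stats_lock with an optimistic seqcount pass first, and only if that pass raced with a writer is the lock taken for a second, stable pass. The sketch below reproduces that two-pass shape in plain C11; it is not the kernel's seqlock API, and the counter/total names are illustrative.

```c
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* "Lockless first pass, locked retry" in miniature. A writer bumps the
 * sequence counter to odd before updating and to even afterwards; a reader
 * first reads optimistically and, only if a writer was active, takes the
 * lock for a guaranteed-stable second pass (torn reads from the first pass
 * are simply discarded). */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_uint seqcount;                 /* even: idle, odd: write in progress */
static unsigned long long utime, stime;

static void write_update(unsigned long long du, unsigned long long ds)
{
	pthread_mutex_lock(&lock);
	atomic_fetch_add_explicit(&seqcount, 1, memory_order_release); /* -> odd  */
	utime += du;
	stime += ds;
	atomic_fetch_add_explicit(&seqcount, 1, memory_order_release); /* -> even */
	pthread_mutex_unlock(&lock);
}

static void read_totals(unsigned long long *u, unsigned long long *s)
{
	int locked = 0;                          /* second pass, like nextseq = 1 */

	for (;;) {
		unsigned int seq = 0;

		if (locked)
			pthread_mutex_lock(&lock);
		else
			seq = atomic_load_explicit(&seqcount, memory_order_acquire);

		*u = utime;
		*s = stime;

		if (locked) {
			pthread_mutex_unlock(&lock);
			return;
		}
		/* The optimistic pass is valid only if no writer was active. */
		if ((seq & 1) == 0 &&
		    atomic_load_explicit(&seqcount, memory_order_acquire) == seq)
			return;
		locked = 1;
	}
}

int main(void)
{
	unsigned long long u, s;

	write_update(10, 4);
	read_totals(&u, &s);
	printf("utime=%llu stime=%llu\n", u, s);
	return 0;
}
```

The point of the design is that readers normally never touch the lock, while a racing update at worst costs them one extra, locked pass.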
| @@ -550,6 +555,23 @@ drop_precision: | |||
| 550 | } | 555 | } |
| 551 | 556 | ||
| 552 | /* | 557 | /* |
| 558 | * Atomically advance counter to the new value. Interrupts, vcpu | ||
| 559 | * scheduling, and scaling inaccuracies can cause cputime_advance | ||
| 560 | * to be occasionally called with a new value smaller than counter. | ||
| 561 | * Let's enforce atomicity. | ||
| 562 | * | ||
| 563 | * Normally a caller will only go through this loop once, or not | ||
| 564 | * at all in case a previous caller updated counter the same jiffy. | ||
| 565 | */ | ||
| 566 | static void cputime_advance(cputime_t *counter, cputime_t new) | ||
| 567 | { | ||
| 568 | cputime_t old; | ||
| 569 | |||
| 570 | while (new > (old = ACCESS_ONCE(*counter))) | ||
| 571 | cmpxchg_cputime(counter, old, new); | ||
| 572 | } | ||
| 573 | |||
| 574 | /* | ||
| 553 | * Adjust tick based cputime random precision against scheduler | 575 | * Adjust tick based cputime random precision against scheduler |
| 554 | * runtime accounting. | 576 | * runtime accounting. |
| 555 | */ | 577 | */ |
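The new cputime_advance() above replaces the plain max()-based monotonicity fixup with a cmpxchg loop, so callers racing on prev->utime/prev->stime can only ever move the value forward. A hedged userspace equivalent using C11 atomics, with cputime_t replaced by a plain integer:

```c
#include <stdatomic.h>
#include <stdio.h>

/* Only ever move the counter forward: the CAS loop retries if another
 * updater slipped in, and gives up once the stored value is already
 * at least as large as the new sample. */
static void counter_advance(_Atomic unsigned long long *counter,
			    unsigned long long new)
{
	unsigned long long old = atomic_load(counter);

	while (new > old) {
		/* On failure, old is refreshed and the bound re-checked. */
		if (atomic_compare_exchange_weak(counter, &old, new))
			break;
	}
}

int main(void)
{
	_Atomic unsigned long long prev_utime = 100;

	counter_advance(&prev_utime, 120);   /* advances                     */
	counter_advance(&prev_utime, 90);    /* would go backwards: ignored  */
	printf("prev_utime = %llu\n",
	       (unsigned long long)atomic_load(&prev_utime));
	return 0;
}
```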
| @@ -594,13 +616,8 @@ static void cputime_adjust(struct task_cputime *curr, | |||
| 594 | utime = rtime - stime; | 616 | utime = rtime - stime; |
| 595 | } | 617 | } |
| 596 | 618 | ||
| 597 | /* | 619 | cputime_advance(&prev->stime, stime); |
| 598 | * If the tick based count grows faster than the scheduler one, | 620 | cputime_advance(&prev->utime, utime); |
| 599 | * the result of the scaling may go backward. | ||
| 600 | * Let's enforce monotonicity. | ||
| 601 | */ | ||
| 602 | prev->stime = max(prev->stime, stime); | ||
| 603 | prev->utime = max(prev->utime, utime); | ||
| 604 | 621 | ||
| 605 | out: | 622 | out: |
| 606 | *ut = prev->utime; | 623 | *ut = prev->utime; |
| @@ -617,9 +634,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
| 617 | cputime_adjust(&cputime, &p->prev_cputime, ut, st); | 634 | cputime_adjust(&cputime, &p->prev_cputime, ut, st); |
| 618 | } | 635 | } |
| 619 | 636 | ||
| 620 | /* | ||
| 621 | * Must be called with siglock held. | ||
| 622 | */ | ||
| 623 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | 637 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) |
| 624 | { | 638 | { |
| 625 | struct task_cputime cputime; | 639 | struct task_cputime cputime; |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 255ce138b652..abfaf3d9a29f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
| @@ -530,7 +530,7 @@ again: | |||
| 530 | update_rq_clock(rq); | 530 | update_rq_clock(rq); |
| 531 | dl_se->dl_throttled = 0; | 531 | dl_se->dl_throttled = 0; |
| 532 | dl_se->dl_yielded = 0; | 532 | dl_se->dl_yielded = 0; |
| 533 | if (p->on_rq) { | 533 | if (task_on_rq_queued(p)) { |
| 534 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); | 534 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); |
| 535 | if (task_has_dl_policy(rq->curr)) | 535 | if (task_has_dl_policy(rq->curr)) |
| 536 | check_preempt_curr_dl(rq, p, 0); | 536 | check_preempt_curr_dl(rq, p, 0); |
| @@ -997,10 +997,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, | |||
| 997 | #ifdef CONFIG_SCHED_HRTICK | 997 | #ifdef CONFIG_SCHED_HRTICK |
| 998 | static void start_hrtick_dl(struct rq *rq, struct task_struct *p) | 998 | static void start_hrtick_dl(struct rq *rq, struct task_struct *p) |
| 999 | { | 999 | { |
| 1000 | s64 delta = p->dl.dl_runtime - p->dl.runtime; | 1000 | hrtick_start(rq, p->dl.runtime); |
| 1001 | |||
| 1002 | if (delta > 10000) | ||
| 1003 | hrtick_start(rq, p->dl.runtime); | ||
| 1004 | } | 1001 | } |
| 1005 | #endif | 1002 | #endif |
| 1006 | 1003 | ||
| @@ -1030,7 +1027,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) | |||
| 1030 | * means a stop task can slip in, in which case we need to | 1027 | * means a stop task can slip in, in which case we need to |
| 1031 | * re-start task selection. | 1028 | * re-start task selection. |
| 1032 | */ | 1029 | */ |
| 1033 | if (rq->stop && rq->stop->on_rq) | 1030 | if (rq->stop && task_on_rq_queued(rq->stop)) |
| 1034 | return RETRY_TASK; | 1031 | return RETRY_TASK; |
| 1035 | } | 1032 | } |
| 1036 | 1033 | ||
| @@ -1124,10 +1121,8 @@ static void set_curr_task_dl(struct rq *rq) | |||
| 1124 | static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) | 1121 | static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) |
| 1125 | { | 1122 | { |
| 1126 | if (!task_running(rq, p) && | 1123 | if (!task_running(rq, p) && |
| 1127 | (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && | 1124 | cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) |
| 1128 | (p->nr_cpus_allowed > 1)) | ||
| 1129 | return 1; | 1125 | return 1; |
| 1130 | |||
| 1131 | return 0; | 1126 | return 0; |
| 1132 | } | 1127 | } |
| 1133 | 1128 | ||
| @@ -1169,6 +1164,13 @@ static int find_later_rq(struct task_struct *task) | |||
| 1169 | if (task->nr_cpus_allowed == 1) | 1164 | if (task->nr_cpus_allowed == 1) |
| 1170 | return -1; | 1165 | return -1; |
| 1171 | 1166 | ||
| 1167 | /* | ||
| 1168 | * We have to consider system topology and task affinity | ||
| 1169 | * first, then we can look for a suitable cpu. | ||
| 1170 | */ | ||
| 1171 | cpumask_copy(later_mask, task_rq(task)->rd->span); | ||
| 1172 | cpumask_and(later_mask, later_mask, cpu_active_mask); | ||
| 1173 | cpumask_and(later_mask, later_mask, &task->cpus_allowed); | ||
| 1172 | best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, | 1174 | best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, |
| 1173 | task, later_mask); | 1175 | task, later_mask); |
| 1174 | if (best_cpu == -1) | 1176 | if (best_cpu == -1) |
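The find_later_rq() hunk above now builds later_mask up front from the root-domain span, the active CPUs and the task's affinity, which is exactly what lets the cpudl_find() hunk earlier drop its own cpumask_and() calls. A toy illustration with 64-bit words standing in for struct cpumask (the CPU layout is made up):

```c
#include <stdint.h>
#include <stdio.h>

/* 64-bit words stand in for struct cpumask; cpudl_find() is reduced to a
 * single AND against the free-CPU mask, because the caller already folded
 * in the root-domain span, the online mask and the task's affinity. */
static int first_set_cpu(uint64_t mask)
{
	for (int cpu = 0; cpu < 64; cpu++)
		if (mask & (1ULL << cpu))
			return cpu;
	return -1;   /* no suitable CPU */
}

int main(void)
{
	uint64_t rd_span      = 0x0F;   /* CPUs 0-3 belong to this root domain  */
	uint64_t cpu_active   = 0x0E;   /* CPU 0 is offline                     */
	uint64_t cpus_allowed = 0x0C;   /* the task may run on CPUs 2-3         */
	uint64_t free_cpus    = 0x08;   /* only CPU 3 is free of -deadline work */

	uint64_t later_mask = rd_span & cpu_active & cpus_allowed; /* find_later_rq() */
	uint64_t candidates = later_mask & free_cpus;              /* cpudl_find()    */

	printf("best_cpu = %d\n", first_set_cpu(candidates));
	return 0;
}
```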
| @@ -1257,7 +1259,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) | |||
| 1257 | if (unlikely(task_rq(task) != rq || | 1259 | if (unlikely(task_rq(task) != rq || |
| 1258 | !cpumask_test_cpu(later_rq->cpu, | 1260 | !cpumask_test_cpu(later_rq->cpu, |
| 1259 | &task->cpus_allowed) || | 1261 | &task->cpus_allowed) || |
| 1260 | task_running(rq, task) || !task->on_rq)) { | 1262 | task_running(rq, task) || |
| 1263 | !task_on_rq_queued(task))) { | ||
| 1261 | double_unlock_balance(rq, later_rq); | 1264 | double_unlock_balance(rq, later_rq); |
| 1262 | later_rq = NULL; | 1265 | later_rq = NULL; |
| 1263 | break; | 1266 | break; |
| @@ -1296,7 +1299,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) | |||
| 1296 | BUG_ON(task_current(rq, p)); | 1299 | BUG_ON(task_current(rq, p)); |
| 1297 | BUG_ON(p->nr_cpus_allowed <= 1); | 1300 | BUG_ON(p->nr_cpus_allowed <= 1); |
| 1298 | 1301 | ||
| 1299 | BUG_ON(!p->on_rq); | 1302 | BUG_ON(!task_on_rq_queued(p)); |
| 1300 | BUG_ON(!dl_task(p)); | 1303 | BUG_ON(!dl_task(p)); |
| 1301 | 1304 | ||
| 1302 | return p; | 1305 | return p; |
| @@ -1443,7 +1446,7 @@ static int pull_dl_task(struct rq *this_rq) | |||
| 1443 | dl_time_before(p->dl.deadline, | 1446 | dl_time_before(p->dl.deadline, |
| 1444 | this_rq->dl.earliest_dl.curr))) { | 1447 | this_rq->dl.earliest_dl.curr))) { |
| 1445 | WARN_ON(p == src_rq->curr); | 1448 | WARN_ON(p == src_rq->curr); |
| 1446 | WARN_ON(!p->on_rq); | 1449 | WARN_ON(!task_on_rq_queued(p)); |
| 1447 | 1450 | ||
| 1448 | /* | 1451 | /* |
| 1449 | * Then we pull iff p has actually an earlier | 1452 | * Then we pull iff p has actually an earlier |
| @@ -1569,6 +1572,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) | |||
| 1569 | if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) | 1572 | if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) |
| 1570 | hrtimer_try_to_cancel(&p->dl.dl_timer); | 1573 | hrtimer_try_to_cancel(&p->dl.dl_timer); |
| 1571 | 1574 | ||
| 1575 | __dl_clear_params(p); | ||
| 1576 | |||
| 1572 | #ifdef CONFIG_SMP | 1577 | #ifdef CONFIG_SMP |
| 1573 | /* | 1578 | /* |
| 1574 | * Since this might be the only -deadline task on the rq, | 1579 | * Since this might be the only -deadline task on the rq, |
| @@ -1596,7 +1601,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
| 1596 | if (unlikely(p->dl.dl_throttled)) | 1601 | if (unlikely(p->dl.dl_throttled)) |
| 1597 | return; | 1602 | return; |
| 1598 | 1603 | ||
| 1599 | if (p->on_rq && rq->curr != p) { | 1604 | if (task_on_rq_queued(p) && rq->curr != p) { |
| 1600 | #ifdef CONFIG_SMP | 1605 | #ifdef CONFIG_SMP |
| 1601 | if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) | 1606 | if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) |
| 1602 | /* Only reschedule if pushing failed */ | 1607 | /* Only reschedule if pushing failed */ |
| @@ -1614,7 +1619,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
| 1614 | static void prio_changed_dl(struct rq *rq, struct task_struct *p, | 1619 | static void prio_changed_dl(struct rq *rq, struct task_struct *p, |
| 1615 | int oldprio) | 1620 | int oldprio) |
| 1616 | { | 1621 | { |
| 1617 | if (p->on_rq || rq->curr == p) { | 1622 | if (task_on_rq_queued(p) || rq->curr == p) { |
| 1618 | #ifdef CONFIG_SMP | 1623 | #ifdef CONFIG_SMP |
| 1619 | /* | 1624 | /* |
| 1620 | * This might be too much, but unfortunately | 1625 | * This might be too much, but unfortunately |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 627b3c34b821..ce33780d8f20 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
| @@ -150,7 +150,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
| 150 | static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | 150 | static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) |
| 151 | { | 151 | { |
| 152 | struct task_struct *g, *p; | 152 | struct task_struct *g, *p; |
| 153 | unsigned long flags; | ||
| 154 | 153 | ||
| 155 | SEQ_printf(m, | 154 | SEQ_printf(m, |
| 156 | "\nrunnable tasks:\n" | 155 | "\nrunnable tasks:\n" |
| @@ -159,16 +158,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | |||
| 159 | "------------------------------------------------------" | 158 | "------------------------------------------------------" |
| 160 | "----------------------------------------------------\n"); | 159 | "----------------------------------------------------\n"); |
| 161 | 160 | ||
| 162 | read_lock_irqsave(&tasklist_lock, flags); | 161 | rcu_read_lock(); |
| 163 | 162 | for_each_process_thread(g, p) { | |
| 164 | do_each_thread(g, p) { | ||
| 165 | if (task_cpu(p) != rq_cpu) | 163 | if (task_cpu(p) != rq_cpu) |
| 166 | continue; | 164 | continue; |
| 167 | 165 | ||
| 168 | print_task(m, rq, p); | 166 | print_task(m, rq, p); |
| 169 | } while_each_thread(g, p); | 167 | } |
| 170 | 168 | rcu_read_unlock(); | |
| 171 | read_unlock_irqrestore(&tasklist_lock, flags); | ||
| 172 | } | 169 | } |
| 173 | 170 | ||
| 174 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | 171 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) |
| @@ -333,9 +330,7 @@ do { \ | |||
| 333 | print_cfs_stats(m, cpu); | 330 | print_cfs_stats(m, cpu); |
| 334 | print_rt_stats(m, cpu); | 331 | print_rt_stats(m, cpu); |
| 335 | 332 | ||
| 336 | rcu_read_lock(); | ||
| 337 | print_rq(m, rq, cpu); | 333 | print_rq(m, rq, cpu); |
| 338 | rcu_read_unlock(); | ||
| 339 | spin_unlock_irqrestore(&sched_debug_lock, flags); | 334 | spin_unlock_irqrestore(&sched_debug_lock, flags); |
| 340 | SEQ_printf(m, "\n"); | 335 | SEQ_printf(m, "\n"); |
| 341 | } | 336 | } |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 82088b29704e..b78280c59b46 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -23,6 +23,7 @@ | |||
| 23 | #include <linux/latencytop.h> | 23 | #include <linux/latencytop.h> |
| 24 | #include <linux/sched.h> | 24 | #include <linux/sched.h> |
| 25 | #include <linux/cpumask.h> | 25 | #include <linux/cpumask.h> |
| 26 | #include <linux/cpuidle.h> | ||
| 26 | #include <linux/slab.h> | 27 | #include <linux/slab.h> |
| 27 | #include <linux/profile.h> | 28 | #include <linux/profile.h> |
| 28 | #include <linux/interrupt.h> | 29 | #include <linux/interrupt.h> |
| @@ -665,6 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 665 | } | 666 | } |
| 666 | 667 | ||
| 667 | #ifdef CONFIG_SMP | 668 | #ifdef CONFIG_SMP |
| 669 | static int select_idle_sibling(struct task_struct *p, int cpu); | ||
| 668 | static unsigned long task_h_load(struct task_struct *p); | 670 | static unsigned long task_h_load(struct task_struct *p); |
| 669 | 671 | ||
| 670 | static inline void __update_task_entity_contrib(struct sched_entity *se); | 672 | static inline void __update_task_entity_contrib(struct sched_entity *se); |
| @@ -1038,7 +1040,8 @@ struct numa_stats { | |||
| 1038 | */ | 1040 | */ |
| 1039 | static void update_numa_stats(struct numa_stats *ns, int nid) | 1041 | static void update_numa_stats(struct numa_stats *ns, int nid) |
| 1040 | { | 1042 | { |
| 1041 | int cpu, cpus = 0; | 1043 | int smt, cpu, cpus = 0; |
| 1044 | unsigned long capacity; | ||
| 1042 | 1045 | ||
| 1043 | memset(ns, 0, sizeof(*ns)); | 1046 | memset(ns, 0, sizeof(*ns)); |
| 1044 | for_each_cpu(cpu, cpumask_of_node(nid)) { | 1047 | for_each_cpu(cpu, cpumask_of_node(nid)) { |
| @@ -1062,8 +1065,12 @@ static void update_numa_stats(struct numa_stats *ns, int nid) | |||
| 1062 | if (!cpus) | 1065 | if (!cpus) |
| 1063 | return; | 1066 | return; |
| 1064 | 1067 | ||
| 1065 | ns->task_capacity = | 1068 | /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */ |
| 1066 | DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); | 1069 | smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); |
| 1070 | capacity = cpus / smt; /* cores */ | ||
| 1071 | |||
| 1072 | ns->task_capacity = min_t(unsigned, capacity, | ||
| 1073 | DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); | ||
| 1067 | ns->has_free_capacity = (ns->nr_running < ns->task_capacity); | 1074 | ns->has_free_capacity = (ns->nr_running < ns->task_capacity); |
| 1068 | } | 1075 | } |
| 1069 | 1076 | ||
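The update_numa_stats() hunk estimates the SMT factor from how far the node's per-CPU capacity falls below SCHED_CAPACITY_SCALE and then caps task_capacity at the number of cores. A small worked example; the per-sibling capacity figure below is an assumption chosen to mimic a 2-way SMT node, not a value taken from the patch:

```c
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024
#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(n, d) (((n) + (d) / 2) / (d))

int main(void)
{
	int cpus = 4;                              /* 2 cores x 2 SMT siblings      */
	unsigned long compute_capacity = 4 * 589;  /* assumed ~589 per SMT sibling  */

	/* smt := ceil(cpus / capacity), as in the hunk's comment */
	int smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, compute_capacity);
	unsigned long cores = cpus / smt;
	unsigned long scaled = DIV_ROUND_CLOSEST(compute_capacity, SCHED_CAPACITY_SCALE);
	unsigned long task_capacity = cores < scaled ? cores : scaled;

	printf("smt=%d cores=%lu task_capacity=%lu\n", smt, cores, task_capacity);
	return 0;
}
```

With those numbers the node is treated as having room for 2 tasks (its 2 cores), rather than the 4 its logical CPU count would suggest.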
| @@ -1206,7 +1213,7 @@ static void task_numa_compare(struct task_numa_env *env, | |||
| 1206 | 1213 | ||
| 1207 | if (!cur) { | 1214 | if (!cur) { |
| 1208 | /* Is there capacity at our destination? */ | 1215 | /* Is there capacity at our destination? */ |
| 1209 | if (env->src_stats.has_free_capacity && | 1216 | if (env->src_stats.nr_running <= env->src_stats.task_capacity && |
| 1210 | !env->dst_stats.has_free_capacity) | 1217 | !env->dst_stats.has_free_capacity) |
| 1211 | goto unlock; | 1218 | goto unlock; |
| 1212 | 1219 | ||
| @@ -1252,6 +1259,13 @@ balance: | |||
| 1252 | if (load_too_imbalanced(src_load, dst_load, env)) | 1259 | if (load_too_imbalanced(src_load, dst_load, env)) |
| 1253 | goto unlock; | 1260 | goto unlock; |
| 1254 | 1261 | ||
| 1262 | /* | ||
| 1263 | * One idle CPU per node is evaluated for a task numa move. | ||
| 1264 | * Call select_idle_sibling to maybe find a better one. | ||
| 1265 | */ | ||
| 1266 | if (!cur) | ||
| 1267 | env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); | ||
| 1268 | |||
| 1255 | assign: | 1269 | assign: |
| 1256 | task_numa_assign(env, cur, imp); | 1270 | task_numa_assign(env, cur, imp); |
| 1257 | unlock: | 1271 | unlock: |
| @@ -1775,7 +1789,7 @@ void task_numa_free(struct task_struct *p) | |||
| 1775 | list_del(&p->numa_entry); | 1789 | list_del(&p->numa_entry); |
| 1776 | grp->nr_tasks--; | 1790 | grp->nr_tasks--; |
| 1777 | spin_unlock_irqrestore(&grp->lock, flags); | 1791 | spin_unlock_irqrestore(&grp->lock, flags); |
| 1778 | rcu_assign_pointer(p->numa_group, NULL); | 1792 | RCU_INIT_POINTER(p->numa_group, NULL); |
| 1779 | put_numa_group(grp); | 1793 | put_numa_group(grp); |
| 1780 | } | 1794 | } |
| 1781 | 1795 | ||
| @@ -1804,10 +1818,6 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
| 1804 | if (!p->mm) | 1818 | if (!p->mm) |
| 1805 | return; | 1819 | return; |
| 1806 | 1820 | ||
| 1807 | /* Do not worry about placement if exiting */ | ||
| 1808 | if (p->state == TASK_DEAD) | ||
| 1809 | return; | ||
| 1810 | |||
| 1811 | /* Allocate buffer to track faults on a per-node basis */ | 1821 | /* Allocate buffer to track faults on a per-node basis */ |
| 1812 | if (unlikely(!p->numa_faults_memory)) { | 1822 | if (unlikely(!p->numa_faults_memory)) { |
| 1813 | int size = sizeof(*p->numa_faults_memory) * | 1823 | int size = sizeof(*p->numa_faults_memory) * |
| @@ -2211,8 +2221,8 @@ static __always_inline u64 decay_load(u64 val, u64 n) | |||
| 2211 | 2221 | ||
| 2212 | /* | 2222 | /* |
| 2213 | * As y^PERIOD = 1/2, we can combine | 2223 | * As y^PERIOD = 1/2, we can combine |
| 2214 | * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) | 2224 | * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD) |
| 2215 | * With a look-up table which covers k^n (n<PERIOD) | 2225 | * With a look-up table which covers y^n (n<PERIOD) |
| 2216 | * | 2226 | * |
| 2217 | * To achieve constant time decay_load. | 2227 | * To achieve constant time decay_load. |
| 2218 | */ | 2228 | */ |
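The decay_load() comment fix above is purely notational (k → y); the identity it describes is y^n = (1/2)^(n/PERIOD) * y^(n%PERIOD), which follows from y^PERIOD = 1/2. A toy demonstration with PERIOD = 4 rather than the kernel's 32, and fixed-point factors computed for this sketch rather than taken from the kernel table:

```c
#include <stdint.h>
#include <stdio.h>

/* Whole half-life periods become a right shift; the remainder comes from a
 * small lookup table of y^0..y^(PERIOD-1) in 16-bit fixed point. */
#define PERIOD 4

static const uint32_t y_pow[PERIOD] = {
	65536,  /* y^0 = 1        */
	55109,  /* y^1 = 2^(-1/4) */
	46341,  /* y^2 = 2^(-1/2) */
	38968,  /* y^3 = 2^(-3/4) */
};

static uint64_t decay_load(uint64_t val, unsigned int n)
{
	val >>= n / PERIOD;                       /* whole half-lives */
	return (val * y_pow[n % PERIOD]) >> 16;   /* fractional part  */
}

int main(void)
{
	/* One full period halves the value; half a period scales by 1/sqrt(2). */
	printf("decay(1024, 4) = %llu\n", (unsigned long long)decay_load(1024, 4));
	printf("decay(1024, 2) = %llu\n", (unsigned long long)decay_load(1024, 2));
	return 0;
}
```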
| @@ -2377,6 +2387,9 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, | |||
| 2377 | tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; | 2387 | tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; |
| 2378 | tg_contrib -= cfs_rq->tg_load_contrib; | 2388 | tg_contrib -= cfs_rq->tg_load_contrib; |
| 2379 | 2389 | ||
| 2390 | if (!tg_contrib) | ||
| 2391 | return; | ||
| 2392 | |||
| 2380 | if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { | 2393 | if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { |
| 2381 | atomic_long_add(tg_contrib, &tg->load_avg); | 2394 | atomic_long_add(tg_contrib, &tg->load_avg); |
| 2382 | cfs_rq->tg_load_contrib += tg_contrib; | 2395 | cfs_rq->tg_load_contrib += tg_contrib; |
| @@ -3892,14 +3905,6 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | |||
| 3892 | resched_curr(rq); | 3905 | resched_curr(rq); |
| 3893 | return; | 3906 | return; |
| 3894 | } | 3907 | } |
| 3895 | |||
| 3896 | /* | ||
| 3897 | * Don't schedule slices shorter than 10000ns, that just | ||
| 3898 | * doesn't make sense. Rely on vruntime for fairness. | ||
| 3899 | */ | ||
| 3900 | if (rq->curr != p) | ||
| 3901 | delta = max_t(s64, 10000LL, delta); | ||
| 3902 | |||
| 3903 | hrtick_start(rq, delta); | 3908 | hrtick_start(rq, delta); |
| 3904 | } | 3909 | } |
| 3905 | } | 3910 | } |
| @@ -4087,7 +4092,7 @@ static unsigned long capacity_of(int cpu) | |||
| 4087 | static unsigned long cpu_avg_load_per_task(int cpu) | 4092 | static unsigned long cpu_avg_load_per_task(int cpu) |
| 4088 | { | 4093 | { |
| 4089 | struct rq *rq = cpu_rq(cpu); | 4094 | struct rq *rq = cpu_rq(cpu); |
| 4090 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); | 4095 | unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running); |
| 4091 | unsigned long load_avg = rq->cfs.runnable_load_avg; | 4096 | unsigned long load_avg = rq->cfs.runnable_load_avg; |
| 4092 | 4097 | ||
| 4093 | if (nr_running) | 4098 | if (nr_running) |
| @@ -4276,8 +4281,8 @@ static int wake_wide(struct task_struct *p) | |||
| 4276 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | 4281 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) |
| 4277 | { | 4282 | { |
| 4278 | s64 this_load, load; | 4283 | s64 this_load, load; |
| 4284 | s64 this_eff_load, prev_eff_load; | ||
| 4279 | int idx, this_cpu, prev_cpu; | 4285 | int idx, this_cpu, prev_cpu; |
| 4280 | unsigned long tl_per_task; | ||
| 4281 | struct task_group *tg; | 4286 | struct task_group *tg; |
| 4282 | unsigned long weight; | 4287 | unsigned long weight; |
| 4283 | int balanced; | 4288 | int balanced; |
| @@ -4320,47 +4325,30 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
| 4320 | * Otherwise check if either cpus are near enough in load to allow this | 4325 | * Otherwise check if either cpus are near enough in load to allow this |
| 4321 | * task to be woken on this_cpu. | 4326 | * task to be woken on this_cpu. |
| 4322 | */ | 4327 | */ |
| 4323 | if (this_load > 0) { | 4328 | this_eff_load = 100; |
| 4324 | s64 this_eff_load, prev_eff_load; | 4329 | this_eff_load *= capacity_of(prev_cpu); |
| 4325 | 4330 | ||
| 4326 | this_eff_load = 100; | 4331 | prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; |
| 4327 | this_eff_load *= capacity_of(prev_cpu); | 4332 | prev_eff_load *= capacity_of(this_cpu); |
| 4333 | |||
| 4334 | if (this_load > 0) { | ||
| 4328 | this_eff_load *= this_load + | 4335 | this_eff_load *= this_load + |
| 4329 | effective_load(tg, this_cpu, weight, weight); | 4336 | effective_load(tg, this_cpu, weight, weight); |
| 4330 | 4337 | ||
| 4331 | prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; | ||
| 4332 | prev_eff_load *= capacity_of(this_cpu); | ||
| 4333 | prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); | 4338 | prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); |
| 4339 | } | ||
| 4334 | 4340 | ||
| 4335 | balanced = this_eff_load <= prev_eff_load; | 4341 | balanced = this_eff_load <= prev_eff_load; |
| 4336 | } else | ||
| 4337 | balanced = true; | ||
| 4338 | |||
| 4339 | /* | ||
| 4340 | * If the currently running task will sleep within | ||
| 4341 | * a reasonable amount of time then attract this newly | ||
| 4342 | * woken task: | ||
| 4343 | */ | ||
| 4344 | if (sync && balanced) | ||
| 4345 | return 1; | ||
| 4346 | 4342 | ||
| 4347 | schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); | 4343 | schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); |
| 4348 | tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
| 4349 | 4344 | ||
| 4350 | if (balanced || | 4345 | if (!balanced) |
| 4351 | (this_load <= load && | 4346 | return 0; |
| 4352 | this_load + target_load(prev_cpu, idx) <= tl_per_task)) { | ||
| 4353 | /* | ||
| 4354 | * This domain has SD_WAKE_AFFINE and | ||
| 4355 | * p is cache cold in this domain, and | ||
| 4356 | * there is no bad imbalance. | ||
| 4357 | */ | ||
| 4358 | schedstat_inc(sd, ttwu_move_affine); | ||
| 4359 | schedstat_inc(p, se.statistics.nr_wakeups_affine); | ||
| 4360 | 4347 | ||
| 4361 | return 1; | 4348 | schedstat_inc(sd, ttwu_move_affine); |
| 4362 | } | 4349 | schedstat_inc(p, se.statistics.nr_wakeups_affine); |
| 4363 | return 0; | 4350 | |
| 4351 | return 1; | ||
| 4364 | } | 4352 | } |
| 4365 | 4353 | ||
| 4366 | /* | 4354 | /* |
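In the reworked wake_affine() above, both effective loads are always computed and the old tl_per_task fallback is gone, so the decision is a single scaled comparison. A worked example with invented capacities and loads, ignoring group-scheduling effects (the waking task's weight is simply added on the destination side):

```c
#include <stdio.h>

int main(void)
{
	long imbalance_pct = 125;      /* sd->imbalance_pct               */
	long capacity_this = 1024;
	long capacity_prev = 1024;
	long this_load     = 300;      /* load already on the waking CPU  */
	long prev_load     = 500;      /* load left behind on prev_cpu    */
	long weight        = 100;      /* waking task's load contribution */

	long this_eff_load = 100 * capacity_prev;
	long prev_eff_load = (100 + (imbalance_pct - 100) / 2) * capacity_this;

	if (this_load > 0) {
		this_eff_load *= this_load + weight;  /* task lands here         */
		prev_eff_load *= prev_load;           /* task leaves prev behind */
	}

	/* Affine wakeup only if pulling the task here is no worse than the
	 * (imbalance-padded) cost of leaving it where it was. */
	printf("affine wakeup: %s\n",
	       this_eff_load <= prev_eff_load ? "yes" : "no");
	return 0;
}
```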
| @@ -4428,20 +4416,46 @@ static int | |||
| 4428 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | 4416 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) |
| 4429 | { | 4417 | { |
| 4430 | unsigned long load, min_load = ULONG_MAX; | 4418 | unsigned long load, min_load = ULONG_MAX; |
| 4431 | int idlest = -1; | 4419 | unsigned int min_exit_latency = UINT_MAX; |
| 4420 | u64 latest_idle_timestamp = 0; | ||
| 4421 | int least_loaded_cpu = this_cpu; | ||
| 4422 | int shallowest_idle_cpu = -1; | ||
| 4432 | int i; | 4423 | int i; |
| 4433 | 4424 | ||
| 4434 | /* Traverse only the allowed CPUs */ | 4425 | /* Traverse only the allowed CPUs */ |
| 4435 | for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { | 4426 | for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { |
| 4436 | load = weighted_cpuload(i); | 4427 | if (idle_cpu(i)) { |
| 4437 | 4428 | struct rq *rq = cpu_rq(i); | |
| 4438 | if (load < min_load || (load == min_load && i == this_cpu)) { | 4429 | struct cpuidle_state *idle = idle_get_state(rq); |
| 4439 | min_load = load; | 4430 | if (idle && idle->exit_latency < min_exit_latency) { |
| 4440 | idlest = i; | 4431 | /* |
| 4432 | * We give priority to a CPU whose idle state | ||
| 4433 | * has the smallest exit latency irrespective | ||
| 4434 | * of any idle timestamp. | ||
| 4435 | */ | ||
| 4436 | min_exit_latency = idle->exit_latency; | ||
| 4437 | latest_idle_timestamp = rq->idle_stamp; | ||
| 4438 | shallowest_idle_cpu = i; | ||
| 4439 | } else if ((!idle || idle->exit_latency == min_exit_latency) && | ||
| 4440 | rq->idle_stamp > latest_idle_timestamp) { | ||
| 4441 | /* | ||
| 4442 | * If equal or no active idle state, then | ||
| 4443 | * the most recently idled CPU might have | ||
| 4444 | * a warmer cache. | ||
| 4445 | */ | ||
| 4446 | latest_idle_timestamp = rq->idle_stamp; | ||
| 4447 | shallowest_idle_cpu = i; | ||
| 4448 | } | ||
| 4449 | } else { | ||
| 4450 | load = weighted_cpuload(i); | ||
| 4451 | if (load < min_load || (load == min_load && i == this_cpu)) { | ||
| 4452 | min_load = load; | ||
| 4453 | least_loaded_cpu = i; | ||
| 4454 | } | ||
| 4441 | } | 4455 | } |
| 4442 | } | 4456 | } |
| 4443 | 4457 | ||
| 4444 | return idlest; | 4458 | return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; |
| 4445 | } | 4459 | } |
| 4446 | 4460 | ||
| 4447 | /* | 4461 | /* |
| @@ -4513,11 +4527,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
| 4513 | if (p->nr_cpus_allowed == 1) | 4527 | if (p->nr_cpus_allowed == 1) |
| 4514 | return prev_cpu; | 4528 | return prev_cpu; |
| 4515 | 4529 | ||
| 4516 | if (sd_flag & SD_BALANCE_WAKE) { | 4530 | if (sd_flag & SD_BALANCE_WAKE) |
| 4517 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) | 4531 | want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); |
| 4518 | want_affine = 1; | ||
| 4519 | new_cpu = prev_cpu; | ||
| 4520 | } | ||
| 4521 | 4532 | ||
| 4522 | rcu_read_lock(); | 4533 | rcu_read_lock(); |
| 4523 | for_each_domain(cpu, tmp) { | 4534 | for_each_domain(cpu, tmp) { |
| @@ -4704,7 +4715,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
| 4704 | return; | 4715 | return; |
| 4705 | 4716 | ||
| 4706 | /* | 4717 | /* |
| 4707 | * This is possible from callers such as move_task(), in which we | 4718 | * This is possible from callers such as attach_tasks(), in which we |
| 4708 | * unconditionally check_prempt_curr() after an enqueue (which may have | 4719 | * unconditionally check_prempt_curr() after an enqueue (which may have |
| 4709 | * lead to a throttle). This both saves work and prevents false | 4720 | * lead to a throttle). This both saves work and prevents false |
| 4710 | * next-buddy nomination below. | 4721 | * next-buddy nomination below. |
| @@ -5112,27 +5123,18 @@ struct lb_env { | |||
| 5112 | unsigned int loop_max; | 5123 | unsigned int loop_max; |
| 5113 | 5124 | ||
| 5114 | enum fbq_type fbq_type; | 5125 | enum fbq_type fbq_type; |
| 5126 | struct list_head tasks; | ||
| 5115 | }; | 5127 | }; |
| 5116 | 5128 | ||
| 5117 | /* | 5129 | /* |
| 5118 | * move_task - move a task from one runqueue to another runqueue. | ||
| 5119 | * Both runqueues must be locked. | ||
| 5120 | */ | ||
| 5121 | static void move_task(struct task_struct *p, struct lb_env *env) | ||
| 5122 | { | ||
| 5123 | deactivate_task(env->src_rq, p, 0); | ||
| 5124 | set_task_cpu(p, env->dst_cpu); | ||
| 5125 | activate_task(env->dst_rq, p, 0); | ||
| 5126 | check_preempt_curr(env->dst_rq, p, 0); | ||
| 5127 | } | ||
| 5128 | |||
| 5129 | /* | ||
| 5130 | * Is this task likely cache-hot: | 5130 | * Is this task likely cache-hot: |
| 5131 | */ | 5131 | */ |
| 5132 | static int task_hot(struct task_struct *p, struct lb_env *env) | 5132 | static int task_hot(struct task_struct *p, struct lb_env *env) |
| 5133 | { | 5133 | { |
| 5134 | s64 delta; | 5134 | s64 delta; |
| 5135 | 5135 | ||
| 5136 | lockdep_assert_held(&env->src_rq->lock); | ||
| 5137 | |||
| 5136 | if (p->sched_class != &fair_sched_class) | 5138 | if (p->sched_class != &fair_sched_class) |
| 5137 | return 0; | 5139 | return 0; |
| 5138 | 5140 | ||
| @@ -5252,6 +5254,9 @@ static | |||
| 5252 | int can_migrate_task(struct task_struct *p, struct lb_env *env) | 5254 | int can_migrate_task(struct task_struct *p, struct lb_env *env) |
| 5253 | { | 5255 | { |
| 5254 | int tsk_cache_hot = 0; | 5256 | int tsk_cache_hot = 0; |
| 5257 | |||
| 5258 | lockdep_assert_held(&env->src_rq->lock); | ||
| 5259 | |||
| 5255 | /* | 5260 | /* |
| 5256 | * We do not migrate tasks that are: | 5261 | * We do not migrate tasks that are: |
| 5257 | * 1) throttled_lb_pair, or | 5262 | * 1) throttled_lb_pair, or |
| @@ -5310,24 +5315,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 5310 | if (!tsk_cache_hot) | 5315 | if (!tsk_cache_hot) |
| 5311 | tsk_cache_hot = migrate_degrades_locality(p, env); | 5316 | tsk_cache_hot = migrate_degrades_locality(p, env); |
| 5312 | 5317 | ||
| 5313 | if (migrate_improves_locality(p, env)) { | 5318 | if (migrate_improves_locality(p, env) || !tsk_cache_hot || |
| 5314 | #ifdef CONFIG_SCHEDSTATS | 5319 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { |
| 5315 | if (tsk_cache_hot) { | 5320 | if (tsk_cache_hot) { |
| 5316 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); | 5321 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); |
| 5317 | schedstat_inc(p, se.statistics.nr_forced_migrations); | 5322 | schedstat_inc(p, se.statistics.nr_forced_migrations); |
| 5318 | } | 5323 | } |
| 5319 | #endif | ||
| 5320 | return 1; | ||
| 5321 | } | ||
| 5322 | |||
| 5323 | if (!tsk_cache_hot || | ||
| 5324 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { | ||
| 5325 | |||
| 5326 | if (tsk_cache_hot) { | ||
| 5327 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); | ||
| 5328 | schedstat_inc(p, se.statistics.nr_forced_migrations); | ||
| 5329 | } | ||
| 5330 | |||
| 5331 | return 1; | 5324 | return 1; |
| 5332 | } | 5325 | } |
| 5333 | 5326 | ||
| @@ -5336,47 +5329,63 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 5336 | } | 5329 | } |
| 5337 | 5330 | ||
| 5338 | /* | 5331 | /* |
| 5339 | * move_one_task tries to move exactly one task from busiest to this_rq, as | 5332 | * detach_task() -- detach the task for the migration specified in env |
| 5333 | */ | ||
| 5334 | static void detach_task(struct task_struct *p, struct lb_env *env) | ||
| 5335 | { | ||
| 5336 | lockdep_assert_held(&env->src_rq->lock); | ||
| 5337 | |||
| 5338 | deactivate_task(env->src_rq, p, 0); | ||
| 5339 | p->on_rq = TASK_ON_RQ_MIGRATING; | ||
| 5340 | set_task_cpu(p, env->dst_cpu); | ||
| 5341 | } | ||
| 5342 | |||
| 5343 | /* | ||
| 5344 | * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as | ||
| 5340 | * part of active balancing operations within "domain". | 5345 | * part of active balancing operations within "domain". |
| 5341 | * Returns 1 if successful and 0 otherwise. | ||
| 5342 | * | 5346 | * |
| 5343 | * Called with both runqueues locked. | 5347 | * Returns a task if successful and NULL otherwise. |
| 5344 | */ | 5348 | */ |
| 5345 | static int move_one_task(struct lb_env *env) | 5349 | static struct task_struct *detach_one_task(struct lb_env *env) |
| 5346 | { | 5350 | { |
| 5347 | struct task_struct *p, *n; | 5351 | struct task_struct *p, *n; |
| 5348 | 5352 | ||
| 5353 | lockdep_assert_held(&env->src_rq->lock); | ||
| 5354 | |||
| 5349 | list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { | 5355 | list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { |
| 5350 | if (!can_migrate_task(p, env)) | 5356 | if (!can_migrate_task(p, env)) |
| 5351 | continue; | 5357 | continue; |
| 5352 | 5358 | ||
| 5353 | move_task(p, env); | 5359 | detach_task(p, env); |
| 5360 | |||
| 5354 | /* | 5361 | /* |
| 5355 | * Right now, this is only the second place move_task() | 5362 | * Right now, this is only the second place where |
| 5356 | * is called, so we can safely collect move_task() | 5363 | * lb_gained[env->idle] is updated (other is detach_tasks) |
| 5357 | * stats here rather than inside move_task(). | 5364 | * so we can safely collect stats here rather than |
| 5365 | * inside detach_tasks(). | ||
| 5358 | */ | 5366 | */ |
| 5359 | schedstat_inc(env->sd, lb_gained[env->idle]); | 5367 | schedstat_inc(env->sd, lb_gained[env->idle]); |
| 5360 | return 1; | 5368 | return p; |
| 5361 | } | 5369 | } |
| 5362 | return 0; | 5370 | return NULL; |
| 5363 | } | 5371 | } |
| 5364 | 5372 | ||
| 5365 | static const unsigned int sched_nr_migrate_break = 32; | 5373 | static const unsigned int sched_nr_migrate_break = 32; |
| 5366 | 5374 | ||
| 5367 | /* | 5375 | /* |
| 5368 | * move_tasks tries to move up to imbalance weighted load from busiest to | 5376 | * detach_tasks() -- tries to detach up to imbalance weighted load from |
| 5369 | * this_rq, as part of a balancing operation within domain "sd". | 5377 | * busiest_rq, as part of a balancing operation within domain "sd". |
| 5370 | * Returns 1 if successful and 0 otherwise. | ||
| 5371 | * | 5378 | * |
| 5372 | * Called with both runqueues locked. | 5379 | * Returns number of detached tasks if successful and 0 otherwise. |
| 5373 | */ | 5380 | */ |
| 5374 | static int move_tasks(struct lb_env *env) | 5381 | static int detach_tasks(struct lb_env *env) |
| 5375 | { | 5382 | { |
| 5376 | struct list_head *tasks = &env->src_rq->cfs_tasks; | 5383 | struct list_head *tasks = &env->src_rq->cfs_tasks; |
| 5377 | struct task_struct *p; | 5384 | struct task_struct *p; |
| 5378 | unsigned long load; | 5385 | unsigned long load; |
| 5379 | int pulled = 0; | 5386 | int detached = 0; |
| 5387 | |||
| 5388 | lockdep_assert_held(&env->src_rq->lock); | ||
| 5380 | 5389 | ||
| 5381 | if (env->imbalance <= 0) | 5390 | if (env->imbalance <= 0) |
| 5382 | return 0; | 5391 | return 0; |
| @@ -5407,14 +5416,16 @@ static int move_tasks(struct lb_env *env) | |||
| 5407 | if ((load / 2) > env->imbalance) | 5416 | if ((load / 2) > env->imbalance) |
| 5408 | goto next; | 5417 | goto next; |
| 5409 | 5418 | ||
| 5410 | move_task(p, env); | 5419 | detach_task(p, env); |
| 5411 | pulled++; | 5420 | list_add(&p->se.group_node, &env->tasks); |
| 5421 | |||
| 5422 | detached++; | ||
| 5412 | env->imbalance -= load; | 5423 | env->imbalance -= load; |
| 5413 | 5424 | ||
| 5414 | #ifdef CONFIG_PREEMPT | 5425 | #ifdef CONFIG_PREEMPT |
| 5415 | /* | 5426 | /* |
| 5416 | * NEWIDLE balancing is a source of latency, so preemptible | 5427 | * NEWIDLE balancing is a source of latency, so preemptible |
| 5417 | * kernels will stop after the first task is pulled to minimize | 5428 | * kernels will stop after the first task is detached to minimize |
| 5418 | * the critical section. | 5429 | * the critical section. |
| 5419 | */ | 5430 | */ |
| 5420 | if (env->idle == CPU_NEWLY_IDLE) | 5431 | if (env->idle == CPU_NEWLY_IDLE) |
| @@ -5434,13 +5445,58 @@ next: | |||
| 5434 | } | 5445 | } |
| 5435 | 5446 | ||
| 5436 | /* | 5447 | /* |
| 5437 | * Right now, this is one of only two places move_task() is called, | 5448 | * Right now, this is one of only two places we collect this stat |
| 5438 | * so we can safely collect move_task() stats here rather than | 5449 | * so we can safely collect detach_one_task() stats here rather |
| 5439 | * inside move_task(). | 5450 | * than inside detach_one_task(). |
| 5440 | */ | 5451 | */ |
| 5441 | schedstat_add(env->sd, lb_gained[env->idle], pulled); | 5452 | schedstat_add(env->sd, lb_gained[env->idle], detached); |
| 5453 | |||
| 5454 | return detached; | ||
| 5455 | } | ||
| 5456 | |||
| 5457 | /* | ||
| 5458 | * attach_task() -- attach the task detached by detach_task() to its new rq. | ||
| 5459 | */ | ||
| 5460 | static void attach_task(struct rq *rq, struct task_struct *p) | ||
| 5461 | { | ||
| 5462 | lockdep_assert_held(&rq->lock); | ||
| 5463 | |||
| 5464 | BUG_ON(task_rq(p) != rq); | ||
| 5465 | p->on_rq = TASK_ON_RQ_QUEUED; | ||
| 5466 | activate_task(rq, p, 0); | ||
| 5467 | check_preempt_curr(rq, p, 0); | ||
| 5468 | } | ||
| 5469 | |||
| 5470 | /* | ||
| 5471 | * attach_one_task() -- attaches the task returned from detach_one_task() to | ||
| 5472 | * its new rq. | ||
| 5473 | */ | ||
| 5474 | static void attach_one_task(struct rq *rq, struct task_struct *p) | ||
| 5475 | { | ||
| 5476 | raw_spin_lock(&rq->lock); | ||
| 5477 | attach_task(rq, p); | ||
| 5478 | raw_spin_unlock(&rq->lock); | ||
| 5479 | } | ||
| 5480 | |||
| 5481 | /* | ||
| 5482 | * attach_tasks() -- attaches all tasks detached by detach_tasks() to their | ||
| 5483 | * new rq. | ||
| 5484 | */ | ||
| 5485 | static void attach_tasks(struct lb_env *env) | ||
| 5486 | { | ||
| 5487 | struct list_head *tasks = &env->tasks; | ||
| 5488 | struct task_struct *p; | ||
| 5489 | |||
| 5490 | raw_spin_lock(&env->dst_rq->lock); | ||
| 5491 | |||
| 5492 | while (!list_empty(tasks)) { | ||
| 5493 | p = list_first_entry(tasks, struct task_struct, se.group_node); | ||
| 5494 | list_del_init(&p->se.group_node); | ||
| 5442 | 5495 | ||
| 5443 | return pulled; | 5496 | attach_task(env->dst_rq, p); |
| 5497 | } | ||
| 5498 | |||
| 5499 | raw_spin_unlock(&env->dst_rq->lock); | ||
| 5444 | } | 5500 | } |
| 5445 | 5501 | ||
| 5446 | #ifdef CONFIG_FAIR_GROUP_SCHED | 5502 | #ifdef CONFIG_FAIR_GROUP_SCHED |
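The detach_tasks()/attach_tasks() split above is what later lets load_balance() stop taking both runqueue locks at once: tasks are unhooked onto env->tasks under the source lock, the source lock is dropped, and the list is drained under the destination lock, with TASK_ON_RQ_MIGRATING marking tasks in flight. A simplified standalone sketch of that two-phase pattern using plain mutexes and lists:

```c
#include <pthread.h>
#include <stdio.h>

/* Two-phase migration: unhook under the source lock onto a private list,
 * drop the source lock, then attach under the destination lock. The
 * migrating flag mirrors TASK_ON_RQ_MIGRATING: it marks tasks that are on
 * neither queue but must not be touched by anyone else. */
struct task {
	int id;
	int migrating;
	struct task *next;
};

struct runqueue {
	pthread_mutex_t lock;
	struct task *head;
};

static struct task *detach_tasks(struct runqueue *src, int nr)
{
	struct task *detached = NULL;

	pthread_mutex_lock(&src->lock);
	while (src->head && nr--) {
		struct task *p = src->head;

		src->head = p->next;        /* deactivate from src       */
		p->migrating = 1;           /* in flight, hands off      */
		p->next = detached;         /* collect on a private list */
		detached = p;
	}
	pthread_mutex_unlock(&src->lock);
	return detached;
}

static void attach_tasks(struct runqueue *dst, struct task *list)
{
	pthread_mutex_lock(&dst->lock);
	while (list) {
		struct task *p = list;

		list = p->next;
		p->migrating = 0;
		p->next = dst->head;        /* activate on dst */
		dst->head = p;
	}
	pthread_mutex_unlock(&dst->lock);
}

static struct task t1 = { 1, 0, NULL };
static struct task t2 = { 2, 0, &t1 };
static struct runqueue src = { PTHREAD_MUTEX_INITIALIZER, &t2 };
static struct runqueue dst = { PTHREAD_MUTEX_INITIALIZER, NULL };

int main(void)
{
	attach_tasks(&dst, detach_tasks(&src, 1));
	printf("dst now runs task %d\n", dst.head ? dst.head->id : -1);
	return 0;
}
```

Holding only one lock at a time is the design point; the migrating flag is what keeps other paths from touching a task while it sits on the private list.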
| @@ -5559,6 +5615,13 @@ static unsigned long task_h_load(struct task_struct *p) | |||
| 5559 | #endif | 5615 | #endif |
| 5560 | 5616 | ||
| 5561 | /********** Helpers for find_busiest_group ************************/ | 5617 | /********** Helpers for find_busiest_group ************************/ |
| 5618 | |||
| 5619 | enum group_type { | ||
| 5620 | group_other = 0, | ||
| 5621 | group_imbalanced, | ||
| 5622 | group_overloaded, | ||
| 5623 | }; | ||
| 5624 | |||
| 5562 | /* | 5625 | /* |
| 5563 | * sg_lb_stats - stats of a sched_group required for load_balancing | 5626 | * sg_lb_stats - stats of a sched_group required for load_balancing |
| 5564 | */ | 5627 | */ |
| @@ -5572,7 +5635,7 @@ struct sg_lb_stats { | |||
| 5572 | unsigned int group_capacity_factor; | 5635 | unsigned int group_capacity_factor; |
| 5573 | unsigned int idle_cpus; | 5636 | unsigned int idle_cpus; |
| 5574 | unsigned int group_weight; | 5637 | unsigned int group_weight; |
| 5575 | int group_imb; /* Is there an imbalance in the group ? */ | 5638 | enum group_type group_type; |
| 5576 | int group_has_free_capacity; | 5639 | int group_has_free_capacity; |
| 5577 | #ifdef CONFIG_NUMA_BALANCING | 5640 | #ifdef CONFIG_NUMA_BALANCING |
| 5578 | unsigned int nr_numa_running; | 5641 | unsigned int nr_numa_running; |
| @@ -5610,6 +5673,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) | |||
| 5610 | .total_capacity = 0UL, | 5673 | .total_capacity = 0UL, |
| 5611 | .busiest_stat = { | 5674 | .busiest_stat = { |
| 5612 | .avg_load = 0UL, | 5675 | .avg_load = 0UL, |
| 5676 | .sum_nr_running = 0, | ||
| 5677 | .group_type = group_other, | ||
| 5613 | }, | 5678 | }, |
| 5614 | }; | 5679 | }; |
| 5615 | } | 5680 | } |
| @@ -5652,19 +5717,17 @@ unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) | |||
| 5652 | return default_scale_capacity(sd, cpu); | 5717 | return default_scale_capacity(sd, cpu); |
| 5653 | } | 5718 | } |
| 5654 | 5719 | ||
| 5655 | static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu) | 5720 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) |
| 5656 | { | 5721 | { |
| 5657 | unsigned long weight = sd->span_weight; | 5722 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) |
| 5658 | unsigned long smt_gain = sd->smt_gain; | 5723 | return sd->smt_gain / sd->span_weight; |
| 5659 | 5724 | ||
| 5660 | smt_gain /= weight; | 5725 | return SCHED_CAPACITY_SCALE; |
| 5661 | |||
| 5662 | return smt_gain; | ||
| 5663 | } | 5726 | } |
| 5664 | 5727 | ||
| 5665 | unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) | 5728 | unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) |
| 5666 | { | 5729 | { |
| 5667 | return default_scale_smt_capacity(sd, cpu); | 5730 | return default_scale_cpu_capacity(sd, cpu); |
| 5668 | } | 5731 | } |
| 5669 | 5732 | ||
| 5670 | static unsigned long scale_rt_capacity(int cpu) | 5733 | static unsigned long scale_rt_capacity(int cpu) |
| @@ -5703,18 +5766,15 @@ static unsigned long scale_rt_capacity(int cpu) | |||
| 5703 | 5766 | ||
| 5704 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) | 5767 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) |
| 5705 | { | 5768 | { |
| 5706 | unsigned long weight = sd->span_weight; | ||
| 5707 | unsigned long capacity = SCHED_CAPACITY_SCALE; | 5769 | unsigned long capacity = SCHED_CAPACITY_SCALE; |
| 5708 | struct sched_group *sdg = sd->groups; | 5770 | struct sched_group *sdg = sd->groups; |
| 5709 | 5771 | ||
| 5710 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { | 5772 | if (sched_feat(ARCH_CAPACITY)) |
| 5711 | if (sched_feat(ARCH_CAPACITY)) | 5773 | capacity *= arch_scale_cpu_capacity(sd, cpu); |
| 5712 | capacity *= arch_scale_smt_capacity(sd, cpu); | 5774 | else |
| 5713 | else | 5775 | capacity *= default_scale_cpu_capacity(sd, cpu); |
| 5714 | capacity *= default_scale_smt_capacity(sd, cpu); | ||
| 5715 | 5776 | ||
| 5716 | capacity >>= SCHED_CAPACITY_SHIFT; | 5777 | capacity >>= SCHED_CAPACITY_SHIFT; |
| 5717 | } | ||
| 5718 | 5778 | ||
| 5719 | sdg->sgc->capacity_orig = capacity; | 5779 | sdg->sgc->capacity_orig = capacity; |
| 5720 | 5780 | ||
| @@ -5891,6 +5951,18 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro | |||
| 5891 | return capacity_factor; | 5951 | return capacity_factor; |
| 5892 | } | 5952 | } |
| 5893 | 5953 | ||
| 5954 | static enum group_type | ||
| 5955 | group_classify(struct sched_group *group, struct sg_lb_stats *sgs) | ||
| 5956 | { | ||
| 5957 | if (sgs->sum_nr_running > sgs->group_capacity_factor) | ||
| 5958 | return group_overloaded; | ||
| 5959 | |||
| 5960 | if (sg_imbalanced(group)) | ||
| 5961 | return group_imbalanced; | ||
| 5962 | |||
| 5963 | return group_other; | ||
| 5964 | } | ||
| 5965 | |||
| 5894 | /** | 5966 | /** |
| 5895 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 5967 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
| 5896 | * @env: The load balancing environment. | 5968 | * @env: The load balancing environment. |
| @@ -5920,7 +5992,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 5920 | load = source_load(i, load_idx); | 5992 | load = source_load(i, load_idx); |
| 5921 | 5993 | ||
| 5922 | sgs->group_load += load; | 5994 | sgs->group_load += load; |
| 5923 | sgs->sum_nr_running += rq->nr_running; | 5995 | sgs->sum_nr_running += rq->cfs.h_nr_running; |
| 5924 | 5996 | ||
| 5925 | if (rq->nr_running > 1) | 5997 | if (rq->nr_running > 1) |
| 5926 | *overload = true; | 5998 | *overload = true; |
| @@ -5942,9 +6014,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 5942 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 6014 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
| 5943 | 6015 | ||
| 5944 | sgs->group_weight = group->group_weight; | 6016 | sgs->group_weight = group->group_weight; |
| 5945 | |||
| 5946 | sgs->group_imb = sg_imbalanced(group); | ||
| 5947 | sgs->group_capacity_factor = sg_capacity_factor(env, group); | 6017 | sgs->group_capacity_factor = sg_capacity_factor(env, group); |
| 6018 | sgs->group_type = group_classify(group, sgs); | ||
| 5948 | 6019 | ||
| 5949 | if (sgs->group_capacity_factor > sgs->sum_nr_running) | 6020 | if (sgs->group_capacity_factor > sgs->sum_nr_running) |
| 5950 | sgs->group_has_free_capacity = 1; | 6021 | sgs->group_has_free_capacity = 1; |
| @@ -5968,13 +6039,19 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
| 5968 | struct sched_group *sg, | 6039 | struct sched_group *sg, |
| 5969 | struct sg_lb_stats *sgs) | 6040 | struct sg_lb_stats *sgs) |
| 5970 | { | 6041 | { |
| 5971 | if (sgs->avg_load <= sds->busiest_stat.avg_load) | 6042 | struct sg_lb_stats *busiest = &sds->busiest_stat; |
| 5972 | return false; | ||
| 5973 | 6043 | ||
| 5974 | if (sgs->sum_nr_running > sgs->group_capacity_factor) | 6044 | if (sgs->group_type > busiest->group_type) |
| 5975 | return true; | 6045 | return true; |
| 5976 | 6046 | ||
| 5977 | if (sgs->group_imb) | 6047 | if (sgs->group_type < busiest->group_type) |
| 6048 | return false; | ||
| 6049 | |||
| 6050 | if (sgs->avg_load <= busiest->avg_load) | ||
| 6051 | return false; | ||
| 6052 | |||
| 6053 | /* This is the busiest node in its class. */ | ||
| 6054 | if (!(env->sd->flags & SD_ASYM_PACKING)) | ||
| 5978 | return true; | 6055 | return true; |
| 5979 | 6056 | ||
| 5980 | /* | 6057 | /* |
| @@ -5982,8 +6059,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
| 5982 | * numbered CPUs in the group, therefore mark all groups | 6059 | * numbered CPUs in the group, therefore mark all groups |
| 5983 | * higher than ourself as busy. | 6060 | * higher than ourself as busy. |
| 5984 | */ | 6061 | */ |
| 5985 | if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && | 6062 | if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) { |
| 5986 | env->dst_cpu < group_first_cpu(sg)) { | ||
| 5987 | if (!sds->busiest) | 6063 | if (!sds->busiest) |
| 5988 | return true; | 6064 | return true; |
| 5989 | 6065 | ||
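With the group_type classification introduced above, update_sd_pick_busiest() ranks candidate groups by class first and compares avg_load only within the same class, so an overloaded group beats a merely loaded one even at a lower average. A condensed sketch with the SD_ASYM_PACKING handling omitted:

```c
#include <stdio.h>

enum group_type { group_other = 0, group_imbalanced, group_overloaded };

struct sg_stats {
	unsigned long avg_load;
	unsigned int sum_nr_running;
	unsigned int group_capacity_factor;
	int imbalanced;
	enum group_type type;
};

static enum group_type classify(const struct sg_stats *sgs)
{
	if (sgs->sum_nr_running > sgs->group_capacity_factor)
		return group_overloaded;
	if (sgs->imbalanced)
		return group_imbalanced;
	return group_other;
}

/* Class first, average load only as a tie-breaker within the class. */
static int pick_busiest(const struct sg_stats *busiest, const struct sg_stats *sgs)
{
	if (sgs->type > busiest->type)
		return 1;
	if (sgs->type < busiest->type)
		return 0;
	return sgs->avg_load > busiest->avg_load;
}

int main(void)
{
	struct sg_stats a = { 900, 3, 4, 0, group_other };  /* busy, fits its capacity   */
	struct sg_stats b = { 700, 5, 4, 0, group_other };  /* overloaded at lower load  */

	a.type = classify(&a);
	b.type = classify(&b);
	printf("pick b over a: %d\n", pick_busiest(&a, &b));   /* -> 1 */
	return 0;
}
```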
| @@ -6228,7 +6304,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 6228 | local = &sds->local_stat; | 6304 | local = &sds->local_stat; |
| 6229 | busiest = &sds->busiest_stat; | 6305 | busiest = &sds->busiest_stat; |
| 6230 | 6306 | ||
| 6231 | if (busiest->group_imb) { | 6307 | if (busiest->group_type == group_imbalanced) { |
| 6232 | /* | 6308 | /* |
| 6233 | * In the group_imb case we cannot rely on group-wide averages | 6309 | * In the group_imb case we cannot rely on group-wide averages |
| 6234 | * to ensure cpu-load equilibrium, look at wider averages. XXX | 6310 | * to ensure cpu-load equilibrium, look at wider averages. XXX |
| @@ -6248,12 +6324,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 6248 | return fix_small_imbalance(env, sds); | 6324 | return fix_small_imbalance(env, sds); |
| 6249 | } | 6325 | } |
| 6250 | 6326 | ||
| 6251 | if (!busiest->group_imb) { | 6327 | /* |
| 6252 | /* | 6328 | * If there aren't any idle cpus, avoid creating some. |
| 6253 | * Don't want to pull so many tasks that a group would go idle. | 6329 | */ |
| 6254 | * Except of course for the group_imb case, since then we might | 6330 | if (busiest->group_type == group_overloaded && |
| 6255 | * have to drop below capacity to reach cpu-load equilibrium. | 6331 | local->group_type == group_overloaded) { |
| 6256 | */ | ||
| 6257 | load_above_capacity = | 6332 | load_above_capacity = |
| 6258 | (busiest->sum_nr_running - busiest->group_capacity_factor); | 6333 | (busiest->sum_nr_running - busiest->group_capacity_factor); |
| 6259 | 6334 | ||
| @@ -6337,7 +6412,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
| 6337 | * work because they assume all things are equal, which typically | 6412 | * work because they assume all things are equal, which typically |
| 6338 | * isn't true due to cpus_allowed constraints and the like. | 6413 | * isn't true due to cpus_allowed constraints and the like. |
| 6339 | */ | 6414 | */ |
| 6340 | if (busiest->group_imb) | 6415 | if (busiest->group_type == group_imbalanced) |
| 6341 | goto force_balance; | 6416 | goto force_balance; |
| 6342 | 6417 | ||
| 6343 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 6418 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ |
| @@ -6346,7 +6421,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
| 6346 | goto force_balance; | 6421 | goto force_balance; |
| 6347 | 6422 | ||
| 6348 | /* | 6423 | /* |
| 6349 | * If the local group is more busy than the selected busiest group | 6424 | * If the local group is busier than the selected busiest group |
| 6350 | * don't try and pull any tasks. | 6425 | * don't try and pull any tasks. |
| 6351 | */ | 6426 | */ |
| 6352 | if (local->avg_load >= busiest->avg_load) | 6427 | if (local->avg_load >= busiest->avg_load) |
| @@ -6361,13 +6436,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
| 6361 | 6436 | ||
| 6362 | if (env->idle == CPU_IDLE) { | 6437 | if (env->idle == CPU_IDLE) { |
| 6363 | /* | 6438 | /* |
| 6364 | * This cpu is idle. If the busiest group load doesn't | 6439 | * This cpu is idle. If the busiest group is not overloaded |
| 6365 | * have more tasks than the number of available cpu's and | 6440 | * and there is no imbalance between this and busiest group |
| 6366 | * there is no imbalance between this and busiest group | 6441 | * wrt idle cpus, it is balanced. The imbalance becomes |
| 6367 | * wrt to idle cpu's, it is balanced. | 6442 | * significant if the diff is greater than 1 otherwise we |
| 6443 | * might end up to just move the imbalance on another group | ||
| 6368 | */ | 6444 | */ |
| 6369 | if ((local->idle_cpus < busiest->idle_cpus) && | 6445 | if ((busiest->group_type != group_overloaded) && |
| 6370 | busiest->sum_nr_running <= busiest->group_weight) | 6446 | (local->idle_cpus <= (busiest->idle_cpus + 1))) |
| 6371 | goto out_balanced; | 6447 | goto out_balanced; |
| 6372 | } else { | 6448 | } else { |
| 6373 | /* | 6449 | /* |
| @@ -6550,6 +6626,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 6550 | .loop_break = sched_nr_migrate_break, | 6626 | .loop_break = sched_nr_migrate_break, |
| 6551 | .cpus = cpus, | 6627 | .cpus = cpus, |
| 6552 | .fbq_type = all, | 6628 | .fbq_type = all, |
| 6629 | .tasks = LIST_HEAD_INIT(env.tasks), | ||
| 6553 | }; | 6630 | }; |
| 6554 | 6631 | ||
| 6555 | /* | 6632 | /* |
| @@ -6599,23 +6676,30 @@ redo: | |||
| 6599 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); | 6676 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); |
| 6600 | 6677 | ||
| 6601 | more_balance: | 6678 | more_balance: |
| 6602 | local_irq_save(flags); | 6679 | raw_spin_lock_irqsave(&busiest->lock, flags); |
| 6603 | double_rq_lock(env.dst_rq, busiest); | ||
| 6604 | 6680 | ||
| 6605 | /* | 6681 | /* |
| 6606 | * cur_ld_moved - load moved in current iteration | 6682 | * cur_ld_moved - load moved in current iteration |
| 6607 | * ld_moved - cumulative load moved across iterations | 6683 | * ld_moved - cumulative load moved across iterations |
| 6608 | */ | 6684 | */ |
| 6609 | cur_ld_moved = move_tasks(&env); | 6685 | cur_ld_moved = detach_tasks(&env); |
| 6610 | ld_moved += cur_ld_moved; | ||
| 6611 | double_rq_unlock(env.dst_rq, busiest); | ||
| 6612 | local_irq_restore(flags); | ||
| 6613 | 6686 | ||
| 6614 | /* | 6687 | /* |
| 6615 | * some other cpu did the load balance for us. | 6688 | * We've detached some tasks from busiest_rq. Every |
| 6689 | * task is masked "TASK_ON_RQ_MIGRATING", so we can safely | ||
| 6690 | * unlock busiest->lock, and we are able to be sure | ||
| 6691 | * that nobody can manipulate the tasks in parallel. | ||
| 6692 | * See task_rq_lock() family for the details. | ||
| 6616 | */ | 6693 | */ |
| 6617 | if (cur_ld_moved && env.dst_cpu != smp_processor_id()) | 6694 | |
| 6618 | resched_cpu(env.dst_cpu); | 6695 | raw_spin_unlock(&busiest->lock); |
| 6696 | |||
| 6697 | if (cur_ld_moved) { | ||
| 6698 | attach_tasks(&env); | ||
| 6699 | ld_moved += cur_ld_moved; | ||
| 6700 | } | ||
| 6701 | |||
| 6702 | local_irq_restore(flags); | ||
| 6619 | 6703 | ||
| 6620 | if (env.flags & LBF_NEED_BREAK) { | 6704 | if (env.flags & LBF_NEED_BREAK) { |
| 6621 | env.flags &= ~LBF_NEED_BREAK; | 6705 | env.flags &= ~LBF_NEED_BREAK; |
| @@ -6665,10 +6749,8 @@ more_balance: | |||
| 6665 | if (sd_parent) { | 6749 | if (sd_parent) { |
| 6666 | int *group_imbalance = &sd_parent->groups->sgc->imbalance; | 6750 | int *group_imbalance = &sd_parent->groups->sgc->imbalance; |
| 6667 | 6751 | ||
| 6668 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { | 6752 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) |
| 6669 | *group_imbalance = 1; | 6753 | *group_imbalance = 1; |
| 6670 | } else if (*group_imbalance) | ||
| 6671 | *group_imbalance = 0; | ||
| 6672 | } | 6754 | } |
| 6673 | 6755 | ||
| 6674 | /* All tasks on this runqueue were pinned by CPU affinity */ | 6756 | /* All tasks on this runqueue were pinned by CPU affinity */ |
| @@ -6679,7 +6761,7 @@ more_balance: | |||
| 6679 | env.loop_break = sched_nr_migrate_break; | 6761 | env.loop_break = sched_nr_migrate_break; |
| 6680 | goto redo; | 6762 | goto redo; |
| 6681 | } | 6763 | } |
| 6682 | goto out_balanced; | 6764 | goto out_all_pinned; |
| 6683 | } | 6765 | } |
| 6684 | } | 6766 | } |
| 6685 | 6767 | ||
| @@ -6744,7 +6826,7 @@ more_balance: | |||
| 6744 | * If we've begun active balancing, start to back off. This | 6826 | * If we've begun active balancing, start to back off. This |
| 6745 | * case may not be covered by the all_pinned logic if there | 6827 | * case may not be covered by the all_pinned logic if there |
| 6746 | * is only 1 task on the busy runqueue (because we don't call | 6828 | * is only 1 task on the busy runqueue (because we don't call |
| 6747 | * move_tasks). | 6829 | * detach_tasks). |
| 6748 | */ | 6830 | */ |
| 6749 | if (sd->balance_interval < sd->max_interval) | 6831 | if (sd->balance_interval < sd->max_interval) |
| 6750 | sd->balance_interval *= 2; | 6832 | sd->balance_interval *= 2; |
| @@ -6753,6 +6835,23 @@ more_balance: | |||
| 6753 | goto out; | 6835 | goto out; |
| 6754 | 6836 | ||
| 6755 | out_balanced: | 6837 | out_balanced: |
| 6838 | /* | ||
| 6839 | * We reach balance although we may have faced some affinity | ||
| 6840 | * constraints. Clear the imbalance flag if it was set. | ||
| 6841 | */ | ||
| 6842 | if (sd_parent) { | ||
| 6843 | int *group_imbalance = &sd_parent->groups->sgc->imbalance; | ||
| 6844 | |||
| 6845 | if (*group_imbalance) | ||
| 6846 | *group_imbalance = 0; | ||
| 6847 | } | ||
| 6848 | |||
| 6849 | out_all_pinned: | ||
| 6850 | /* | ||
| 6851 | * We reach balance because all tasks are pinned at this level so | ||
| 6852 | * we can't migrate them. Let the imbalance flag set so parent level | ||
| 6853 | * can try to migrate them. | ||
| 6854 | */ | ||
| 6756 | schedstat_inc(sd, lb_balanced[idle]); | 6855 | schedstat_inc(sd, lb_balanced[idle]); |
| 6757 | 6856 | ||
| 6758 | sd->nr_balance_failed = 0; | 6857 | sd->nr_balance_failed = 0; |
| @@ -6914,6 +7013,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
| 6914 | int target_cpu = busiest_rq->push_cpu; | 7013 | int target_cpu = busiest_rq->push_cpu; |
| 6915 | struct rq *target_rq = cpu_rq(target_cpu); | 7014 | struct rq *target_rq = cpu_rq(target_cpu); |
| 6916 | struct sched_domain *sd; | 7015 | struct sched_domain *sd; |
| 7016 | struct task_struct *p = NULL; | ||
| 6917 | 7017 | ||
| 6918 | raw_spin_lock_irq(&busiest_rq->lock); | 7018 | raw_spin_lock_irq(&busiest_rq->lock); |
| 6919 | 7019 | ||
| @@ -6933,9 +7033,6 @@ static int active_load_balance_cpu_stop(void *data) | |||
| 6933 | */ | 7033 | */ |
| 6934 | BUG_ON(busiest_rq == target_rq); | 7034 | BUG_ON(busiest_rq == target_rq); |
| 6935 | 7035 | ||
| 6936 | /* move a task from busiest_rq to target_rq */ | ||
| 6937 | double_lock_balance(busiest_rq, target_rq); | ||
| 6938 | |||
| 6939 | /* Search for an sd spanning us and the target CPU. */ | 7036 | /* Search for an sd spanning us and the target CPU. */ |
| 6940 | rcu_read_lock(); | 7037 | rcu_read_lock(); |
| 6941 | for_each_domain(target_cpu, sd) { | 7038 | for_each_domain(target_cpu, sd) { |
| @@ -6956,16 +7053,22 @@ static int active_load_balance_cpu_stop(void *data) | |||
| 6956 | 7053 | ||
| 6957 | schedstat_inc(sd, alb_count); | 7054 | schedstat_inc(sd, alb_count); |
| 6958 | 7055 | ||
| 6959 | if (move_one_task(&env)) | 7056 | p = detach_one_task(&env); |
| 7057 | if (p) | ||
| 6960 | schedstat_inc(sd, alb_pushed); | 7058 | schedstat_inc(sd, alb_pushed); |
| 6961 | else | 7059 | else |
| 6962 | schedstat_inc(sd, alb_failed); | 7060 | schedstat_inc(sd, alb_failed); |
| 6963 | } | 7061 | } |
| 6964 | rcu_read_unlock(); | 7062 | rcu_read_unlock(); |
| 6965 | double_unlock_balance(busiest_rq, target_rq); | ||
| 6966 | out_unlock: | 7063 | out_unlock: |
| 6967 | busiest_rq->active_balance = 0; | 7064 | busiest_rq->active_balance = 0; |
| 6968 | raw_spin_unlock_irq(&busiest_rq->lock); | 7065 | raw_spin_unlock(&busiest_rq->lock); |
| 7066 | |||
| 7067 | if (p) | ||
| 7068 | attach_one_task(target_rq, p); | ||
| 7069 | |||
| 7070 | local_irq_enable(); | ||
| 7071 | |||
| 6969 | return 0; | 7072 | return 0; |
| 6970 | } | 7073 | } |
| 6971 | 7074 | ||
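As this hunk reads, `double_lock_balance()`/`double_unlock_balance()` are gone: the task is detached while only the busiest runqueue's lock is held, and attached to the target runqueue only after that lock has been dropped. Below is a minimal user-space model of the locking pattern; `fake_rq`, `detach_one()` and `attach_one()` are made-up stand-ins, not the kernel's `detach_one_task()`/`attach_one_task()`.

```c
/* Model of "detach under one lock, attach under the other" (build with -pthread). */
#include <pthread.h>
#include <stdio.h>

struct fake_rq { pthread_mutex_t lock; int nr_running; };
struct fake_task { const char *name; };

static struct fake_task *detach_one(struct fake_rq *busiest, struct fake_task *t)
{
	/* caller holds busiest->lock; no other lock is needed here */
	busiest->nr_running--;
	return t;
}

static void attach_one(struct fake_rq *target, struct fake_task *t)
{
	pthread_mutex_lock(&target->lock);
	target->nr_running++;
	printf("attached %s\n", t->name);
	pthread_mutex_unlock(&target->lock);
}

int main(void)
{
	struct fake_rq busiest = { PTHREAD_MUTEX_INITIALIZER, 1 };
	struct fake_rq target  = { PTHREAD_MUTEX_INITIALIZER, 0 };
	struct fake_task t = { "worker" };
	struct fake_task *p;

	pthread_mutex_lock(&busiest.lock);
	p = detach_one(&busiest, &t);		/* only busiest lock held */
	pthread_mutex_unlock(&busiest.lock);

	if (p)
		attach_one(&target, p);		/* only target lock held */
	return 0;
}
```

The point of the pattern is that the two runqueue locks are never held at the same time, which is what the removed double-lock calls used to require.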
| @@ -7465,7 +7568,7 @@ static void task_fork_fair(struct task_struct *p) | |||
| 7465 | static void | 7568 | static void |
| 7466 | prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) | 7569 | prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) |
| 7467 | { | 7570 | { |
| 7468 | if (!p->se.on_rq) | 7571 | if (!task_on_rq_queued(p)) |
| 7469 | return; | 7572 | return; |
| 7470 | 7573 | ||
| 7471 | /* | 7574 | /* |
| @@ -7490,11 +7593,11 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
| 7490 | * switched back to the fair class the enqueue_entity(.flags=0) will | 7593 | * switched back to the fair class the enqueue_entity(.flags=0) will |
| 7491 | * do the right thing. | 7594 | * do the right thing. |
| 7492 | * | 7595 | * |
| 7493 | * If it's on_rq, then the dequeue_entity(.flags=0) will already | 7596 | * If it's queued, then the dequeue_entity(.flags=0) will already |
| 7494 | * have normalized the vruntime, if it's !on_rq, then only when | 7597 | * have normalized the vruntime, if it's !queued, then only when |
| 7495 | * the task is sleeping will it still have non-normalized vruntime. | 7598 | * the task is sleeping will it still have non-normalized vruntime. |
| 7496 | */ | 7599 | */ |
| 7497 | if (!p->on_rq && p->state != TASK_RUNNING) { | 7600 | if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) { |
| 7498 | /* | 7601 | /* |
| 7499 | * Fix up our vruntime so that the current sleep doesn't | 7602 | * Fix up our vruntime so that the current sleep doesn't |
| 7500 | * cause 'unlimited' sleep bonus. | 7603 | * cause 'unlimited' sleep bonus. |
| @@ -7521,15 +7624,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
| 7521 | */ | 7624 | */ |
| 7522 | static void switched_to_fair(struct rq *rq, struct task_struct *p) | 7625 | static void switched_to_fair(struct rq *rq, struct task_struct *p) |
| 7523 | { | 7626 | { |
| 7524 | struct sched_entity *se = &p->se; | ||
| 7525 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7627 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7628 | struct sched_entity *se = &p->se; | ||
| 7526 | /* | 7629 | /* |
| 7527 | * Since the real-depth could have been changed (only the FAIR | 7630 | * Since the real-depth could have been changed (only the FAIR |
| 7528 | * class maintains a depth value), reset depth properly. | 7631 | * class maintains a depth value), reset depth properly. |
| 7529 | */ | 7632 | */ |
| 7530 | se->depth = se->parent ? se->parent->depth + 1 : 0; | 7633 | se->depth = se->parent ? se->parent->depth + 1 : 0; |
| 7531 | #endif | 7634 | #endif |
| 7532 | if (!se->on_rq) | 7635 | if (!task_on_rq_queued(p)) |
| 7533 | return; | 7636 | return; |
| 7534 | 7637 | ||
| 7535 | /* | 7638 | /* |
| @@ -7575,7 +7678,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) | |||
| 7575 | } | 7678 | } |
| 7576 | 7679 | ||
| 7577 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7680 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7578 | static void task_move_group_fair(struct task_struct *p, int on_rq) | 7681 | static void task_move_group_fair(struct task_struct *p, int queued) |
| 7579 | { | 7682 | { |
| 7580 | struct sched_entity *se = &p->se; | 7683 | struct sched_entity *se = &p->se; |
| 7581 | struct cfs_rq *cfs_rq; | 7684 | struct cfs_rq *cfs_rq; |
| @@ -7594,7 +7697,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) | |||
| 7594 | * fair sleeper stuff for the first placement, but who cares. | 7697 | * fair sleeper stuff for the first placement, but who cares. |
| 7595 | */ | 7698 | */ |
| 7596 | /* | 7699 | /* |
| 7597 | * When !on_rq, vruntime of the task has usually NOT been normalized. | 7700 | * When !queued, vruntime of the task has usually NOT been normalized. |
| 7598 | * But there are some cases where it has already been normalized: | 7701 | * But there are some cases where it has already been normalized: |
| 7599 | * | 7702 | * |
| 7600 | * - Moving a forked child which is waiting for being woken up by | 7703 | * - Moving a forked child which is waiting for being woken up by |
| @@ -7605,14 +7708,14 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) | |||
| 7605 | * To prevent boost or penalty in the new cfs_rq caused by delta | 7708 | * To prevent boost or penalty in the new cfs_rq caused by delta |
| 7606 | * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. | 7709 | * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. |
| 7607 | */ | 7710 | */ |
| 7608 | if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) | 7711 | if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING)) |
| 7609 | on_rq = 1; | 7712 | queued = 1; |
| 7610 | 7713 | ||
| 7611 | if (!on_rq) | 7714 | if (!queued) |
| 7612 | se->vruntime -= cfs_rq_of(se)->min_vruntime; | 7715 | se->vruntime -= cfs_rq_of(se)->min_vruntime; |
| 7613 | set_task_rq(p, task_cpu(p)); | 7716 | set_task_rq(p, task_cpu(p)); |
| 7614 | se->depth = se->parent ? se->parent->depth + 1 : 0; | 7717 | se->depth = se->parent ? se->parent->depth + 1 : 0; |
| 7615 | if (!on_rq) { | 7718 | if (!queued) { |
| 7616 | cfs_rq = cfs_rq_of(se); | 7719 | cfs_rq = cfs_rq_of(se); |
| 7617 | se->vruntime += cfs_rq->min_vruntime; | 7720 | se->vruntime += cfs_rq->min_vruntime; |
| 7618 | #ifdef CONFIG_SMP | 7721 | #ifdef CONFIG_SMP |
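The comment block in this hunk explains when a task's vruntime is, or is not, already normalized while it moves between groups. A tiny numerical model of the renormalization step may help; the structures and values below are invented for illustration and are not the CFS implementation.

```c
/*
 * When a task that is not queued changes cfs_rq, its vruntime is made
 * relative by subtracting the old queue's min_vruntime and made absolute
 * again by adding the new queue's min_vruntime, so the move neither boosts
 * nor penalizes it.
 */
#include <stdio.h>

struct fake_cfs_rq { unsigned long long min_vruntime; };
struct fake_se { unsigned long long vruntime; };

static void move_group(struct fake_se *se, const struct fake_cfs_rq *from,
		       const struct fake_cfs_rq *to, int queued)
{
	if (!queued)
		se->vruntime -= from->min_vruntime;	/* relative to old queue */
	/* ... the task is switched to the new group/cpu here ... */
	if (!queued)
		se->vruntime += to->min_vruntime;	/* absolute on new queue */
}

int main(void)
{
	struct fake_cfs_rq from = { .min_vruntime = 1000 };
	struct fake_cfs_rq to   = { .min_vruntime = 5000 };
	struct fake_se se       = { .vruntime = 1200 };	/* 200 ahead of old min */

	move_group(&se, &from, &to, 0);
	printf("vruntime on new queue: %llu\n", se.vruntime);	/* 5200 */
	return 0;
}
```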
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 11e7bc434f43..c47fce75e666 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
| @@ -147,6 +147,9 @@ use_default: | |||
| 147 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) | 147 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) |
| 148 | goto use_default; | 148 | goto use_default; |
| 149 | 149 | ||
| 150 | /* Take note of the planned idle state. */ | ||
| 151 | idle_set_state(this_rq(), &drv->states[next_state]); | ||
| 152 | |||
| 150 | /* | 153 | /* |
| 151 | * Enter the idle state previously returned by the governor decision. | 154 | * Enter the idle state previously returned by the governor decision. |
| 152 | * This function will block until an interrupt occurs and will take | 155 | * This function will block until an interrupt occurs and will take |
| @@ -154,6 +157,9 @@ use_default: | |||
| 154 | */ | 157 | */ |
| 155 | entered_state = cpuidle_enter(drv, dev, next_state); | 158 | entered_state = cpuidle_enter(drv, dev, next_state); |
| 156 | 159 | ||
| 160 | /* The cpu is no longer idle or about to enter idle. */ | ||
| 161 | idle_set_state(this_rq(), NULL); | ||
| 162 | |||
| 157 | if (broadcast) | 163 | if (broadcast) |
| 158 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); | 164 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); |
| 159 | 165 | ||
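The two added lines bracket the low-level idle entry: the chosen cpuidle state is published on the runqueue before `cpuidle_enter()` and cleared as soon as the CPU is running again, so other code (for example, a load balancer) can inspect it. A self-contained sketch of that bookkeeping, with assumed field names rather than the real kernel types, might look like this:

```c
#include <stdio.h>
#include <stddef.h>

struct idle_state { const char *name; unsigned int exit_latency_us; };
struct fake_rq { struct idle_state *idle_state; };	/* NULL while running */

static void fake_cpuidle_enter(const struct idle_state *s)
{
	/* in reality this blocks until an interrupt wakes the CPU */
	printf("entering %s (exit latency %u us)\n", s->name, s->exit_latency_us);
}

static void do_idle(struct fake_rq *rq, struct idle_state *next)
{
	rq->idle_state = next;		/* take note of the planned idle state */
	fake_cpuidle_enter(next);
	rq->idle_state = NULL;		/* the cpu is no longer idle */
}

int main(void)
{
	struct idle_state c6 = { "C6", 133 };
	struct fake_rq rq = { NULL };

	do_idle(&rq, &c6);
	return 0;
}
```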
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 5f6edca4fafd..87ea5bf1b87f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -1448,7 +1448,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) | |||
| 1448 | * means a dl or stop task can slip in, in which case we need | 1448 | * means a dl or stop task can slip in, in which case we need |
| 1449 | * to re-start task selection. | 1449 | * to re-start task selection. |
| 1450 | */ | 1450 | */ |
| 1451 | if (unlikely((rq->stop && rq->stop->on_rq) || | 1451 | if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) || |
| 1452 | rq->dl.dl_nr_running)) | 1452 | rq->dl.dl_nr_running)) |
| 1453 | return RETRY_TASK; | 1453 | return RETRY_TASK; |
| 1454 | } | 1454 | } |
| @@ -1468,8 +1468,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) | |||
| 1468 | p = _pick_next_task_rt(rq); | 1468 | p = _pick_next_task_rt(rq); |
| 1469 | 1469 | ||
| 1470 | /* The running task is never eligible for pushing */ | 1470 | /* The running task is never eligible for pushing */ |
| 1471 | if (p) | 1471 | dequeue_pushable_task(rq, p); |
| 1472 | dequeue_pushable_task(rq, p); | ||
| 1473 | 1472 | ||
| 1474 | set_post_schedule(rq); | 1473 | set_post_schedule(rq); |
| 1475 | 1474 | ||
| @@ -1624,7 +1623,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
| 1624 | !cpumask_test_cpu(lowest_rq->cpu, | 1623 | !cpumask_test_cpu(lowest_rq->cpu, |
| 1625 | tsk_cpus_allowed(task)) || | 1624 | tsk_cpus_allowed(task)) || |
| 1626 | task_running(rq, task) || | 1625 | task_running(rq, task) || |
| 1627 | !task->on_rq)) { | 1626 | !task_on_rq_queued(task))) { |
| 1628 | 1627 | ||
| 1629 | double_unlock_balance(rq, lowest_rq); | 1628 | double_unlock_balance(rq, lowest_rq); |
| 1630 | lowest_rq = NULL; | 1629 | lowest_rq = NULL; |
| @@ -1658,7 +1657,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) | |||
| 1658 | BUG_ON(task_current(rq, p)); | 1657 | BUG_ON(task_current(rq, p)); |
| 1659 | BUG_ON(p->nr_cpus_allowed <= 1); | 1658 | BUG_ON(p->nr_cpus_allowed <= 1); |
| 1660 | 1659 | ||
| 1661 | BUG_ON(!p->on_rq); | 1660 | BUG_ON(!task_on_rq_queued(p)); |
| 1662 | BUG_ON(!rt_task(p)); | 1661 | BUG_ON(!rt_task(p)); |
| 1663 | 1662 | ||
| 1664 | return p; | 1663 | return p; |
| @@ -1809,7 +1808,7 @@ static int pull_rt_task(struct rq *this_rq) | |||
| 1809 | */ | 1808 | */ |
| 1810 | if (p && (p->prio < this_rq->rt.highest_prio.curr)) { | 1809 | if (p && (p->prio < this_rq->rt.highest_prio.curr)) { |
| 1811 | WARN_ON(p == src_rq->curr); | 1810 | WARN_ON(p == src_rq->curr); |
| 1812 | WARN_ON(!p->on_rq); | 1811 | WARN_ON(!task_on_rq_queued(p)); |
| 1813 | 1812 | ||
| 1814 | /* | 1813 | /* |
| 1815 | * There's a chance that p is higher in priority | 1814 | * There's a chance that p is higher in priority |
| @@ -1870,7 +1869,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, | |||
| 1870 | 1869 | ||
| 1871 | BUG_ON(!rt_task(p)); | 1870 | BUG_ON(!rt_task(p)); |
| 1872 | 1871 | ||
| 1873 | if (!p->on_rq) | 1872 | if (!task_on_rq_queued(p)) |
| 1874 | return; | 1873 | return; |
| 1875 | 1874 | ||
| 1876 | weight = cpumask_weight(new_mask); | 1875 | weight = cpumask_weight(new_mask); |
| @@ -1936,7 +1935,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) | |||
| 1936 | * we may need to handle the pulling of RT tasks | 1935 | * we may need to handle the pulling of RT tasks |
| 1937 | * now. | 1936 | * now. |
| 1938 | */ | 1937 | */ |
| 1939 | if (!p->on_rq || rq->rt.rt_nr_running) | 1938 | if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) |
| 1940 | return; | 1939 | return; |
| 1941 | 1940 | ||
| 1942 | if (pull_rt_task(rq)) | 1941 | if (pull_rt_task(rq)) |
| @@ -1970,7 +1969,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
| 1970 | * If that current running task is also an RT task | 1969 | * If that current running task is also an RT task |
| 1971 | * then see if we can move to another run queue. | 1970 | * then see if we can move to another run queue. |
| 1972 | */ | 1971 | */ |
| 1973 | if (p->on_rq && rq->curr != p) { | 1972 | if (task_on_rq_queued(p) && rq->curr != p) { |
| 1974 | #ifdef CONFIG_SMP | 1973 | #ifdef CONFIG_SMP |
| 1975 | if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && | 1974 | if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && |
| 1976 | /* Don't resched if we changed runqueues */ | 1975 | /* Don't resched if we changed runqueues */ |
| @@ -1989,7 +1988,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
| 1989 | static void | 1988 | static void |
| 1990 | prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) | 1989 | prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) |
| 1991 | { | 1990 | { |
| 1992 | if (!p->on_rq) | 1991 | if (!task_on_rq_queued(p)) |
| 1993 | return; | 1992 | return; |
| 1994 | 1993 | ||
| 1995 | if (rq->curr == p) { | 1994 | if (rq->curr == p) { |
| @@ -2073,7 +2072,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | |||
| 2073 | for_each_sched_rt_entity(rt_se) { | 2072 | for_each_sched_rt_entity(rt_se) { |
| 2074 | if (rt_se->run_list.prev != rt_se->run_list.next) { | 2073 | if (rt_se->run_list.prev != rt_se->run_list.next) { |
| 2075 | requeue_task_rt(rq, p, 0); | 2074 | requeue_task_rt(rq, p, 0); |
| 2076 | set_tsk_need_resched(p); | 2075 | resched_curr(rq); |
| 2077 | return; | 2076 | return; |
| 2078 | } | 2077 | } |
| 2079 | } | 2078 | } |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 579712f4e9d5..6130251de280 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -14,6 +14,11 @@ | |||
| 14 | #include "cpuacct.h" | 14 | #include "cpuacct.h" |
| 15 | 15 | ||
| 16 | struct rq; | 16 | struct rq; |
| 17 | struct cpuidle_state; | ||
| 18 | |||
| 19 | /* task_struct::on_rq states: */ | ||
| 20 | #define TASK_ON_RQ_QUEUED 1 | ||
| 21 | #define TASK_ON_RQ_MIGRATING 2 | ||
| 17 | 22 | ||
| 18 | extern __read_mostly int scheduler_running; | 23 | extern __read_mostly int scheduler_running; |
| 19 | 24 | ||
| @@ -126,6 +131,9 @@ struct rt_bandwidth { | |||
| 126 | u64 rt_runtime; | 131 | u64 rt_runtime; |
| 127 | struct hrtimer rt_period_timer; | 132 | struct hrtimer rt_period_timer; |
| 128 | }; | 133 | }; |
| 134 | |||
| 135 | void __dl_clear_params(struct task_struct *p); | ||
| 136 | |||
| 129 | /* | 137 | /* |
| 130 | * To keep the bandwidth of -deadline tasks and groups under control | 138 | * To keep the bandwidth of -deadline tasks and groups under control |
| 131 | * we need some place where: | 139 | * we need some place where: |
| @@ -184,7 +192,7 @@ struct cfs_bandwidth { | |||
| 184 | raw_spinlock_t lock; | 192 | raw_spinlock_t lock; |
| 185 | ktime_t period; | 193 | ktime_t period; |
| 186 | u64 quota, runtime; | 194 | u64 quota, runtime; |
| 187 | s64 hierarchal_quota; | 195 | s64 hierarchical_quota; |
| 188 | u64 runtime_expires; | 196 | u64 runtime_expires; |
| 189 | 197 | ||
| 190 | int idle, timer_active; | 198 | int idle, timer_active; |
| @@ -636,6 +644,11 @@ struct rq { | |||
| 636 | #ifdef CONFIG_SMP | 644 | #ifdef CONFIG_SMP |
| 637 | struct llist_head wake_list; | 645 | struct llist_head wake_list; |
| 638 | #endif | 646 | #endif |
| 647 | |||
| 648 | #ifdef CONFIG_CPU_IDLE | ||
| 649 | /* Must be inspected within an RCU lock section */ | ||
| 650 | struct cpuidle_state *idle_state; | ||
| 651 | #endif | ||
| 639 | }; | 652 | }; |
| 640 | 653 | ||
| 641 | static inline int cpu_of(struct rq *rq) | 654 | static inline int cpu_of(struct rq *rq) |
| @@ -647,7 +660,7 @@ static inline int cpu_of(struct rq *rq) | |||
| 647 | #endif | 660 | #endif |
| 648 | } | 661 | } |
| 649 | 662 | ||
| 650 | DECLARE_PER_CPU(struct rq, runqueues); | 663 | DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
| 651 | 664 | ||
| 652 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | 665 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) |
| 653 | #define this_rq() (&__get_cpu_var(runqueues)) | 666 | #define this_rq() (&__get_cpu_var(runqueues)) |
| @@ -942,6 +955,15 @@ static inline int task_running(struct rq *rq, struct task_struct *p) | |||
| 942 | #endif | 955 | #endif |
| 943 | } | 956 | } |
| 944 | 957 | ||
| 958 | static inline int task_on_rq_queued(struct task_struct *p) | ||
| 959 | { | ||
| 960 | return p->on_rq == TASK_ON_RQ_QUEUED; | ||
| 961 | } | ||
| 962 | |||
| 963 | static inline int task_on_rq_migrating(struct task_struct *p) | ||
| 964 | { | ||
| 965 | return p->on_rq == TASK_ON_RQ_MIGRATING; | ||
| 966 | } | ||
| 945 | 967 | ||
| 946 | #ifndef prepare_arch_switch | 968 | #ifndef prepare_arch_switch |
| 947 | # define prepare_arch_switch(next) do { } while (0) | 969 | # define prepare_arch_switch(next) do { } while (0) |
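With the `TASK_ON_RQ_QUEUED`/`TASK_ON_RQ_MIGRATING` encoding above, a plain truth test of `p->on_rq` can no longer distinguish a queued task from one in the middle of a migration, which is why the callers throughout this series switch to `task_on_rq_queued()`. A stand-alone sketch of the encoding, with simplified types for illustration only:

```c
#include <stdio.h>

#define TASK_ON_RQ_QUEUED	1
#define TASK_ON_RQ_MIGRATING	2

struct fake_task { int on_rq; };	/* 0 means "not on any runqueue" */

static inline int task_on_rq_queued(const struct fake_task *p)
{
	return p->on_rq == TASK_ON_RQ_QUEUED;
}

static inline int task_on_rq_migrating(const struct fake_task *p)
{
	return p->on_rq == TASK_ON_RQ_MIGRATING;
}

int main(void)
{
	struct fake_task p = { .on_rq = TASK_ON_RQ_MIGRATING };

	/* a bare "if (p.on_rq)" would be true here, yet the task is not queued */
	printf("queued=%d migrating=%d\n",
	       task_on_rq_queued(&p), task_on_rq_migrating(&p));
	return 0;
}
```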
| @@ -953,7 +975,6 @@ static inline int task_running(struct rq *rq, struct task_struct *p) | |||
| 953 | # define finish_arch_post_lock_switch() do { } while (0) | 975 | # define finish_arch_post_lock_switch() do { } while (0) |
| 954 | #endif | 976 | #endif |
| 955 | 977 | ||
| 956 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
| 957 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 978 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
| 958 | { | 979 | { |
| 959 | #ifdef CONFIG_SMP | 980 | #ifdef CONFIG_SMP |
| @@ -991,35 +1012,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
| 991 | raw_spin_unlock_irq(&rq->lock); | 1012 | raw_spin_unlock_irq(&rq->lock); |
| 992 | } | 1013 | } |
| 993 | 1014 | ||
| 994 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
| 995 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | ||
| 996 | { | ||
| 997 | #ifdef CONFIG_SMP | ||
| 998 | /* | ||
| 999 | * We can optimise this out completely for !SMP, because the | ||
| 1000 | * SMP rebalancing from interrupt is the only thing that cares | ||
| 1001 | * here. | ||
| 1002 | */ | ||
| 1003 | next->on_cpu = 1; | ||
| 1004 | #endif | ||
| 1005 | raw_spin_unlock(&rq->lock); | ||
| 1006 | } | ||
| 1007 | |||
| 1008 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | ||
| 1009 | { | ||
| 1010 | #ifdef CONFIG_SMP | ||
| 1011 | /* | ||
| 1012 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | ||
| 1013 | * We must ensure this doesn't happen until the switch is completely | ||
| 1014 | * finished. | ||
| 1015 | */ | ||
| 1016 | smp_wmb(); | ||
| 1017 | prev->on_cpu = 0; | ||
| 1018 | #endif | ||
| 1019 | local_irq_enable(); | ||
| 1020 | } | ||
| 1021 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
| 1022 | |||
| 1023 | /* | 1015 | /* |
| 1024 | * wake flags | 1016 | * wake flags |
| 1025 | */ | 1017 | */ |
| @@ -1180,6 +1172,30 @@ static inline void idle_exit_fair(struct rq *rq) { } | |||
| 1180 | 1172 | ||
| 1181 | #endif | 1173 | #endif |
| 1182 | 1174 | ||
| 1175 | #ifdef CONFIG_CPU_IDLE | ||
| 1176 | static inline void idle_set_state(struct rq *rq, | ||
| 1177 | struct cpuidle_state *idle_state) | ||
| 1178 | { | ||
| 1179 | rq->idle_state = idle_state; | ||
| 1180 | } | ||
| 1181 | |||
| 1182 | static inline struct cpuidle_state *idle_get_state(struct rq *rq) | ||
| 1183 | { | ||
| 1184 | WARN_ON(!rcu_read_lock_held()); | ||
| 1185 | return rq->idle_state; | ||
| 1186 | } | ||
| 1187 | #else | ||
| 1188 | static inline void idle_set_state(struct rq *rq, | ||
| 1189 | struct cpuidle_state *idle_state) | ||
| 1190 | { | ||
| 1191 | } | ||
| 1192 | |||
| 1193 | static inline struct cpuidle_state *idle_get_state(struct rq *rq) | ||
| 1194 | { | ||
| 1195 | return NULL; | ||
| 1196 | } | ||
| 1197 | #endif | ||
| 1198 | |||
| 1183 | extern void sysrq_sched_debug_show(void); | 1199 | extern void sysrq_sched_debug_show(void); |
| 1184 | extern void sched_init_granularity(void); | 1200 | extern void sched_init_granularity(void); |
| 1185 | extern void update_max_interval(void); | 1201 | extern void update_max_interval(void); |
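One intended consumer of `idle_get_state()` is idle-CPU selection: among idle candidates, prefer the CPU whose idle state has the smallest exit latency, since it can start running a task soonest. The sketch below models that policy in user space; it is an assumption-laden illustration, not the kernel's actual "idlest CPU" logic.

```c
#include <stdio.h>
#include <stddef.h>
#include <limits.h>

struct idle_state { const char *name; unsigned int exit_latency_us; };
struct fake_rq { int cpu; const struct idle_state *idle_state; };	/* NULL if busy */

static int pick_shallowest_idle_cpu(const struct fake_rq *rqs, int nr)
{
	unsigned int best_latency = UINT_MAX;
	int best_cpu = -1;

	for (int i = 0; i < nr; i++) {
		const struct idle_state *s = rqs[i].idle_state;

		if (!s)
			continue;	/* CPU is not idle */
		if (s->exit_latency_us < best_latency) {
			best_latency = s->exit_latency_us;
			best_cpu = rqs[i].cpu;
		}
	}
	return best_cpu;
}

int main(void)
{
	const struct idle_state c1 = { "C1", 2 }, c6 = { "C6", 133 };
	struct fake_rq rqs[] = { { 0, NULL }, { 1, &c6 }, { 2, &c1 } };

	printf("preferred idle cpu: %d\n", pick_shallowest_idle_cpu(rqs, 3));	/* 2 */
	return 0;
}
```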
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index bfe0edadbfbb..67426e529f59 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
| @@ -28,7 +28,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev) | |||
| 28 | { | 28 | { |
| 29 | struct task_struct *stop = rq->stop; | 29 | struct task_struct *stop = rq->stop; |
| 30 | 30 | ||
| 31 | if (!stop || !stop->on_rq) | 31 | if (!stop || !task_on_rq_queued(stop)) |
| 32 | return NULL; | 32 | return NULL; |
| 33 | 33 | ||
| 34 | put_prev_task(rq, prev); | 34 | put_prev_task(rq, prev); |
diff --git a/kernel/smp.c b/kernel/smp.c index aff8aa14f547..9e0d0b289118 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
| @@ -13,6 +13,7 @@ | |||
| 13 | #include <linux/gfp.h> | 13 | #include <linux/gfp.h> |
| 14 | #include <linux/smp.h> | 14 | #include <linux/smp.h> |
| 15 | #include <linux/cpu.h> | 15 | #include <linux/cpu.h> |
| 16 | #include <linux/sched.h> | ||
| 16 | 17 | ||
| 17 | #include "smpboot.h" | 18 | #include "smpboot.h" |
| 18 | 19 | ||
| @@ -699,3 +700,24 @@ void kick_all_cpus_sync(void) | |||
| 699 | smp_call_function(do_nothing, NULL, 1); | 700 | smp_call_function(do_nothing, NULL, 1); |
| 700 | } | 701 | } |
| 701 | EXPORT_SYMBOL_GPL(kick_all_cpus_sync); | 702 | EXPORT_SYMBOL_GPL(kick_all_cpus_sync); |
| 703 | |||
| 704 | /** | ||
| 705 | * wake_up_all_idle_cpus - break all cpus out of idle | ||
| 706 | * wake_up_all_idle_cpus tries to break out of idle all cpus that are in an | ||
| 707 | * idle state, including idle polling cpus; for cpus that are not idle, | ||
| 708 | * nothing is done. | ||
| 709 | */ | ||
| 710 | void wake_up_all_idle_cpus(void) | ||
| 711 | { | ||
| 712 | int cpu; | ||
| 713 | |||
| 714 | preempt_disable(); | ||
| 715 | for_each_online_cpu(cpu) { | ||
| 716 | if (cpu == smp_processor_id()) | ||
| 717 | continue; | ||
| 718 | |||
| 719 | wake_up_if_idle(cpu); | ||
| 720 | } | ||
| 721 | preempt_enable(); | ||
| 722 | } | ||
| 723 | EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); | ||
diff --git a/kernel/sys.c b/kernel/sys.c index dfce4debd138..1eaa2f0b0246 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -869,11 +869,9 @@ void do_sys_times(struct tms *tms) | |||
| 869 | { | 869 | { |
| 870 | cputime_t tgutime, tgstime, cutime, cstime; | 870 | cputime_t tgutime, tgstime, cutime, cstime; |
| 871 | 871 | ||
| 872 | spin_lock_irq(¤t->sighand->siglock); | ||
| 873 | thread_group_cputime_adjusted(current, &tgutime, &tgstime); | 872 | thread_group_cputime_adjusted(current, &tgutime, &tgstime); |
| 874 | cutime = current->signal->cutime; | 873 | cutime = current->signal->cutime; |
| 875 | cstime = current->signal->cstime; | 874 | cstime = current->signal->cstime; |
| 876 | spin_unlock_irq(¤t->sighand->siglock); | ||
| 877 | tms->tms_utime = cputime_to_clock_t(tgutime); | 875 | tms->tms_utime = cputime_to_clock_t(tgutime); |
| 878 | tms->tms_stime = cputime_to_clock_t(tgstime); | 876 | tms->tms_stime = cputime_to_clock_t(tgstime); |
| 879 | tms->tms_cutime = cputime_to_clock_t(cutime); | 877 | tms->tms_cutime = cputime_to_clock_t(cutime); |
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 1c2fe7de2842..ab370ffffd53 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c | |||
| @@ -1776,7 +1776,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, | |||
| 1776 | */ | 1776 | */ |
| 1777 | if (!expires) { | 1777 | if (!expires) { |
| 1778 | schedule(); | 1778 | schedule(); |
| 1779 | __set_current_state(TASK_RUNNING); | ||
| 1780 | return -EINTR; | 1779 | return -EINTR; |
| 1781 | } | 1780 | } |
| 1782 | 1781 | ||
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 3b8946416a5f..492b986195d5 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c | |||
| @@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, | |||
| 272 | if (same_thread_group(tsk, current)) | 272 | if (same_thread_group(tsk, current)) |
| 273 | err = cpu_clock_sample(which_clock, tsk, &rtn); | 273 | err = cpu_clock_sample(which_clock, tsk, &rtn); |
| 274 | } else { | 274 | } else { |
| 275 | unsigned long flags; | ||
| 276 | struct sighand_struct *sighand; | ||
| 277 | |||
| 278 | /* | ||
| 279 | * while_each_thread() is not yet entirely RCU safe, | ||
| 280 | * keep locking the group while sampling process | ||
| 281 | * clock for now. | ||
| 282 | */ | ||
| 283 | sighand = lock_task_sighand(tsk, &flags); | ||
| 284 | if (!sighand) | ||
| 285 | return err; | ||
| 286 | |||
| 287 | if (tsk == current || thread_group_leader(tsk)) | 275 | if (tsk == current || thread_group_leader(tsk)) |
| 288 | err = cpu_clock_sample_group(which_clock, tsk, &rtn); | 276 | err = cpu_clock_sample_group(which_clock, tsk, &rtn); |
| 289 | |||
| 290 | unlock_task_sighand(tsk, &flags); | ||
| 291 | } | 277 | } |
| 292 | 278 | ||
| 293 | if (!err) | 279 | if (!err) |
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 0434ff1b808e..3f9e328c30b5 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c | |||
| @@ -205,7 +205,6 @@ static void ring_buffer_consumer(void) | |||
| 205 | break; | 205 | break; |
| 206 | 206 | ||
| 207 | schedule(); | 207 | schedule(); |
| 208 | __set_current_state(TASK_RUNNING); | ||
| 209 | } | 208 | } |
| 210 | reader_finish = 0; | 209 | reader_finish = 0; |
| 211 | complete(&read_done); | 210 | complete(&read_done); |
| @@ -379,7 +378,6 @@ static int ring_buffer_consumer_thread(void *arg) | |||
| 379 | break; | 378 | break; |
| 380 | 379 | ||
| 381 | schedule(); | 380 | schedule(); |
| 382 | __set_current_state(TASK_RUNNING); | ||
| 383 | } | 381 | } |
| 384 | __set_current_state(TASK_RUNNING); | 382 | __set_current_state(TASK_RUNNING); |
| 385 | 383 | ||
| @@ -407,7 +405,6 @@ static int ring_buffer_producer_thread(void *arg) | |||
| 407 | trace_printk("Sleeping for 10 secs\n"); | 405 | trace_printk("Sleeping for 10 secs\n"); |
| 408 | set_current_state(TASK_INTERRUPTIBLE); | 406 | set_current_state(TASK_INTERRUPTIBLE); |
| 409 | schedule_timeout(HZ * SLEEP_TIME); | 407 | schedule_timeout(HZ * SLEEP_TIME); |
| 410 | __set_current_state(TASK_RUNNING); | ||
| 411 | } | 408 | } |
| 412 | 409 | ||
| 413 | if (kill_test) | 410 | if (kill_test) |
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 8a4e5cb66a4c..16eddb308c33 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
| @@ -13,7 +13,6 @@ | |||
| 13 | #include <linux/sysctl.h> | 13 | #include <linux/sysctl.h> |
| 14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
| 15 | #include <linux/fs.h> | 15 | #include <linux/fs.h> |
| 16 | #include <linux/magic.h> | ||
| 17 | 16 | ||
| 18 | #include <asm/setup.h> | 17 | #include <asm/setup.h> |
| 19 | 18 | ||
| @@ -171,8 +170,7 @@ check_stack(unsigned long ip, unsigned long *stack) | |||
| 171 | i++; | 170 | i++; |
| 172 | } | 171 | } |
| 173 | 172 | ||
| 174 | if ((current != &init_task && | 173 | if (task_stack_end_corrupted(current)) { |
| 175 | *(end_of_stack(current)) != STACK_END_MAGIC)) { | ||
| 176 | print_max_stack(); | 174 | print_max_stack(); |
| 177 | BUG(); | 175 | BUG(); |
| 178 | } | 176 | } |
