Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/auto_group.c        |  36
-rw-r--r--  kernel/sched/core.c              | 362
-rw-r--r--  kernel/sched/cpudeadline.c       | 153
-rw-r--r--  kernel/sched/cpudeadline.h       |   3
-rw-r--r--  kernel/sched/cpufreq.c           |   2
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 122
-rw-r--r--  kernel/sched/cputime.c           |  87
-rw-r--r--  kernel/sched/deadline.c          |  83
-rw-r--r--  kernel/sched/debug.c             | 106
-rw-r--r--  kernel/sched/fair.c              | 794
-rw-r--r--  kernel/sched/idle.c              |  13
-rw-r--r--  kernel/sched/idle_task.c         |   4
-rw-r--r--  kernel/sched/rt.c                |   5
-rw-r--r--  kernel/sched/sched.h             | 136
-rw-r--r--  kernel/sched/stats.h             |  24
-rw-r--r--  kernel/sched/wait.c              | 123
16 files changed, 1286 insertions(+), 767 deletions(-)
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index a5d966cb8891..f1c8fd566246 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -111,10 +111,13 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
111{ 111{
112 if (tg != &root_task_group) 112 if (tg != &root_task_group)
113 return false; 113 return false;
114
115 /* 114 /*
116 * We can only assume the task group can't go away on us if 115 * If we race with autogroup_move_group() the caller can use the old
117 * autogroup_move_group() can see us on ->thread_group list. 116 * value of signal->autogroup but in this case sched_move_task() will
117 * be called again before autogroup_kref_put().
118 *
119 * However, there is no way sched_autogroup_exit_task() could tell us
120 * to avoid autogroup->tg, so we abuse PF_EXITING flag for this case.
118 */ 121 */
119 if (p->flags & PF_EXITING) 122 if (p->flags & PF_EXITING)
120 return false; 123 return false;
@@ -122,6 +125,16 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
122 return true; 125 return true;
123} 126}
124 127
128void sched_autogroup_exit_task(struct task_struct *p)
129{
130 /*
131 * We are going to call exit_notify() and autogroup_move_group() can't
132 * see this thread after that: we can no longer use signal->autogroup.
133 * See the PF_EXITING check in task_wants_autogroup().
134 */
135 sched_move_task(p);
136}
137
125static void 138static void
126autogroup_move_group(struct task_struct *p, struct autogroup *ag) 139autogroup_move_group(struct task_struct *p, struct autogroup *ag)
127{ 140{
@@ -138,13 +151,20 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
138 } 151 }
139 152
140 p->signal->autogroup = autogroup_kref_get(ag); 153 p->signal->autogroup = autogroup_kref_get(ag);
141 154 /*
142 if (!READ_ONCE(sysctl_sched_autogroup_enabled)) 155 * We can't avoid sched_move_task() after we changed signal->autogroup,
143 goto out; 156 * this process can already run with task_group() == prev->tg or we can
144 157 * race with cgroup code which can read autogroup = prev under rq->lock.
158 * In the latter case for_each_thread() can not miss a migrating thread,
159 * cpu_cgroup_attach() must not be possible after cgroup_exit() and it
160 * can't be removed from thread list, we hold ->siglock.
161 *
162 * If an exiting thread was already removed from thread list we rely on
163 * sched_autogroup_exit_task().
164 */
145 for_each_thread(p, t) 165 for_each_thread(p, t)
146 sched_move_task(t); 166 sched_move_task(t);
147out: 167
148 unlock_task_sighand(p, &flags); 168 unlock_task_sighand(p, &flags);
149 autogroup_kref_put(prev); 169 autogroup_kref_put(prev);
150} 170}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 44817c640e99..154fd689fe02 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -581,6 +581,8 @@ static bool wake_up_full_nohz_cpu(int cpu)
581 * If needed we can still optimize that later with an 581 * If needed we can still optimize that later with an
582 * empty IRQ. 582 * empty IRQ.
583 */ 583 */
584 if (cpu_is_offline(cpu))
585 return true; /* Don't try to wake offline CPUs. */
584 if (tick_nohz_full_cpu(cpu)) { 586 if (tick_nohz_full_cpu(cpu)) {
585 if (cpu != smp_processor_id() || 587 if (cpu != smp_processor_id() ||
586 tick_nohz_tick_stopped()) 588 tick_nohz_tick_stopped())
@@ -591,6 +593,11 @@ static bool wake_up_full_nohz_cpu(int cpu)
591 return false; 593 return false;
592} 594}
593 595
596/*
597 * Wake up the specified CPU. If the CPU is going offline, it is the
598 * caller's responsibility to deal with the lost wakeup, for example,
599 * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
600 */
594void wake_up_nohz_cpu(int cpu) 601void wake_up_nohz_cpu(int cpu)
595{ 602{
596 if (!wake_up_full_nohz_cpu(cpu)) 603 if (!wake_up_full_nohz_cpu(cpu))
@@ -1063,8 +1070,12 @@ static int migration_cpu_stop(void *data)
1063 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because 1070 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
1064 * we're holding p->pi_lock. 1071 * we're holding p->pi_lock.
1065 */ 1072 */
1066 if (task_rq(p) == rq && task_on_rq_queued(p)) 1073 if (task_rq(p) == rq) {
1067 rq = __migrate_task(rq, p, arg->dest_cpu); 1074 if (task_on_rq_queued(p))
1075 rq = __migrate_task(rq, p, arg->dest_cpu);
1076 else
1077 p->wake_cpu = arg->dest_cpu;
1078 }
1068 raw_spin_unlock(&rq->lock); 1079 raw_spin_unlock(&rq->lock);
1069 raw_spin_unlock(&p->pi_lock); 1080 raw_spin_unlock(&p->pi_lock);
1070 1081
@@ -1105,10 +1116,10 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1105 1116
1106 p->sched_class->set_cpus_allowed(p, new_mask); 1117 p->sched_class->set_cpus_allowed(p, new_mask);
1107 1118
1108 if (running)
1109 p->sched_class->set_curr_task(rq);
1110 if (queued) 1119 if (queued)
1111 enqueue_task(rq, p, ENQUEUE_RESTORE); 1120 enqueue_task(rq, p, ENQUEUE_RESTORE);
1121 if (running)
1122 set_curr_task(rq, p);
1112} 1123}
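Several hunks in this file make the same ordering change: the running task is now re-selected (via the new rq-level set_curr_task() helper) only after it has been enqueued again, instead of before. Below is a small, self-contained sketch of that save/dequeue/modify/enqueue/restore pattern, not the kernel code; struct task, struct runqueue and all helpers are names made up for this example.

#include <stdbool.h>

struct task {
	int prio;
	bool queued;
};

struct runqueue {
	struct task *curr;
	/* ... queue of runnable tasks, elided ... */
};

static void dequeue_task(struct runqueue *rq, struct task *p) { p->queued = false; }
static void enqueue_task(struct runqueue *rq, struct task *p) { p->queued = true; }
static void put_prev_task(struct runqueue *rq, struct task *p) { }
static void set_curr_task(struct runqueue *rq, struct task *p) { rq->curr = p; }

static void change_prio(struct runqueue *rq, struct task *p, int prio)
{
	bool queued = p->queued;
	bool running = (rq->curr == p);

	if (queued)
		dequeue_task(rq, p);
	if (running)
		put_prev_task(rq, p);

	p->prio = prio;			/* the actual parameter change */

	if (queued)
		enqueue_task(rq, p);	/* enqueue first ...              */
	if (running)
		set_curr_task(rq, p);	/* ... then re-select it, so the  */
					/* current task is seen as queued */
}

int main(void)
{
	struct runqueue rq = { 0 };
	struct task p = { .prio = 120, .queued = true };

	rq.curr = &p;
	change_prio(&rq, &p, 100);
	return (p.prio == 100 && p.queued && rq.curr == &p) ? 0 : 1;
}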
1113 1124
1114/* 1125/*
@@ -1265,7 +1276,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
1265 /* 1276 /*
1266 * Task isn't running anymore; make it appear like we migrated 1277 * Task isn't running anymore; make it appear like we migrated
1267 * it before it went to sleep. This means on wakeup we make the 1278 * it before it went to sleep. This means on wakeup we make the
1268 * previous cpu our targer instead of where it really is. 1279 * previous cpu our target instead of where it really is.
1269 */ 1280 */
1270 p->wake_cpu = cpu; 1281 p->wake_cpu = cpu;
1271 } 1282 }
@@ -1629,23 +1640,25 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p,
1629static void 1640static void
1630ttwu_stat(struct task_struct *p, int cpu, int wake_flags) 1641ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1631{ 1642{
1632#ifdef CONFIG_SCHEDSTATS 1643 struct rq *rq;
1633 struct rq *rq = this_rq();
1634 1644
1635#ifdef CONFIG_SMP 1645 if (!schedstat_enabled())
1636 int this_cpu = smp_processor_id(); 1646 return;
1637 1647
1638 if (cpu == this_cpu) { 1648 rq = this_rq();
1639 schedstat_inc(rq, ttwu_local); 1649
1640 schedstat_inc(p, se.statistics.nr_wakeups_local); 1650#ifdef CONFIG_SMP
1651 if (cpu == rq->cpu) {
1652 schedstat_inc(rq->ttwu_local);
1653 schedstat_inc(p->se.statistics.nr_wakeups_local);
1641 } else { 1654 } else {
1642 struct sched_domain *sd; 1655 struct sched_domain *sd;
1643 1656
1644 schedstat_inc(p, se.statistics.nr_wakeups_remote); 1657 schedstat_inc(p->se.statistics.nr_wakeups_remote);
1645 rcu_read_lock(); 1658 rcu_read_lock();
1646 for_each_domain(this_cpu, sd) { 1659 for_each_domain(rq->cpu, sd) {
1647 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 1660 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1648 schedstat_inc(sd, ttwu_wake_remote); 1661 schedstat_inc(sd->ttwu_wake_remote);
1649 break; 1662 break;
1650 } 1663 }
1651 } 1664 }
@@ -1653,17 +1666,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1653 } 1666 }
1654 1667
1655 if (wake_flags & WF_MIGRATED) 1668 if (wake_flags & WF_MIGRATED)
1656 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 1669 schedstat_inc(p->se.statistics.nr_wakeups_migrate);
1657
1658#endif /* CONFIG_SMP */ 1670#endif /* CONFIG_SMP */
1659 1671
1660 schedstat_inc(rq, ttwu_count); 1672 schedstat_inc(rq->ttwu_count);
1661 schedstat_inc(p, se.statistics.nr_wakeups); 1673 schedstat_inc(p->se.statistics.nr_wakeups);
1662 1674
1663 if (wake_flags & WF_SYNC) 1675 if (wake_flags & WF_SYNC)
1664 schedstat_inc(p, se.statistics.nr_wakeups_sync); 1676 schedstat_inc(p->se.statistics.nr_wakeups_sync);
1665
1666#endif /* CONFIG_SCHEDSTATS */
1667} 1677}
1668 1678
1669static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 1679static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
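The ttwu_stat() hunk above converts the old two-argument schedstat_inc(rq, field) calls to the one-argument schedstat_inc(rq->field) form and moves the schedstat_enabled() check into the function itself. As a rough illustration of that one-argument style only (not the kernel's actual schedstat.h, which uses a static key and compiles away without CONFIG_SCHEDSTATS), a counter macro gated by a runtime flag could look like the sketch below; stats_enabled, stat_inc and wakeup_stats are invented names.

#include <stdbool.h>
#include <stdio.h>

static bool stats_enabled = true;

#define stat_inc(var)				\
	do {					\
		if (stats_enabled)		\
			(var)++;		\
	} while (0)

struct wakeup_stats {
	unsigned long ttwu_count;
	unsigned long ttwu_local;
};

static struct wakeup_stats ws;

static void record_local_wakeup(void)
{
	/* Both increments do nothing at run time when stats_enabled is false. */
	stat_inc(ws.ttwu_count);
	stat_inc(ws.ttwu_local);
}

int main(void)
{
	record_local_wakeup();
	printf("ttwu_count=%lu ttwu_local=%lu\n", ws.ttwu_count, ws.ttwu_local);
	return 0;
}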
@@ -2084,8 +2094,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2084 2094
2085 ttwu_queue(p, cpu, wake_flags); 2095 ttwu_queue(p, cpu, wake_flags);
2086stat: 2096stat:
2087 if (schedstat_enabled()) 2097 ttwu_stat(p, cpu, wake_flags);
2088 ttwu_stat(p, cpu, wake_flags);
2089out: 2098out:
2090 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2099 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2091 2100
@@ -2095,6 +2104,7 @@ out:
2095/** 2104/**
2096 * try_to_wake_up_local - try to wake up a local task with rq lock held 2105 * try_to_wake_up_local - try to wake up a local task with rq lock held
2097 * @p: the thread to be awakened 2106 * @p: the thread to be awakened
2107 * @cookie: context's cookie for pinning
2098 * 2108 *
2099 * Put @p on the run-queue if it's not already there. The caller must 2109 * Put @p on the run-queue if it's not already there. The caller must
2100 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2110 * ensure that this_rq() is locked, @p is bound to this_rq() and not
@@ -2133,8 +2143,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie
2133 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 2143 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2134 2144
2135 ttwu_do_wakeup(rq, p, 0, cookie); 2145 ttwu_do_wakeup(rq, p, 0, cookie);
2136 if (schedstat_enabled()) 2146 ttwu_stat(p, smp_processor_id(), 0);
2137 ttwu_stat(p, smp_processor_id(), 0);
2138out: 2147out:
2139 raw_spin_unlock(&p->pi_lock); 2148 raw_spin_unlock(&p->pi_lock);
2140} 2149}
@@ -2772,6 +2781,10 @@ static struct rq *finish_task_switch(struct task_struct *prev)
2772 * task and put them back on the free list. 2781 * task and put them back on the free list.
2773 */ 2782 */
2774 kprobe_flush_task(prev); 2783 kprobe_flush_task(prev);
2784
2785 /* Task is done with its stack. */
2786 put_task_stack(prev);
2787
2775 put_task_struct(prev); 2788 put_task_struct(prev);
2776 } 2789 }
2777 2790
@@ -3192,6 +3205,9 @@ static inline void preempt_latency_stop(int val) { }
3192 */ 3205 */
3193static noinline void __schedule_bug(struct task_struct *prev) 3206static noinline void __schedule_bug(struct task_struct *prev)
3194{ 3207{
3208 /* Save this before calling printk(), since that will clobber it */
3209 unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
3210
3195 if (oops_in_progress) 3211 if (oops_in_progress)
3196 return; 3212 return;
3197 3213
@@ -3202,13 +3218,12 @@ static noinline void __schedule_bug(struct task_struct *prev)
3202 print_modules(); 3218 print_modules();
3203 if (irqs_disabled()) 3219 if (irqs_disabled())
3204 print_irqtrace_events(prev); 3220 print_irqtrace_events(prev);
3205#ifdef CONFIG_DEBUG_PREEMPT 3221 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
3206 if (in_atomic_preempt_off()) { 3222 && in_atomic_preempt_off()) {
3207 pr_err("Preemption disabled at:"); 3223 pr_err("Preemption disabled at:");
3208 print_ip_sym(current->preempt_disable_ip); 3224 print_ip_sym(preempt_disable_ip);
3209 pr_cont("\n"); 3225 pr_cont("\n");
3210 } 3226 }
3211#endif
3212 if (panic_on_warn) 3227 if (panic_on_warn)
3213 panic("scheduling while atomic\n"); 3228 panic("scheduling while atomic\n");
3214 3229
@@ -3234,7 +3249,7 @@ static inline void schedule_debug(struct task_struct *prev)
3234 3249
3235 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3250 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3236 3251
3237 schedstat_inc(this_rq(), sched_count); 3252 schedstat_inc(this_rq()->sched_count);
3238} 3253}
3239 3254
3240/* 3255/*
@@ -3327,17 +3342,6 @@ static void __sched notrace __schedule(bool preempt)
3327 rq = cpu_rq(cpu); 3342 rq = cpu_rq(cpu);
3328 prev = rq->curr; 3343 prev = rq->curr;
3329 3344
3330 /*
3331 * do_exit() calls schedule() with preemption disabled as an exception;
3332 * however we must fix that up, otherwise the next task will see an
3333 * inconsistent (higher) preempt count.
3334 *
3335 * It also avoids the below schedule_debug() test from complaining
3336 * about this.
3337 */
3338 if (unlikely(prev->state == TASK_DEAD))
3339 preempt_enable_no_resched_notrace();
3340
3341 schedule_debug(prev); 3345 schedule_debug(prev);
3342 3346
3343 if (sched_feat(HRTICK)) 3347 if (sched_feat(HRTICK))
@@ -3403,7 +3407,33 @@ static void __sched notrace __schedule(bool preempt)
3403 3407
3404 balance_callback(rq); 3408 balance_callback(rq);
3405} 3409}
3406STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ 3410
3411void __noreturn do_task_dead(void)
3412{
3413 /*
3414 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
3415 * when the following two conditions become true.
 3416 * - There is a race condition on mmap_sem (it is acquired by
 3417 * exit_mm()), and
 3418 * - An SMI occurs before setting TASK_RUNNING
 3419 * (or the hypervisor of a virtual machine switches to another guest).
3420 * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
3421 *
3422 * To avoid it, we have to wait for releasing tsk->pi_lock which
3423 * is held by try_to_wake_up()
3424 */
3425 smp_mb();
3426 raw_spin_unlock_wait(&current->pi_lock);
3427
3428 /* causes final put_task_struct in finish_task_switch(). */
3429 __set_current_state(TASK_DEAD);
3430 current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
3431 __schedule(false);
3432 BUG();
3433 /* Avoid "noreturn function does return". */
3434 for (;;)
3435 cpu_relax(); /* For when BUG is null */
3436}
3407 3437
3408static inline void sched_submit_work(struct task_struct *tsk) 3438static inline void sched_submit_work(struct task_struct *tsk)
3409{ 3439{
@@ -3687,10 +3717,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3687 3717
3688 p->prio = prio; 3718 p->prio = prio;
3689 3719
3690 if (running)
3691 p->sched_class->set_curr_task(rq);
3692 if (queued) 3720 if (queued)
3693 enqueue_task(rq, p, queue_flag); 3721 enqueue_task(rq, p, queue_flag);
3722 if (running)
3723 set_curr_task(rq, p);
3694 3724
3695 check_class_changed(rq, p, prev_class, oldprio); 3725 check_class_changed(rq, p, prev_class, oldprio);
3696out_unlock: 3726out_unlock:
@@ -3704,7 +3734,8 @@ out_unlock:
3704 3734
3705void set_user_nice(struct task_struct *p, long nice) 3735void set_user_nice(struct task_struct *p, long nice)
3706{ 3736{
3707 int old_prio, delta, queued; 3737 bool queued, running;
3738 int old_prio, delta;
3708 struct rq_flags rf; 3739 struct rq_flags rf;
3709 struct rq *rq; 3740 struct rq *rq;
3710 3741
@@ -3726,8 +3757,11 @@ void set_user_nice(struct task_struct *p, long nice)
3726 goto out_unlock; 3757 goto out_unlock;
3727 } 3758 }
3728 queued = task_on_rq_queued(p); 3759 queued = task_on_rq_queued(p);
3760 running = task_current(rq, p);
3729 if (queued) 3761 if (queued)
3730 dequeue_task(rq, p, DEQUEUE_SAVE); 3762 dequeue_task(rq, p, DEQUEUE_SAVE);
3763 if (running)
3764 put_prev_task(rq, p);
3731 3765
3732 p->static_prio = NICE_TO_PRIO(nice); 3766 p->static_prio = NICE_TO_PRIO(nice);
3733 set_load_weight(p); 3767 set_load_weight(p);
@@ -3744,6 +3778,8 @@ void set_user_nice(struct task_struct *p, long nice)
3744 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3778 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3745 resched_curr(rq); 3779 resched_curr(rq);
3746 } 3780 }
3781 if (running)
3782 set_curr_task(rq, p);
3747out_unlock: 3783out_unlock:
3748 task_rq_unlock(rq, p, &rf); 3784 task_rq_unlock(rq, p, &rf);
3749} 3785}
@@ -4243,8 +4279,6 @@ change:
4243 prev_class = p->sched_class; 4279 prev_class = p->sched_class;
4244 __setscheduler(rq, p, attr, pi); 4280 __setscheduler(rq, p, attr, pi);
4245 4281
4246 if (running)
4247 p->sched_class->set_curr_task(rq);
4248 if (queued) { 4282 if (queued) {
4249 /* 4283 /*
4250 * We enqueue to tail when the priority of a task is 4284 * We enqueue to tail when the priority of a task is
@@ -4255,6 +4289,8 @@ change:
4255 4289
4256 enqueue_task(rq, p, queue_flags); 4290 enqueue_task(rq, p, queue_flags);
4257 } 4291 }
4292 if (running)
4293 set_curr_task(rq, p);
4258 4294
4259 check_class_changed(rq, p, prev_class, oldprio); 4295 check_class_changed(rq, p, prev_class, oldprio);
4260 preempt_disable(); /* avoid rq from going away on us */ 4296 preempt_disable(); /* avoid rq from going away on us */
@@ -4846,7 +4882,7 @@ SYSCALL_DEFINE0(sched_yield)
4846{ 4882{
4847 struct rq *rq = this_rq_lock(); 4883 struct rq *rq = this_rq_lock();
4848 4884
4849 schedstat_inc(rq, yld_count); 4885 schedstat_inc(rq->yld_count);
4850 current->sched_class->yield_task(rq); 4886 current->sched_class->yield_task(rq);
4851 4887
4852 /* 4888 /*
@@ -4863,6 +4899,7 @@ SYSCALL_DEFINE0(sched_yield)
4863 return 0; 4899 return 0;
4864} 4900}
4865 4901
4902#ifndef CONFIG_PREEMPT
4866int __sched _cond_resched(void) 4903int __sched _cond_resched(void)
4867{ 4904{
4868 if (should_resched(0)) { 4905 if (should_resched(0)) {
@@ -4872,6 +4909,7 @@ int __sched _cond_resched(void)
4872 return 0; 4909 return 0;
4873} 4910}
4874EXPORT_SYMBOL(_cond_resched); 4911EXPORT_SYMBOL(_cond_resched);
4912#endif
4875 4913
4876/* 4914/*
4877 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 4915 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
@@ -4997,7 +5035,7 @@ again:
4997 5035
4998 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 5036 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4999 if (yielded) { 5037 if (yielded) {
5000 schedstat_inc(rq, yld_count); 5038 schedstat_inc(rq->yld_count);
5001 /* 5039 /*
5002 * Make p's CPU reschedule; pick_next_entity takes care of 5040 * Make p's CPU reschedule; pick_next_entity takes care of
5003 * fairness. 5041 * fairness.
@@ -5154,21 +5192,14 @@ void sched_show_task(struct task_struct *p)
5154 int ppid; 5192 int ppid;
5155 unsigned long state = p->state; 5193 unsigned long state = p->state;
5156 5194
5195 if (!try_get_task_stack(p))
5196 return;
5157 if (state) 5197 if (state)
5158 state = __ffs(state) + 1; 5198 state = __ffs(state) + 1;
5159 printk(KERN_INFO "%-15.15s %c", p->comm, 5199 printk(KERN_INFO "%-15.15s %c", p->comm,
5160 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5200 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5161#if BITS_PER_LONG == 32
5162 if (state == TASK_RUNNING)
5163 printk(KERN_CONT " running ");
5164 else
5165 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5166#else
5167 if (state == TASK_RUNNING) 5201 if (state == TASK_RUNNING)
5168 printk(KERN_CONT " running task "); 5202 printk(KERN_CONT " running task ");
5169 else
5170 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5171#endif
5172#ifdef CONFIG_DEBUG_STACK_USAGE 5203#ifdef CONFIG_DEBUG_STACK_USAGE
5173 free = stack_not_used(p); 5204 free = stack_not_used(p);
5174#endif 5205#endif
@@ -5183,6 +5214,7 @@ void sched_show_task(struct task_struct *p)
5183 5214
5184 print_worker_info(KERN_INFO, p); 5215 print_worker_info(KERN_INFO, p);
5185 show_stack(p, NULL); 5216 show_stack(p, NULL);
5217 put_task_stack(p);
5186} 5218}
5187 5219
5188void show_state_filter(unsigned long state_filter) 5220void show_state_filter(unsigned long state_filter)
@@ -5417,10 +5449,10 @@ void sched_setnuma(struct task_struct *p, int nid)
5417 5449
5418 p->numa_preferred_nid = nid; 5450 p->numa_preferred_nid = nid;
5419 5451
5420 if (running)
5421 p->sched_class->set_curr_task(rq);
5422 if (queued) 5452 if (queued)
5423 enqueue_task(rq, p, ENQUEUE_RESTORE); 5453 enqueue_task(rq, p, ENQUEUE_RESTORE);
5454 if (running)
5455 set_curr_task(rq, p);
5424 task_rq_unlock(rq, p, &rf); 5456 task_rq_unlock(rq, p, &rf);
5425} 5457}
5426#endif /* CONFIG_NUMA_BALANCING */ 5458#endif /* CONFIG_NUMA_BALANCING */
@@ -5717,6 +5749,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5717 } 5749 }
5718} 5750}
5719#else /* !CONFIG_SCHED_DEBUG */ 5751#else /* !CONFIG_SCHED_DEBUG */
5752
5753# define sched_debug_enabled 0
5720# define sched_domain_debug(sd, cpu) do { } while (0) 5754# define sched_domain_debug(sd, cpu) do { } while (0)
5721static inline bool sched_debug(void) 5755static inline bool sched_debug(void)
5722{ 5756{
@@ -5735,6 +5769,7 @@ static int sd_degenerate(struct sched_domain *sd)
5735 SD_BALANCE_FORK | 5769 SD_BALANCE_FORK |
5736 SD_BALANCE_EXEC | 5770 SD_BALANCE_EXEC |
5737 SD_SHARE_CPUCAPACITY | 5771 SD_SHARE_CPUCAPACITY |
5772 SD_ASYM_CPUCAPACITY |
5738 SD_SHARE_PKG_RESOURCES | 5773 SD_SHARE_PKG_RESOURCES |
5739 SD_SHARE_POWERDOMAIN)) { 5774 SD_SHARE_POWERDOMAIN)) {
5740 if (sd->groups != sd->groups->next) 5775 if (sd->groups != sd->groups->next)
@@ -5765,6 +5800,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5765 SD_BALANCE_NEWIDLE | 5800 SD_BALANCE_NEWIDLE |
5766 SD_BALANCE_FORK | 5801 SD_BALANCE_FORK |
5767 SD_BALANCE_EXEC | 5802 SD_BALANCE_EXEC |
5803 SD_ASYM_CPUCAPACITY |
5768 SD_SHARE_CPUCAPACITY | 5804 SD_SHARE_CPUCAPACITY |
5769 SD_SHARE_PKG_RESOURCES | 5805 SD_SHARE_PKG_RESOURCES |
5770 SD_PREFER_SIBLING | 5806 SD_PREFER_SIBLING |
@@ -5909,10 +5945,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
5909 } while (sg != first); 5945 } while (sg != first);
5910} 5946}
5911 5947
5912static void free_sched_domain(struct rcu_head *rcu) 5948static void destroy_sched_domain(struct sched_domain *sd)
5913{ 5949{
5914 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5915
5916 /* 5950 /*
5917 * If its an overlapping domain it has private groups, iterate and 5951 * If its an overlapping domain it has private groups, iterate and
5918 * nuke them all. 5952 * nuke them all.
@@ -5923,18 +5957,26 @@ static void free_sched_domain(struct rcu_head *rcu)
5923 kfree(sd->groups->sgc); 5957 kfree(sd->groups->sgc);
5924 kfree(sd->groups); 5958 kfree(sd->groups);
5925 } 5959 }
5960 if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
5961 kfree(sd->shared);
5926 kfree(sd); 5962 kfree(sd);
5927} 5963}
5928 5964
5929static void destroy_sched_domain(struct sched_domain *sd, int cpu) 5965static void destroy_sched_domains_rcu(struct rcu_head *rcu)
5930{ 5966{
5931 call_rcu(&sd->rcu, free_sched_domain); 5967 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5968
5969 while (sd) {
5970 struct sched_domain *parent = sd->parent;
5971 destroy_sched_domain(sd);
5972 sd = parent;
5973 }
5932} 5974}
5933 5975
5934static void destroy_sched_domains(struct sched_domain *sd, int cpu) 5976static void destroy_sched_domains(struct sched_domain *sd)
5935{ 5977{
5936 for (; sd; sd = sd->parent) 5978 if (sd)
5937 destroy_sched_domain(sd, cpu); 5979 call_rcu(&sd->rcu, destroy_sched_domains_rcu);
5938} 5980}
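destroy_sched_domains() above now queues a single RCU callback for the base domain and lets destroy_sched_domains_rcu() walk the ->parent chain, rather than queuing one callback per level. A minimal userspace sketch of that "free the whole chain from one deferred callback" idea, with made-up names (struct node, free_chain) and plain free() standing in for the RCU machinery:

#include <stdlib.h>

struct node {
	struct node *parent;
	/* payload elided */
};

/* Stands in for the single deferred callback: frees every level at once. */
static void free_chain(struct node *n)
{
	while (n) {
		struct node *parent = n->parent;

		free(n);
		n = parent;
	}
}

int main(void)
{
	/* Build a three-level parent chain, then release it with one call. */
	struct node *top = calloc(1, sizeof(*top));
	struct node *mid = calloc(1, sizeof(*mid));
	struct node *base = calloc(1, sizeof(*base));

	mid->parent = top;
	base->parent = mid;
	free_chain(base);
	return 0;
}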
5939 5981
5940/* 5982/*
@@ -5949,14 +5991,14 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5949DEFINE_PER_CPU(struct sched_domain *, sd_llc); 5991DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5950DEFINE_PER_CPU(int, sd_llc_size); 5992DEFINE_PER_CPU(int, sd_llc_size);
5951DEFINE_PER_CPU(int, sd_llc_id); 5993DEFINE_PER_CPU(int, sd_llc_id);
5994DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
5952DEFINE_PER_CPU(struct sched_domain *, sd_numa); 5995DEFINE_PER_CPU(struct sched_domain *, sd_numa);
5953DEFINE_PER_CPU(struct sched_domain *, sd_busy);
5954DEFINE_PER_CPU(struct sched_domain *, sd_asym); 5996DEFINE_PER_CPU(struct sched_domain *, sd_asym);
5955 5997
5956static void update_top_cache_domain(int cpu) 5998static void update_top_cache_domain(int cpu)
5957{ 5999{
6000 struct sched_domain_shared *sds = NULL;
5958 struct sched_domain *sd; 6001 struct sched_domain *sd;
5959 struct sched_domain *busy_sd = NULL;
5960 int id = cpu; 6002 int id = cpu;
5961 int size = 1; 6003 int size = 1;
5962 6004
@@ -5964,13 +6006,13 @@ static void update_top_cache_domain(int cpu)
5964 if (sd) { 6006 if (sd) {
5965 id = cpumask_first(sched_domain_span(sd)); 6007 id = cpumask_first(sched_domain_span(sd));
5966 size = cpumask_weight(sched_domain_span(sd)); 6008 size = cpumask_weight(sched_domain_span(sd));
5967 busy_sd = sd->parent; /* sd_busy */ 6009 sds = sd->shared;
5968 } 6010 }
5969 rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
5970 6011
5971 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 6012 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5972 per_cpu(sd_llc_size, cpu) = size; 6013 per_cpu(sd_llc_size, cpu) = size;
5973 per_cpu(sd_llc_id, cpu) = id; 6014 per_cpu(sd_llc_id, cpu) = id;
6015 rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
5974 6016
5975 sd = lowest_flag_domain(cpu, SD_NUMA); 6017 sd = lowest_flag_domain(cpu, SD_NUMA);
5976 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); 6018 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
@@ -6006,7 +6048,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6006 */ 6048 */
6007 if (parent->flags & SD_PREFER_SIBLING) 6049 if (parent->flags & SD_PREFER_SIBLING)
6008 tmp->flags |= SD_PREFER_SIBLING; 6050 tmp->flags |= SD_PREFER_SIBLING;
6009 destroy_sched_domain(parent, cpu); 6051 destroy_sched_domain(parent);
6010 } else 6052 } else
6011 tmp = tmp->parent; 6053 tmp = tmp->parent;
6012 } 6054 }
@@ -6014,7 +6056,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6014 if (sd && sd_degenerate(sd)) { 6056 if (sd && sd_degenerate(sd)) {
6015 tmp = sd; 6057 tmp = sd;
6016 sd = sd->parent; 6058 sd = sd->parent;
6017 destroy_sched_domain(tmp, cpu); 6059 destroy_sched_domain(tmp);
6018 if (sd) 6060 if (sd)
6019 sd->child = NULL; 6061 sd->child = NULL;
6020 } 6062 }
@@ -6024,7 +6066,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6024 rq_attach_root(rq, rd); 6066 rq_attach_root(rq, rd);
6025 tmp = rq->sd; 6067 tmp = rq->sd;
6026 rcu_assign_pointer(rq->sd, sd); 6068 rcu_assign_pointer(rq->sd, sd);
6027 destroy_sched_domains(tmp, cpu); 6069 destroy_sched_domains(tmp);
6028 6070
6029 update_top_cache_domain(cpu); 6071 update_top_cache_domain(cpu);
6030} 6072}
@@ -6267,7 +6309,6 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
6267 return; 6309 return;
6268 6310
6269 update_group_capacity(sd, cpu); 6311 update_group_capacity(sd, cpu);
6270 atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
6271} 6312}
6272 6313
6273/* 6314/*
@@ -6355,6 +6396,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
6355 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 6396 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
6356 *per_cpu_ptr(sdd->sd, cpu) = NULL; 6397 *per_cpu_ptr(sdd->sd, cpu) = NULL;
6357 6398
6399 if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
6400 *per_cpu_ptr(sdd->sds, cpu) = NULL;
6401
6358 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) 6402 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
6359 *per_cpu_ptr(sdd->sg, cpu) = NULL; 6403 *per_cpu_ptr(sdd->sg, cpu) = NULL;
6360 6404
@@ -6374,26 +6418,37 @@ static int sched_domains_curr_level;
6374/* 6418/*
6375 * SD_flags allowed in topology descriptions. 6419 * SD_flags allowed in topology descriptions.
6376 * 6420 *
6377 * SD_SHARE_CPUCAPACITY - describes SMT topologies 6421 * These flags are purely descriptive of the topology and do not prescribe
6378 * SD_SHARE_PKG_RESOURCES - describes shared caches 6422 * behaviour. Behaviour is artificial and mapped in the below sd_init()
6379 * SD_NUMA - describes NUMA topologies 6423 * function:
6380 * SD_SHARE_POWERDOMAIN - describes shared power domain 6424 *
6425 * SD_SHARE_CPUCAPACITY - describes SMT topologies
6426 * SD_SHARE_PKG_RESOURCES - describes shared caches
6427 * SD_NUMA - describes NUMA topologies
6428 * SD_SHARE_POWERDOMAIN - describes shared power domain
6429 * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
6381 * 6430 *
 6382 * Odd one out: 6431 * Odd one out, which besides describing the topology has a quirk also
6383 * SD_ASYM_PACKING - describes SMT quirks 6432 * prescribes the desired behaviour that goes along with it:
6433 *
6434 * SD_ASYM_PACKING - describes SMT quirks
6384 */ 6435 */
6385#define TOPOLOGY_SD_FLAGS \ 6436#define TOPOLOGY_SD_FLAGS \
6386 (SD_SHARE_CPUCAPACITY | \ 6437 (SD_SHARE_CPUCAPACITY | \
6387 SD_SHARE_PKG_RESOURCES | \ 6438 SD_SHARE_PKG_RESOURCES | \
6388 SD_NUMA | \ 6439 SD_NUMA | \
6389 SD_ASYM_PACKING | \ 6440 SD_ASYM_PACKING | \
6441 SD_ASYM_CPUCAPACITY | \
6390 SD_SHARE_POWERDOMAIN) 6442 SD_SHARE_POWERDOMAIN)
6391 6443
6392static struct sched_domain * 6444static struct sched_domain *
6393sd_init(struct sched_domain_topology_level *tl, int cpu) 6445sd_init(struct sched_domain_topology_level *tl,
6446 const struct cpumask *cpu_map,
6447 struct sched_domain *child, int cpu)
6394{ 6448{
6395 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); 6449 struct sd_data *sdd = &tl->data;
6396 int sd_weight, sd_flags = 0; 6450 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6451 int sd_id, sd_weight, sd_flags = 0;
6397 6452
6398#ifdef CONFIG_NUMA 6453#ifdef CONFIG_NUMA
6399 /* 6454 /*
@@ -6442,15 +6497,26 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
6442 .smt_gain = 0, 6497 .smt_gain = 0,
6443 .max_newidle_lb_cost = 0, 6498 .max_newidle_lb_cost = 0,
6444 .next_decay_max_lb_cost = jiffies, 6499 .next_decay_max_lb_cost = jiffies,
6500 .child = child,
6445#ifdef CONFIG_SCHED_DEBUG 6501#ifdef CONFIG_SCHED_DEBUG
6446 .name = tl->name, 6502 .name = tl->name,
6447#endif 6503#endif
6448 }; 6504 };
6449 6505
6506 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6507 sd_id = cpumask_first(sched_domain_span(sd));
6508
6450 /* 6509 /*
6451 * Convert topological properties into behaviour. 6510 * Convert topological properties into behaviour.
6452 */ 6511 */
6453 6512
6513 if (sd->flags & SD_ASYM_CPUCAPACITY) {
6514 struct sched_domain *t = sd;
6515
6516 for_each_lower_domain(t)
6517 t->flags |= SD_BALANCE_WAKE;
6518 }
6519
6454 if (sd->flags & SD_SHARE_CPUCAPACITY) { 6520 if (sd->flags & SD_SHARE_CPUCAPACITY) {
6455 sd->flags |= SD_PREFER_SIBLING; 6521 sd->flags |= SD_PREFER_SIBLING;
6456 sd->imbalance_pct = 110; 6522 sd->imbalance_pct = 110;
@@ -6482,7 +6548,17 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
6482 sd->idle_idx = 1; 6548 sd->idle_idx = 1;
6483 } 6549 }
6484 6550
6485 sd->private = &tl->data; 6551 /*
6552 * For all levels sharing cache; connect a sched_domain_shared
6553 * instance.
6554 */
6555 if (sd->flags & SD_SHARE_PKG_RESOURCES) {
6556 sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
6557 atomic_inc(&sd->shared->ref);
6558 atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
6559 }
6560
6561 sd->private = sdd;
6486 6562
6487 return sd; 6563 return sd;
6488} 6564}
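sd_init() above attaches one sched_domain_shared instance to every domain of a cache-sharing level and takes an atomic reference on it; destroy_sched_domain() drops that reference and the last put frees the structure. A hedged sketch of the same get/put pattern using C11 atomics (domain_shared, shared_get and shared_put are invented names, not kernel APIs):

#include <stdatomic.h>
#include <stdlib.h>

struct domain_shared {
	atomic_int ref;
	atomic_int nr_busy_cpus;
};

static struct domain_shared *shared_alloc(void)
{
	struct domain_shared *sds = calloc(1, sizeof(*sds));

	atomic_store(&sds->ref, 1);	/* reference held by the first owner */
	return sds;
}

static void shared_get(struct domain_shared *sds)
{
	atomic_fetch_add(&sds->ref, 1);
}

static void shared_put(struct domain_shared *sds)
{
	/* fetch_sub returns the previous value: 1 means we were the last user. */
	if (atomic_fetch_sub(&sds->ref, 1) == 1)
		free(sds);
}

int main(void)
{
	struct domain_shared *sds = shared_alloc();

	shared_get(sds);	/* a second "domain" starts sharing it */
	shared_put(sds);	/* ... and later drops its reference   */
	shared_put(sds);	/* last put frees the structure        */
	return 0;
}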
@@ -6509,6 +6585,9 @@ static struct sched_domain_topology_level *sched_domain_topology =
6509 6585
6510void set_sched_topology(struct sched_domain_topology_level *tl) 6586void set_sched_topology(struct sched_domain_topology_level *tl)
6511{ 6587{
6588 if (WARN_ON_ONCE(sched_smp_initialized))
6589 return;
6590
6512 sched_domain_topology = tl; 6591 sched_domain_topology = tl;
6513} 6592}
6514 6593
@@ -6789,6 +6868,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6789 if (!sdd->sd) 6868 if (!sdd->sd)
6790 return -ENOMEM; 6869 return -ENOMEM;
6791 6870
6871 sdd->sds = alloc_percpu(struct sched_domain_shared *);
6872 if (!sdd->sds)
6873 return -ENOMEM;
6874
6792 sdd->sg = alloc_percpu(struct sched_group *); 6875 sdd->sg = alloc_percpu(struct sched_group *);
6793 if (!sdd->sg) 6876 if (!sdd->sg)
6794 return -ENOMEM; 6877 return -ENOMEM;
@@ -6799,6 +6882,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6799 6882
6800 for_each_cpu(j, cpu_map) { 6883 for_each_cpu(j, cpu_map) {
6801 struct sched_domain *sd; 6884 struct sched_domain *sd;
6885 struct sched_domain_shared *sds;
6802 struct sched_group *sg; 6886 struct sched_group *sg;
6803 struct sched_group_capacity *sgc; 6887 struct sched_group_capacity *sgc;
6804 6888
@@ -6809,6 +6893,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6809 6893
6810 *per_cpu_ptr(sdd->sd, j) = sd; 6894 *per_cpu_ptr(sdd->sd, j) = sd;
6811 6895
6896 sds = kzalloc_node(sizeof(struct sched_domain_shared),
6897 GFP_KERNEL, cpu_to_node(j));
6898 if (!sds)
6899 return -ENOMEM;
6900
6901 *per_cpu_ptr(sdd->sds, j) = sds;
6902
6812 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6903 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6813 GFP_KERNEL, cpu_to_node(j)); 6904 GFP_KERNEL, cpu_to_node(j));
6814 if (!sg) 6905 if (!sg)
@@ -6848,6 +6939,8 @@ static void __sdt_free(const struct cpumask *cpu_map)
6848 kfree(*per_cpu_ptr(sdd->sd, j)); 6939 kfree(*per_cpu_ptr(sdd->sd, j));
6849 } 6940 }
6850 6941
6942 if (sdd->sds)
6943 kfree(*per_cpu_ptr(sdd->sds, j));
6851 if (sdd->sg) 6944 if (sdd->sg)
6852 kfree(*per_cpu_ptr(sdd->sg, j)); 6945 kfree(*per_cpu_ptr(sdd->sg, j));
6853 if (sdd->sgc) 6946 if (sdd->sgc)
@@ -6855,6 +6948,8 @@ static void __sdt_free(const struct cpumask *cpu_map)
6855 } 6948 }
6856 free_percpu(sdd->sd); 6949 free_percpu(sdd->sd);
6857 sdd->sd = NULL; 6950 sdd->sd = NULL;
6951 free_percpu(sdd->sds);
6952 sdd->sds = NULL;
6858 free_percpu(sdd->sg); 6953 free_percpu(sdd->sg);
6859 sdd->sg = NULL; 6954 sdd->sg = NULL;
6860 free_percpu(sdd->sgc); 6955 free_percpu(sdd->sgc);
@@ -6866,16 +6961,12 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6866 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6961 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6867 struct sched_domain *child, int cpu) 6962 struct sched_domain *child, int cpu)
6868{ 6963{
6869 struct sched_domain *sd = sd_init(tl, cpu); 6964 struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
6870 if (!sd)
6871 return child;
6872 6965
6873 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6874 if (child) { 6966 if (child) {
6875 sd->level = child->level + 1; 6967 sd->level = child->level + 1;
6876 sched_domain_level_max = max(sched_domain_level_max, sd->level); 6968 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6877 child->parent = sd; 6969 child->parent = sd;
6878 sd->child = child;
6879 6970
6880 if (!cpumask_subset(sched_domain_span(child), 6971 if (!cpumask_subset(sched_domain_span(child),
6881 sched_domain_span(sd))) { 6972 sched_domain_span(sd))) {
@@ -6906,6 +6997,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
6906 enum s_alloc alloc_state; 6997 enum s_alloc alloc_state;
6907 struct sched_domain *sd; 6998 struct sched_domain *sd;
6908 struct s_data d; 6999 struct s_data d;
7000 struct rq *rq = NULL;
6909 int i, ret = -ENOMEM; 7001 int i, ret = -ENOMEM;
6910 7002
6911 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7003 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
@@ -6956,11 +7048,22 @@ static int build_sched_domains(const struct cpumask *cpu_map,
6956 /* Attach the domains */ 7048 /* Attach the domains */
6957 rcu_read_lock(); 7049 rcu_read_lock();
6958 for_each_cpu(i, cpu_map) { 7050 for_each_cpu(i, cpu_map) {
7051 rq = cpu_rq(i);
6959 sd = *per_cpu_ptr(d.sd, i); 7052 sd = *per_cpu_ptr(d.sd, i);
7053
7054 /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
7055 if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
7056 WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
7057
6960 cpu_attach_domain(sd, d.rd, i); 7058 cpu_attach_domain(sd, d.rd, i);
6961 } 7059 }
6962 rcu_read_unlock(); 7060 rcu_read_unlock();
6963 7061
7062 if (rq && sched_debug_enabled) {
7063 pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
7064 cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
7065 }
7066
6964 ret = 0; 7067 ret = 0;
6965error: 7068error:
6966 __free_domain_allocs(&d, alloc_state, cpu_map); 7069 __free_domain_allocs(&d, alloc_state, cpu_map);
@@ -7319,6 +7422,22 @@ int sched_cpu_dying(unsigned int cpu)
7319} 7422}
7320#endif 7423#endif
7321 7424
7425#ifdef CONFIG_SCHED_SMT
7426DEFINE_STATIC_KEY_FALSE(sched_smt_present);
7427
7428static void sched_init_smt(void)
7429{
7430 /*
7431 * We've enumerated all CPUs and will assume that if any CPU
7432 * has SMT siblings, CPU0 will too.
7433 */
7434 if (cpumask_weight(cpu_smt_mask(0)) > 1)
7435 static_branch_enable(&sched_smt_present);
7436}
7437#else
7438static inline void sched_init_smt(void) { }
7439#endif
7440
7322void __init sched_init_smp(void) 7441void __init sched_init_smp(void)
7323{ 7442{
7324 cpumask_var_t non_isolated_cpus; 7443 cpumask_var_t non_isolated_cpus;
@@ -7348,6 +7467,9 @@ void __init sched_init_smp(void)
7348 7467
7349 init_sched_rt_class(); 7468 init_sched_rt_class();
7350 init_sched_dl_class(); 7469 init_sched_dl_class();
7470
7471 sched_init_smt();
7472
7351 sched_smp_initialized = true; 7473 sched_smp_initialized = true;
7352} 7474}
7353 7475
@@ -7385,12 +7507,29 @@ static struct kmem_cache *task_group_cache __read_mostly;
7385#endif 7507#endif
7386 7508
7387DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); 7509DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
7510DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
7511
7512#define WAIT_TABLE_BITS 8
7513#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
7514static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
7515
7516wait_queue_head_t *bit_waitqueue(void *word, int bit)
7517{
7518 const int shift = BITS_PER_LONG == 32 ? 5 : 6;
7519 unsigned long val = (unsigned long)word << shift | bit;
7520
7521 return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
7522}
7523EXPORT_SYMBOL(bit_waitqueue);
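bit_waitqueue() above folds the word's address and the bit number into a single value and hashes it into one of 2^WAIT_TABLE_BITS wait-queue heads. The userspace approximation below shows only the bucket selection; hash_bucket() and the golden-ratio multiplier stand in for the kernel's hash_long(), and the shift of 6 assumes 64-bit longs.

#include <stdio.h>
#include <stdint.h>

#define TABLE_BITS 8	/* mirrors WAIT_TABLE_BITS: 256 buckets */

static unsigned int hash_bucket(const void *word, int bit)
{
	/* Mix the word address and bit number into one value, as above. */
	uint64_t val = ((uint64_t)(uintptr_t)word << 6) | (unsigned int)bit;

	/* Golden-ratio multiplicative hash; keep the top TABLE_BITS bits. */
	return (unsigned int)((val * 0x61C8864680B583EBULL) >> (64 - TABLE_BITS));
}

int main(void)
{
	long flags = 0;

	/* Different bits of the same word can land in different buckets. */
	printf("bit 0 -> bucket %u\n", hash_bucket(&flags, 0));
	printf("bit 5 -> bucket %u\n", hash_bucket(&flags, 5));
	return 0;
}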
7388 7524
7389void __init sched_init(void) 7525void __init sched_init(void)
7390{ 7526{
7391 int i, j; 7527 int i, j;
7392 unsigned long alloc_size = 0, ptr; 7528 unsigned long alloc_size = 0, ptr;
7393 7529
7530 for (i = 0; i < WAIT_TABLE_SIZE; i++)
7531 init_waitqueue_head(bit_wait_table + i);
7532
7394#ifdef CONFIG_FAIR_GROUP_SCHED 7533#ifdef CONFIG_FAIR_GROUP_SCHED
7395 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7534 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7396#endif 7535#endif
@@ -7421,6 +7560,8 @@ void __init sched_init(void)
7421 for_each_possible_cpu(i) { 7560 for_each_possible_cpu(i) {
7422 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( 7561 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
7423 cpumask_size(), GFP_KERNEL, cpu_to_node(i)); 7562 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
7563 per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
7564 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
7424 } 7565 }
7425#endif /* CONFIG_CPUMASK_OFFSTACK */ 7566#endif /* CONFIG_CPUMASK_OFFSTACK */
7426 7567
@@ -7523,10 +7664,6 @@ void __init sched_init(void)
7523 7664
7524 set_load_weight(&init_task); 7665 set_load_weight(&init_task);
7525 7666
7526#ifdef CONFIG_PREEMPT_NOTIFIERS
7527 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
7528#endif
7529
7530 /* 7667 /*
7531 * The boot idle thread does lazy MMU switching as well: 7668 * The boot idle thread does lazy MMU switching as well:
7532 */ 7669 */
@@ -7534,11 +7671,6 @@ void __init sched_init(void)
7534 enter_lazy_tlb(&init_mm, current); 7671 enter_lazy_tlb(&init_mm, current);
7535 7672
7536 /* 7673 /*
7537 * During early bootup we pretend to be a normal task:
7538 */
7539 current->sched_class = &fair_sched_class;
7540
7541 /*
7542 * Make us the idle thread. Technically, schedule() should not be 7674 * Make us the idle thread. Technically, schedule() should not be
7543 * called from this thread, however somewhere below it might be, 7675 * called from this thread, however somewhere below it might be,
7544 * but because we are the idle thread, we just pick up running again 7676 * but because we are the idle thread, we just pick up running again
@@ -7592,6 +7724,7 @@ EXPORT_SYMBOL(__might_sleep);
7592void ___might_sleep(const char *file, int line, int preempt_offset) 7724void ___might_sleep(const char *file, int line, int preempt_offset)
7593{ 7725{
7594 static unsigned long prev_jiffy; /* ratelimiting */ 7726 static unsigned long prev_jiffy; /* ratelimiting */
7727 unsigned long preempt_disable_ip;
7595 7728
7596 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ 7729 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
7597 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && 7730 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
@@ -7602,6 +7735,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
7602 return; 7735 return;
7603 prev_jiffy = jiffies; 7736 prev_jiffy = jiffies;
7604 7737
7738 /* Save this before calling printk(), since that will clobber it */
7739 preempt_disable_ip = get_preempt_disable_ip(current);
7740
7605 printk(KERN_ERR 7741 printk(KERN_ERR
7606 "BUG: sleeping function called from invalid context at %s:%d\n", 7742 "BUG: sleeping function called from invalid context at %s:%d\n",
7607 file, line); 7743 file, line);
@@ -7616,14 +7752,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
7616 debug_show_held_locks(current); 7752 debug_show_held_locks(current);
7617 if (irqs_disabled()) 7753 if (irqs_disabled())
7618 print_irqtrace_events(current); 7754 print_irqtrace_events(current);
7619#ifdef CONFIG_DEBUG_PREEMPT 7755 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
7620 if (!preempt_count_equals(preempt_offset)) { 7756 && !preempt_count_equals(preempt_offset)) {
7621 pr_err("Preemption disabled at:"); 7757 pr_err("Preemption disabled at:");
7622 print_ip_sym(current->preempt_disable_ip); 7758 print_ip_sym(preempt_disable_ip);
7623 pr_cont("\n"); 7759 pr_cont("\n");
7624 } 7760 }
7625#endif
7626 dump_stack(); 7761 dump_stack();
7762 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
7627} 7763}
7628EXPORT_SYMBOL(___might_sleep); 7764EXPORT_SYMBOL(___might_sleep);
7629#endif 7765#endif
@@ -7644,12 +7780,10 @@ void normalize_rt_tasks(void)
7644 if (p->flags & PF_KTHREAD) 7780 if (p->flags & PF_KTHREAD)
7645 continue; 7781 continue;
7646 7782
7647 p->se.exec_start = 0; 7783 p->se.exec_start = 0;
7648#ifdef CONFIG_SCHEDSTATS 7784 schedstat_set(p->se.statistics.wait_start, 0);
7649 p->se.statistics.wait_start = 0; 7785 schedstat_set(p->se.statistics.sleep_start, 0);
7650 p->se.statistics.sleep_start = 0; 7786 schedstat_set(p->se.statistics.block_start, 0);
7651 p->se.statistics.block_start = 0;
7652#endif
7653 7787
7654 if (!dl_task(p) && !rt_task(p)) { 7788 if (!dl_task(p) && !rt_task(p)) {
7655 /* 7789 /*
@@ -7710,7 +7844,7 @@ struct task_struct *curr_task(int cpu)
7710 * 7844 *
7711 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7845 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7712 */ 7846 */
7713void set_curr_task(int cpu, struct task_struct *p) 7847void ia64_set_curr_task(int cpu, struct task_struct *p)
7714{ 7848{
7715 cpu_curr(cpu) = p; 7849 cpu_curr(cpu) = p;
7716} 7850}
@@ -7841,10 +7975,10 @@ void sched_move_task(struct task_struct *tsk)
7841 7975
7842 sched_change_group(tsk, TASK_MOVE_GROUP); 7976 sched_change_group(tsk, TASK_MOVE_GROUP);
7843 7977
7844 if (unlikely(running))
7845 tsk->sched_class->set_curr_task(rq);
7846 if (queued) 7978 if (queued)
7847 enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); 7979 enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
7980 if (unlikely(running))
7981 set_curr_task(rq, tsk);
7848 7982
7849 task_rq_unlock(rq, tsk, &rf); 7983 task_rq_unlock(rq, tsk, &rf);
7850} 7984}
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index d4184498c9f5..e73119013c53 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -31,56 +31,81 @@ static inline int right_child(int i)
31 return (i << 1) + 2; 31 return (i << 1) + 2;
32} 32}
33 33
34static void cpudl_exchange(struct cpudl *cp, int a, int b) 34static void cpudl_heapify_down(struct cpudl *cp, int idx)
35{ 35{
36 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; 36 int l, r, largest;
37 37
38 swap(cp->elements[a].cpu, cp->elements[b].cpu); 38 int orig_cpu = cp->elements[idx].cpu;
39 swap(cp->elements[a].dl , cp->elements[b].dl ); 39 u64 orig_dl = cp->elements[idx].dl;
40 40
41 swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx); 41 if (left_child(idx) >= cp->size)
42} 42 return;
43
44static void cpudl_heapify(struct cpudl *cp, int idx)
45{
46 int l, r, largest;
47 43
48 /* adapted from lib/prio_heap.c */ 44 /* adapted from lib/prio_heap.c */
49 while(1) { 45 while(1) {
46 u64 largest_dl;
50 l = left_child(idx); 47 l = left_child(idx);
51 r = right_child(idx); 48 r = right_child(idx);
52 largest = idx; 49 largest = idx;
50 largest_dl = orig_dl;
53 51
54 if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, 52 if ((l < cp->size) && dl_time_before(orig_dl,
55 cp->elements[l].dl)) 53 cp->elements[l].dl)) {
56 largest = l; 54 largest = l;
57 if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, 55 largest_dl = cp->elements[l].dl;
58 cp->elements[r].dl)) 56 }
57 if ((r < cp->size) && dl_time_before(largest_dl,
58 cp->elements[r].dl))
59 largest = r; 59 largest = r;
60
60 if (largest == idx) 61 if (largest == idx)
61 break; 62 break;
62 63
63 /* Push idx down the heap one level and bump one up */ 64 /* pull largest child onto idx */
64 cpudl_exchange(cp, largest, idx); 65 cp->elements[idx].cpu = cp->elements[largest].cpu;
66 cp->elements[idx].dl = cp->elements[largest].dl;
67 cp->elements[cp->elements[idx].cpu].idx = idx;
65 idx = largest; 68 idx = largest;
66 } 69 }
70 /* actual push down of saved original values orig_* */
71 cp->elements[idx].cpu = orig_cpu;
72 cp->elements[idx].dl = orig_dl;
73 cp->elements[cp->elements[idx].cpu].idx = idx;
67} 74}
68 75
69static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) 76static void cpudl_heapify_up(struct cpudl *cp, int idx)
70{ 77{
71 WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); 78 int p;
72 79
73 if (dl_time_before(new_dl, cp->elements[idx].dl)) { 80 int orig_cpu = cp->elements[idx].cpu;
74 cp->elements[idx].dl = new_dl; 81 u64 orig_dl = cp->elements[idx].dl;
75 cpudl_heapify(cp, idx); 82
76 } else { 83 if (idx == 0)
77 cp->elements[idx].dl = new_dl; 84 return;
78 while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, 85
79 cp->elements[idx].dl)) { 86 do {
80 cpudl_exchange(cp, idx, parent(idx)); 87 p = parent(idx);
81 idx = parent(idx); 88 if (dl_time_before(orig_dl, cp->elements[p].dl))
82 } 89 break;
83 } 90 /* pull parent onto idx */
91 cp->elements[idx].cpu = cp->elements[p].cpu;
92 cp->elements[idx].dl = cp->elements[p].dl;
93 cp->elements[cp->elements[idx].cpu].idx = idx;
94 idx = p;
95 } while (idx != 0);
96 /* actual push up of saved original values orig_* */
97 cp->elements[idx].cpu = orig_cpu;
98 cp->elements[idx].dl = orig_dl;
99 cp->elements[cp->elements[idx].cpu].idx = idx;
100}
101
102static void cpudl_heapify(struct cpudl *cp, int idx)
103{
104 if (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
105 cp->elements[idx].dl))
106 cpudl_heapify_up(cp, idx);
107 else
108 cpudl_heapify_down(cp, idx);
84} 109}
85 110
86static inline int cpudl_maximum(struct cpudl *cp) 111static inline int cpudl_maximum(struct cpudl *cp)
@@ -120,16 +145,15 @@ out:
120} 145}
121 146
122/* 147/*
123 * cpudl_set - update the cpudl max-heap 148 * cpudl_clear - remove a cpu from the cpudl max-heap
124 * @cp: the cpudl max-heap context 149 * @cp: the cpudl max-heap context
125 * @cpu: the target cpu 150 * @cpu: the target cpu
126 * @dl: the new earliest deadline for this cpu
127 * 151 *
128 * Notes: assumes cpu_rq(cpu)->lock is locked 152 * Notes: assumes cpu_rq(cpu)->lock is locked
129 * 153 *
130 * Returns: (void) 154 * Returns: (void)
131 */ 155 */
132void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) 156void cpudl_clear(struct cpudl *cp, int cpu)
133{ 157{
134 int old_idx, new_cpu; 158 int old_idx, new_cpu;
135 unsigned long flags; 159 unsigned long flags;
@@ -137,47 +161,60 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
137 WARN_ON(!cpu_present(cpu)); 161 WARN_ON(!cpu_present(cpu));
138 162
139 raw_spin_lock_irqsave(&cp->lock, flags); 163 raw_spin_lock_irqsave(&cp->lock, flags);
164
140 old_idx = cp->elements[cpu].idx; 165 old_idx = cp->elements[cpu].idx;
141 if (!is_valid) { 166 if (old_idx == IDX_INVALID) {
142 /* remove item */ 167 /*
143 if (old_idx == IDX_INVALID) { 168 * Nothing to remove if old_idx was invalid.
144 /* 169 * This could happen if a rq_offline_dl is
145 * Nothing to remove if old_idx was invalid. 170 * called for a CPU without -dl tasks running.
146 * This could happen if a rq_offline_dl is 171 */
147 * called for a CPU without -dl tasks running. 172 } else {
148 */
149 goto out;
150 }
151 new_cpu = cp->elements[cp->size - 1].cpu; 173 new_cpu = cp->elements[cp->size - 1].cpu;
152 cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; 174 cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
153 cp->elements[old_idx].cpu = new_cpu; 175 cp->elements[old_idx].cpu = new_cpu;
154 cp->size--; 176 cp->size--;
155 cp->elements[new_cpu].idx = old_idx; 177 cp->elements[new_cpu].idx = old_idx;
156 cp->elements[cpu].idx = IDX_INVALID; 178 cp->elements[cpu].idx = IDX_INVALID;
157 while (old_idx > 0 && dl_time_before( 179 cpudl_heapify(cp, old_idx);
158 cp->elements[parent(old_idx)].dl,
159 cp->elements[old_idx].dl)) {
160 cpudl_exchange(cp, old_idx, parent(old_idx));
161 old_idx = parent(old_idx);
162 }
163 cpumask_set_cpu(cpu, cp->free_cpus);
164 cpudl_heapify(cp, old_idx);
165 180
166 goto out; 181 cpumask_set_cpu(cpu, cp->free_cpus);
167 } 182 }
183 raw_spin_unlock_irqrestore(&cp->lock, flags);
184}
185
186/*
187 * cpudl_set - update the cpudl max-heap
188 * @cp: the cpudl max-heap context
189 * @cpu: the target cpu
190 * @dl: the new earliest deadline for this cpu
191 *
192 * Notes: assumes cpu_rq(cpu)->lock is locked
193 *
194 * Returns: (void)
195 */
196void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
197{
198 int old_idx;
199 unsigned long flags;
168 200
201 WARN_ON(!cpu_present(cpu));
202
203 raw_spin_lock_irqsave(&cp->lock, flags);
204
205 old_idx = cp->elements[cpu].idx;
169 if (old_idx == IDX_INVALID) { 206 if (old_idx == IDX_INVALID) {
170 cp->size++; 207 int new_idx = cp->size++;
171 cp->elements[cp->size - 1].dl = dl; 208 cp->elements[new_idx].dl = dl;
172 cp->elements[cp->size - 1].cpu = cpu; 209 cp->elements[new_idx].cpu = cpu;
173 cp->elements[cpu].idx = cp->size - 1; 210 cp->elements[cpu].idx = new_idx;
174 cpudl_change_key(cp, cp->size - 1, dl); 211 cpudl_heapify_up(cp, new_idx);
175 cpumask_clear_cpu(cpu, cp->free_cpus); 212 cpumask_clear_cpu(cpu, cp->free_cpus);
176 } else { 213 } else {
177 cpudl_change_key(cp, old_idx, dl); 214 cp->elements[old_idx].dl = dl;
215 cpudl_heapify(cp, old_idx);
178 } 216 }
179 217
180out:
181 raw_spin_unlock_irqrestore(&cp->lock, flags); 218 raw_spin_unlock_irqrestore(&cp->lock, flags);
182} 219}
183 220
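The cpudeadline.c rewrite above replaces the swap-based cpudl_exchange()/cpudl_heapify() pair with sift-up/sift-down routines that save the displaced element once, pull children (or parents) towards the hole, and write the saved element back with a single final store. A self-contained sketch of that sift-down technique on a plain array max-heap; heap, nr and sift_down are names made up for this example.

#include <stdio.h>
#include <stdint.h>

static void sift_down(uint64_t *heap, int nr, int idx)
{
	uint64_t orig = heap[idx];	/* save the displaced key once */

	while (1) {
		int l = 2 * idx + 1;
		int r = l + 1;
		int largest = idx;
		uint64_t largest_key = orig;

		if (l < nr && heap[l] > largest_key) {
			largest = l;
			largest_key = heap[l];
		}
		if (r < nr && heap[r] > largest_key)
			largest = r;
		if (largest == idx)
			break;

		heap[idx] = heap[largest];	/* pull the child up, no swap */
		idx = largest;
	}
	heap[idx] = orig;			/* single final store */
}

int main(void)
{
	uint64_t heap[] = { 3, 90, 80, 40, 50, 70, 60 };
	int nr = sizeof(heap) / sizeof(heap[0]);

	sift_down(heap, nr, 0);		/* restore the heap property at the root */
	for (int i = 0; i < nr; i++)
		printf("%llu ", (unsigned long long)heap[i]);
	printf("\n");
	return 0;
}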
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index fcbdf83fed7e..f7da8c55bba0 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -23,7 +23,8 @@ struct cpudl {
23#ifdef CONFIG_SMP 23#ifdef CONFIG_SMP
24int cpudl_find(struct cpudl *cp, struct task_struct *p, 24int cpudl_find(struct cpudl *cp, struct task_struct *p,
25 struct cpumask *later_mask); 25 struct cpumask *later_mask);
26void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); 26void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
27void cpudl_clear(struct cpudl *cp, int cpu);
27int cpudl_init(struct cpudl *cp); 28int cpudl_init(struct cpudl *cp);
28void cpudl_set_freecpu(struct cpudl *cp, int cpu); 29void cpudl_set_freecpu(struct cpudl *cp, int cpu);
29void cpudl_clear_freecpu(struct cpudl *cp, int cpu); 30void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 1141954e73b4..dbc51442ecbc 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
33 */ 33 */
34void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, 34void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
35 void (*func)(struct update_util_data *data, u64 time, 35 void (*func)(struct update_util_data *data, u64 time,
36 unsigned long util, unsigned long max)) 36 unsigned int flags))
37{ 37{
38 if (WARN_ON(!data || !func)) 38 if (WARN_ON(!data || !func))
39 return; 39 return;
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index a84641b222c1..69e06898997d 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -12,7 +12,6 @@
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13 13
14#include <linux/cpufreq.h> 14#include <linux/cpufreq.h>
15#include <linux/module.h>
16#include <linux/slab.h> 15#include <linux/slab.h>
17#include <trace/events/power.h> 16#include <trace/events/power.h>
18 17
@@ -48,11 +47,14 @@ struct sugov_cpu {
48 struct sugov_policy *sg_policy; 47 struct sugov_policy *sg_policy;
49 48
50 unsigned int cached_raw_freq; 49 unsigned int cached_raw_freq;
50 unsigned long iowait_boost;
51 unsigned long iowait_boost_max;
52 u64 last_update;
51 53
52 /* The fields below are only needed when sharing a policy. */ 54 /* The fields below are only needed when sharing a policy. */
53 unsigned long util; 55 unsigned long util;
54 unsigned long max; 56 unsigned long max;
55 u64 last_update; 57 unsigned int flags;
56}; 58};
57 59
58static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); 60static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
@@ -144,24 +146,75 @@ static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util,
144 return cpufreq_driver_resolve_freq(policy, freq); 146 return cpufreq_driver_resolve_freq(policy, freq);
145} 147}
146 148
149static void sugov_get_util(unsigned long *util, unsigned long *max)
150{
151 struct rq *rq = this_rq();
152 unsigned long cfs_max;
153
154 cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
155
156 *util = min(rq->cfs.avg.util_avg, cfs_max);
157 *max = cfs_max;
158}
159
160static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
161 unsigned int flags)
162{
163 if (flags & SCHED_CPUFREQ_IOWAIT) {
164 sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
165 } else if (sg_cpu->iowait_boost) {
166 s64 delta_ns = time - sg_cpu->last_update;
167
 168 /* Clear iowait_boost if the CPU appears to have been idle. */
169 if (delta_ns > TICK_NSEC)
170 sg_cpu->iowait_boost = 0;
171 }
172}
173
174static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
175 unsigned long *max)
176{
177 unsigned long boost_util = sg_cpu->iowait_boost;
178 unsigned long boost_max = sg_cpu->iowait_boost_max;
179
180 if (!boost_util)
181 return;
182
183 if (*util * boost_max < *max * boost_util) {
184 *util = boost_util;
185 *max = boost_max;
186 }
187 sg_cpu->iowait_boost >>= 1;
188}
189
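sugov_set_iowait_boost() and sugov_iowait_boost() above implement a boost that jumps to iowait_boost_max when a wakeup carries SCHED_CPUFREQ_IOWAIT and is halved on each following update (and dropped entirely after a tick of idleness). The toy model below shows only the jump-and-halve behaviour; the util/max capacity scaling of the real comparison is deliberately omitted, and BOOST_MAX and apply_boost() are invented for this sketch.

#include <stdio.h>
#include <stdbool.h>

#define BOOST_MAX 1024UL

static unsigned long boost;

static unsigned long apply_boost(unsigned long util, bool iowait_wakeup)
{
	if (iowait_wakeup)
		boost = BOOST_MAX;	/* jump straight to the maximum */

	if (boost > util)
		util = boost;		/* the boost wins while it is larger */

	boost >>= 1;			/* decay by half on every update */
	return util;
}

int main(void)
{
	unsigned long util = 100;

	printf("update 0: %lu\n", apply_boost(util, true));
	for (int i = 1; i <= 5; i++)
		printf("update %d: %lu\n", i, apply_boost(util, false));
	return 0;	/* prints 1024, 512, 256, 128, 100, 100 */
}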
147static void sugov_update_single(struct update_util_data *hook, u64 time, 190static void sugov_update_single(struct update_util_data *hook, u64 time,
148 unsigned long util, unsigned long max) 191 unsigned int flags)
149{ 192{
150 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); 193 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
151 struct sugov_policy *sg_policy = sg_cpu->sg_policy; 194 struct sugov_policy *sg_policy = sg_cpu->sg_policy;
152 struct cpufreq_policy *policy = sg_policy->policy; 195 struct cpufreq_policy *policy = sg_policy->policy;
196 unsigned long util, max;
153 unsigned int next_f; 197 unsigned int next_f;
154 198
199 sugov_set_iowait_boost(sg_cpu, time, flags);
200 sg_cpu->last_update = time;
201
155 if (!sugov_should_update_freq(sg_policy, time)) 202 if (!sugov_should_update_freq(sg_policy, time))
156 return; 203 return;
157 204
158 next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq : 205 if (flags & SCHED_CPUFREQ_RT_DL) {
159 get_next_freq(sg_cpu, util, max); 206 next_f = policy->cpuinfo.max_freq;
207 } else {
208 sugov_get_util(&util, &max);
209 sugov_iowait_boost(sg_cpu, &util, &max);
210 next_f = get_next_freq(sg_cpu, util, max);
211 }
160 sugov_update_commit(sg_policy, time, next_f); 212 sugov_update_commit(sg_policy, time, next_f);
161} 213}
162 214
163static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, 215static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
164 unsigned long util, unsigned long max) 216 unsigned long util, unsigned long max,
217 unsigned int flags)
165{ 218{
166 struct sugov_policy *sg_policy = sg_cpu->sg_policy; 219 struct sugov_policy *sg_policy = sg_cpu->sg_policy;
167 struct cpufreq_policy *policy = sg_policy->policy; 220 struct cpufreq_policy *policy = sg_policy->policy;
@@ -169,9 +222,11 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
169 u64 last_freq_update_time = sg_policy->last_freq_update_time; 222 u64 last_freq_update_time = sg_policy->last_freq_update_time;
170 unsigned int j; 223 unsigned int j;
171 224
172 if (util == ULONG_MAX) 225 if (flags & SCHED_CPUFREQ_RT_DL)
173 return max_f; 226 return max_f;
174 227
228 sugov_iowait_boost(sg_cpu, &util, &max);
229
175 for_each_cpu(j, policy->cpus) { 230 for_each_cpu(j, policy->cpus) {
176 struct sugov_cpu *j_sg_cpu; 231 struct sugov_cpu *j_sg_cpu;
177 unsigned long j_util, j_max; 232 unsigned long j_util, j_max;
@@ -186,41 +241,50 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
186 * frequency update and the time elapsed between the last update 241 * frequency update and the time elapsed between the last update
187 * of the CPU utilization and the last frequency update is long 242 * of the CPU utilization and the last frequency update is long
188 * enough, don't take the CPU into account as it probably is 243 * enough, don't take the CPU into account as it probably is
189 * idle now. 244 * idle now (and clear iowait_boost for it).
190 */ 245 */
191 delta_ns = last_freq_update_time - j_sg_cpu->last_update; 246 delta_ns = last_freq_update_time - j_sg_cpu->last_update;
192 if (delta_ns > TICK_NSEC) 247 if (delta_ns > TICK_NSEC) {
248 j_sg_cpu->iowait_boost = 0;
193 continue; 249 continue;
194 250 }
195 j_util = j_sg_cpu->util; 251 if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
196 if (j_util == ULONG_MAX)
197 return max_f; 252 return max_f;
198 253
254 j_util = j_sg_cpu->util;
199 j_max = j_sg_cpu->max; 255 j_max = j_sg_cpu->max;
200 if (j_util * max > j_max * util) { 256 if (j_util * max > j_max * util) {
201 util = j_util; 257 util = j_util;
202 max = j_max; 258 max = j_max;
203 } 259 }
260
261 sugov_iowait_boost(j_sg_cpu, &util, &max);
204 } 262 }
205 263
206 return get_next_freq(sg_cpu, util, max); 264 return get_next_freq(sg_cpu, util, max);
207} 265}
208 266
209static void sugov_update_shared(struct update_util_data *hook, u64 time, 267static void sugov_update_shared(struct update_util_data *hook, u64 time,
210 unsigned long util, unsigned long max) 268 unsigned int flags)
211{ 269{
212 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); 270 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
213 struct sugov_policy *sg_policy = sg_cpu->sg_policy; 271 struct sugov_policy *sg_policy = sg_cpu->sg_policy;
272 unsigned long util, max;
214 unsigned int next_f; 273 unsigned int next_f;
215 274
275 sugov_get_util(&util, &max);
276
216 raw_spin_lock(&sg_policy->update_lock); 277 raw_spin_lock(&sg_policy->update_lock);
217 278
218 sg_cpu->util = util; 279 sg_cpu->util = util;
219 sg_cpu->max = max; 280 sg_cpu->max = max;
281 sg_cpu->flags = flags;
282
283 sugov_set_iowait_boost(sg_cpu, time, flags);
220 sg_cpu->last_update = time; 284 sg_cpu->last_update = time;
221 285
222 if (sugov_should_update_freq(sg_policy, time)) { 286 if (sugov_should_update_freq(sg_policy, time)) {
223 next_f = sugov_next_freq_shared(sg_cpu, util, max); 287 next_f = sugov_next_freq_shared(sg_cpu, util, max, flags);
224 sugov_update_commit(sg_policy, time, next_f); 288 sugov_update_commit(sg_policy, time, next_f);
225 } 289 }
226 290
@@ -444,10 +508,13 @@ static int sugov_start(struct cpufreq_policy *policy)
444 508
445 sg_cpu->sg_policy = sg_policy; 509 sg_cpu->sg_policy = sg_policy;
446 if (policy_is_shared(policy)) { 510 if (policy_is_shared(policy)) {
447 sg_cpu->util = ULONG_MAX; 511 sg_cpu->util = 0;
448 sg_cpu->max = 0; 512 sg_cpu->max = 0;
513 sg_cpu->flags = SCHED_CPUFREQ_RT;
449 sg_cpu->last_update = 0; 514 sg_cpu->last_update = 0;
450 sg_cpu->cached_raw_freq = 0; 515 sg_cpu->cached_raw_freq = 0;
516 sg_cpu->iowait_boost = 0;
517 sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
451 cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, 518 cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
452 sugov_update_shared); 519 sugov_update_shared);
453 } else { 520 } else {
@@ -495,28 +562,15 @@ static struct cpufreq_governor schedutil_gov = {
495 .limits = sugov_limits, 562 .limits = sugov_limits,
496}; 563};
497 564
498static int __init sugov_module_init(void)
499{
500 return cpufreq_register_governor(&schedutil_gov);
501}
502
503static void __exit sugov_module_exit(void)
504{
505 cpufreq_unregister_governor(&schedutil_gov);
506}
507
508MODULE_AUTHOR("Rafael J. Wysocki <rafael.j.wysocki@intel.com>");
509MODULE_DESCRIPTION("Utilization-based CPU frequency selection");
510MODULE_LICENSE("GPL");
511
512#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL 565#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
513struct cpufreq_governor *cpufreq_default_governor(void) 566struct cpufreq_governor *cpufreq_default_governor(void)
514{ 567{
515 return &schedutil_gov; 568 return &schedutil_gov;
516} 569}
517
518fs_initcall(sugov_module_init);
519#else
520module_init(sugov_module_init);
521#endif 570#endif
522module_exit(sugov_module_exit); 571
572static int __init sugov_register(void)
573{
574 return cpufreq_register_governor(&schedutil_gov);
575}
576fs_initcall(sugov_register);
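
The iowait boost added to cpufreq_schedutil.c above is easier to see in isolation. Below is a minimal userspace sketch of the policy, not the kernel code: the field names mirror the patch, but the tick length, flag value and boost maximum are stand-ins chosen only for the example.

#include <stdint.h>
#include <stdio.h>

#define TICK_NSEC_SKETCH		4000000ULL	/* 250 HZ tick, illustration only */
#define SCHED_CPUFREQ_IOWAIT_SKETCH	0x1		/* stand-in flag value */

struct sg_cpu_sketch {
	unsigned long iowait_boost;
	unsigned long iowait_boost_max;
	uint64_t last_update;
};

/* A wakeup from I/O wait jumps the boost to its maximum; a CPU that looks
 * idle for more than a tick loses the boost entirely. */
static void set_iowait_boost(struct sg_cpu_sketch *sg, uint64_t time,
			     unsigned int flags)
{
	if (flags & SCHED_CPUFREQ_IOWAIT_SKETCH)
		sg->iowait_boost = sg->iowait_boost_max;
	else if (sg->iowait_boost && time - sg->last_update > TICK_NSEC_SKETCH)
		sg->iowait_boost = 0;
	sg->last_update = time;
}

/* The boost is only taken when it implies a higher util/max ratio than the
 * measured utilization, and it decays by half on every use. */
static void apply_iowait_boost(struct sg_cpu_sketch *sg,
			       unsigned long *util, unsigned long *max)
{
	unsigned long boost_util = sg->iowait_boost;
	unsigned long boost_max = sg->iowait_boost_max;

	if (!boost_util)
		return;
	if (*util * boost_max < *max * boost_util) {
		*util = boost_util;
		*max = boost_max;
	}
	sg->iowait_boost >>= 1;
}

int main(void)
{
	struct sg_cpu_sketch sg = { .iowait_boost_max = 1024 };
	unsigned long util = 200, max = 1024;

	set_iowait_boost(&sg, 1000, SCHED_CPUFREQ_IOWAIT_SKETCH);
	apply_iowait_boost(&sg, &util, &max);
	printf("util=%lu max=%lu next_boost=%lu\n", util, max, sg.iowait_boost);
	return 0;
}

In the shared-policy path above the same boost helper runs for each CPU inside the loop, so a recently boosted sibling can still lift the frequency selected for the whole policy.
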
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a846cf89eb96..5ebee3164e64 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -23,10 +23,8 @@
23 * task when irq is in progress while we read rq->clock. That is a worthy 23 * task when irq is in progress while we read rq->clock. That is a worthy
24 * compromise in place of having locks on each irq in account_system_time. 24 * compromise in place of having locks on each irq in account_system_time.
25 */ 25 */
26DEFINE_PER_CPU(u64, cpu_hardirq_time); 26DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
27DEFINE_PER_CPU(u64, cpu_softirq_time);
28 27
29static DEFINE_PER_CPU(u64, irq_start_time);
30static int sched_clock_irqtime; 28static int sched_clock_irqtime;
31 29
32void enable_sched_clock_irqtime(void) 30void enable_sched_clock_irqtime(void)
@@ -39,16 +37,13 @@ void disable_sched_clock_irqtime(void)
39 sched_clock_irqtime = 0; 37 sched_clock_irqtime = 0;
40} 38}
41 39
42#ifndef CONFIG_64BIT
43DEFINE_PER_CPU(seqcount_t, irq_time_seq);
44#endif /* CONFIG_64BIT */
45
46/* 40/*
47 * Called before incrementing preempt_count on {soft,}irq_enter 41 * Called before incrementing preempt_count on {soft,}irq_enter
48 * and before decrementing preempt_count on {soft,}irq_exit. 42 * and before decrementing preempt_count on {soft,}irq_exit.
49 */ 43 */
50void irqtime_account_irq(struct task_struct *curr) 44void irqtime_account_irq(struct task_struct *curr)
51{ 45{
46 struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
52 s64 delta; 47 s64 delta;
53 int cpu; 48 int cpu;
54 49
@@ -56,10 +51,10 @@ void irqtime_account_irq(struct task_struct *curr)
56 return; 51 return;
57 52
58 cpu = smp_processor_id(); 53 cpu = smp_processor_id();
59 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); 54 delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
60 __this_cpu_add(irq_start_time, delta); 55 irqtime->irq_start_time += delta;
61 56
62 irq_time_write_begin(); 57 u64_stats_update_begin(&irqtime->sync);
63 /* 58 /*
64 * We do not account for softirq time from ksoftirqd here. 59 * We do not account for softirq time from ksoftirqd here.
65 * We want to continue accounting softirq time to ksoftirqd thread 60 * We want to continue accounting softirq time to ksoftirqd thread
@@ -67,42 +62,36 @@ void irqtime_account_irq(struct task_struct *curr)
67 * that do not consume any time, but still wants to run. 62 * that do not consume any time, but still wants to run.
68 */ 63 */
69 if (hardirq_count()) 64 if (hardirq_count())
70 __this_cpu_add(cpu_hardirq_time, delta); 65 irqtime->hardirq_time += delta;
71 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) 66 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
72 __this_cpu_add(cpu_softirq_time, delta); 67 irqtime->softirq_time += delta;
73 68
74 irq_time_write_end(); 69 u64_stats_update_end(&irqtime->sync);
75} 70}
76EXPORT_SYMBOL_GPL(irqtime_account_irq); 71EXPORT_SYMBOL_GPL(irqtime_account_irq);
77 72
78static cputime_t irqtime_account_hi_update(cputime_t maxtime) 73static cputime_t irqtime_account_update(u64 irqtime, int idx, cputime_t maxtime)
79{ 74{
80 u64 *cpustat = kcpustat_this_cpu->cpustat; 75 u64 *cpustat = kcpustat_this_cpu->cpustat;
81 unsigned long flags;
82 cputime_t irq_cputime; 76 cputime_t irq_cputime;
83 77
84 local_irq_save(flags); 78 irq_cputime = nsecs_to_cputime64(irqtime) - cpustat[idx];
85 irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) -
86 cpustat[CPUTIME_IRQ];
87 irq_cputime = min(irq_cputime, maxtime); 79 irq_cputime = min(irq_cputime, maxtime);
88 cpustat[CPUTIME_IRQ] += irq_cputime; 80 cpustat[idx] += irq_cputime;
89 local_irq_restore(flags); 81
90 return irq_cputime; 82 return irq_cputime;
91} 83}
92 84
93static cputime_t irqtime_account_si_update(cputime_t maxtime) 85static cputime_t irqtime_account_hi_update(cputime_t maxtime)
94{ 86{
95 u64 *cpustat = kcpustat_this_cpu->cpustat; 87 return irqtime_account_update(__this_cpu_read(cpu_irqtime.hardirq_time),
96 unsigned long flags; 88 CPUTIME_IRQ, maxtime);
97 cputime_t softirq_cputime; 89}
98 90
99 local_irq_save(flags); 91static cputime_t irqtime_account_si_update(cputime_t maxtime)
100 softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) - 92{
101 cpustat[CPUTIME_SOFTIRQ]; 93 return irqtime_account_update(__this_cpu_read(cpu_irqtime.softirq_time),
102 softirq_cputime = min(softirq_cputime, maxtime); 94 CPUTIME_SOFTIRQ, maxtime);
103 cpustat[CPUTIME_SOFTIRQ] += softirq_cputime;
104 local_irq_restore(flags);
105 return softirq_cputime;
106} 95}
107 96
108#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 97#else /* CONFIG_IRQ_TIME_ACCOUNTING */
@@ -295,6 +284,9 @@ static inline cputime_t account_other_time(cputime_t max)
295{ 284{
296 cputime_t accounted; 285 cputime_t accounted;
297 286
287 /* Shall be converted to a lockdep-enabled lightweight check */
288 WARN_ON_ONCE(!irqs_disabled());
289
298 accounted = steal_account_process_time(max); 290 accounted = steal_account_process_time(max);
299 291
300 if (accounted < max) 292 if (accounted < max)
@@ -306,6 +298,26 @@ static inline cputime_t account_other_time(cputime_t max)
306 return accounted; 298 return accounted;
307} 299}
308 300
301#ifdef CONFIG_64BIT
302static inline u64 read_sum_exec_runtime(struct task_struct *t)
303{
304 return t->se.sum_exec_runtime;
305}
306#else
307static u64 read_sum_exec_runtime(struct task_struct *t)
308{
309 u64 ns;
310 struct rq_flags rf;
311 struct rq *rq;
312
313 rq = task_rq_lock(t, &rf);
314 ns = t->se.sum_exec_runtime;
315 task_rq_unlock(rq, t, &rf);
316
317 return ns;
318}
319#endif
320
309/* 321/*
310 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live 322 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
311 * tasks (sum on group iteration) belonging to @tsk's group. 323 * tasks (sum on group iteration) belonging to @tsk's group.
@@ -318,6 +330,17 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
318 unsigned int seq, nextseq; 330 unsigned int seq, nextseq;
319 unsigned long flags; 331 unsigned long flags;
320 332
333 /*
334 * Update current task runtime to account pending time since last
335 * scheduler action or thread_group_cputime() call. This thread group
336 * might have other running tasks on different CPUs, but updating
337 * their runtime can affect syscall performance, so we skip accounting
338 * those pending times and rely only on values updated on tick or
339 * other scheduler action.
340 */
341 if (same_thread_group(current, tsk))
342 (void) task_sched_runtime(current);
343
321 rcu_read_lock(); 344 rcu_read_lock();
322 /* Attempt a lockless read on the first round. */ 345 /* Attempt a lockless read on the first round. */
323 nextseq = 0; 346 nextseq = 0;
@@ -332,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
332 task_cputime(t, &utime, &stime); 355 task_cputime(t, &utime, &stime);
333 times->utime += utime; 356 times->utime += utime;
334 times->stime += stime; 357 times->stime += stime;
335 times->sum_exec_runtime += task_sched_runtime(t); 358 times->sum_exec_runtime += read_sum_exec_runtime(t);
336 } 359 }
337 /* If lockless access failed, take the lock. */ 360 /* If lockless access failed, take the lock. */
338 nextseq = 1; 361 nextseq = 1;
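
Two of the cputime.c changes above lean on the same idea: a 64-bit counter cannot be read atomically on a 32-bit machine, so either the reader retries around a sequence counter (what u64_stats_update_begin()/end() provide for the new struct irqtime) or it takes a lock (what the !CONFIG_64BIT read_sum_exec_runtime() does via task_rq_lock()). A bare-bones userspace sketch of the retry pattern; the memory barriers the real u64_stats helpers supply are omitted here for brevity.

#include <stdint.h>

struct irqtime_sketch {
	uint64_t hardirq_time;
	uint64_t softirq_time;
	unsigned int seq;	/* stands in for u64_stats_sync */
};

/* Writer: keep the sequence odd while the 64-bit fields are in flux. */
void irqtime_add(struct irqtime_sketch *it, uint64_t delta, int hardirq)
{
	it->seq++;				/* odd: update in progress */
	if (hardirq)
		it->hardirq_time += delta;
	else
		it->softirq_time += delta;
	it->seq++;				/* even: update complete */
}

/* Reader: retry until a stable, even sequence brackets the load. */
uint64_t irqtime_read(const struct irqtime_sketch *it, int hardirq)
{
	unsigned int start;
	uint64_t val;

	do {
		start = it->seq;
		val = hardirq ? it->hardirq_time : it->softirq_time;
	} while ((start & 1) || it->seq != start);

	return val;
}

The local_irq_save()/restore() pairs in the old hi/si update helpers could be dropped because their callers already run with interrupts disabled, which is exactly what the new WARN_ON_ONCE(!irqs_disabled()) in account_other_time() asserts.
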
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 1ce8867283dc..37e2449186c4 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -243,10 +243,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
243static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p) 243static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
244{ 244{
245 struct rq *later_rq = NULL; 245 struct rq *later_rq = NULL;
246 bool fallback = false;
247 246
248 later_rq = find_lock_later_rq(p, rq); 247 later_rq = find_lock_later_rq(p, rq);
249
250 if (!later_rq) { 248 if (!later_rq) {
251 int cpu; 249 int cpu;
252 250
@@ -254,7 +252,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
254 * If we cannot preempt any rq, fall back to pick any 252 * If we cannot preempt any rq, fall back to pick any
255 * online cpu. 253 * online cpu.
256 */ 254 */
257 fallback = true;
258 cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); 255 cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
259 if (cpu >= nr_cpu_ids) { 256 if (cpu >= nr_cpu_ids) {
260 /* 257 /*
@@ -274,16 +271,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
274 double_lock_balance(rq, later_rq); 271 double_lock_balance(rq, later_rq);
275 } 272 }
276 273
277 /*
278 * By now the task is replenished and enqueued; migrate it.
279 */
280 deactivate_task(rq, p, 0);
281 set_task_cpu(p, later_rq->cpu); 274 set_task_cpu(p, later_rq->cpu);
282 activate_task(later_rq, p, 0);
283
284 if (!fallback)
285 resched_curr(later_rq);
286
287 double_unlock_balance(later_rq, rq); 275 double_unlock_balance(later_rq, rq);
288 276
289 return later_rq; 277 return later_rq;
@@ -346,12 +334,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
346 * one, and to (try to!) reconcile itself with its own scheduling 334 * one, and to (try to!) reconcile itself with its own scheduling
347 * parameters. 335 * parameters.
348 */ 336 */
349static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, 337static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
350 struct sched_dl_entity *pi_se)
351{ 338{
352 struct dl_rq *dl_rq = dl_rq_of_se(dl_se); 339 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
353 struct rq *rq = rq_of_dl_rq(dl_rq); 340 struct rq *rq = rq_of_dl_rq(dl_rq);
354 341
342 WARN_ON(dl_se->dl_boosted);
355 WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); 343 WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
356 344
357 /* 345 /*
@@ -367,8 +355,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
367 * future; in fact, we must consider execution overheads (time 355 * future; in fact, we must consider execution overheads (time
368 * spent on hardirq context, etc.). 356 * spent on hardirq context, etc.).
369 */ 357 */
370 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; 358 dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline;
371 dl_se->runtime = pi_se->dl_runtime; 359 dl_se->runtime = dl_se->dl_runtime;
372} 360}
373 361
374/* 362/*
@@ -641,29 +629,31 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
641 goto unlock; 629 goto unlock;
642 } 630 }
643 631
644 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
645 if (dl_task(rq->curr))
646 check_preempt_curr_dl(rq, p, 0);
647 else
648 resched_curr(rq);
649
650#ifdef CONFIG_SMP 632#ifdef CONFIG_SMP
651 /*
652 * Perform balancing operations here; after the replenishments. We
653 * cannot drop rq->lock before this, otherwise the assertion in
654 * start_dl_timer() about not missing updates is not true.
655 *
656 * If we find that the rq the task was on is no longer available, we
657 * need to select a new rq.
658 *
659 * XXX figure out if select_task_rq_dl() deals with offline cpus.
660 */
661 if (unlikely(!rq->online)) { 633 if (unlikely(!rq->online)) {
634 /*
635 * If the runqueue is no longer available, migrate the
636 * task elsewhere. This necessarily changes rq.
637 */
662 lockdep_unpin_lock(&rq->lock, rf.cookie); 638 lockdep_unpin_lock(&rq->lock, rf.cookie);
663 rq = dl_task_offline_migration(rq, p); 639 rq = dl_task_offline_migration(rq, p);
664 rf.cookie = lockdep_pin_lock(&rq->lock); 640 rf.cookie = lockdep_pin_lock(&rq->lock);
641
642 /*
643 * Now that the task has been migrated to the new RQ and we
644 * have that locked, proceed as normal and enqueue the task
645 * there.
646 */
665 } 647 }
648#endif
649
650 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
651 if (dl_task(rq->curr))
652 check_preempt_curr_dl(rq, p, 0);
653 else
654 resched_curr(rq);
666 655
656#ifdef CONFIG_SMP
667 /* 657 /*
668 * Queueing this task back might have overloaded rq, check if we need 658 * Queueing this task back might have overloaded rq, check if we need
669 * to kick someone away. 659 * to kick someone away.
@@ -735,9 +725,8 @@ static void update_curr_dl(struct rq *rq)
735 return; 725 return;
736 } 726 }
737 727
738 /* kick cpufreq (see the comment in linux/cpufreq.h). */ 728 /* kick cpufreq (see the comment in kernel/sched/sched.h). */
739 if (cpu_of(rq) == smp_processor_id()) 729 cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL);
740 cpufreq_trigger_update(rq_clock(rq));
741 730
742 schedstat_set(curr->se.statistics.exec_max, 731 schedstat_set(curr->se.statistics.exec_max,
743 max(curr->se.statistics.exec_max, delta_exec)); 732 max(curr->se.statistics.exec_max, delta_exec));
@@ -798,7 +787,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
798 if (dl_rq->earliest_dl.curr == 0 || 787 if (dl_rq->earliest_dl.curr == 0 ||
799 dl_time_before(deadline, dl_rq->earliest_dl.curr)) { 788 dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
800 dl_rq->earliest_dl.curr = deadline; 789 dl_rq->earliest_dl.curr = deadline;
801 cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); 790 cpudl_set(&rq->rd->cpudl, rq->cpu, deadline);
802 } 791 }
803} 792}
804 793
@@ -813,14 +802,14 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
813 if (!dl_rq->dl_nr_running) { 802 if (!dl_rq->dl_nr_running) {
814 dl_rq->earliest_dl.curr = 0; 803 dl_rq->earliest_dl.curr = 0;
815 dl_rq->earliest_dl.next = 0; 804 dl_rq->earliest_dl.next = 0;
816 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); 805 cpudl_clear(&rq->rd->cpudl, rq->cpu);
817 } else { 806 } else {
818 struct rb_node *leftmost = dl_rq->rb_leftmost; 807 struct rb_node *leftmost = dl_rq->rb_leftmost;
819 struct sched_dl_entity *entry; 808 struct sched_dl_entity *entry;
820 809
821 entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); 810 entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
822 dl_rq->earliest_dl.curr = entry->deadline; 811 dl_rq->earliest_dl.curr = entry->deadline;
823 cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); 812 cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline);
824 } 813 }
825} 814}
826 815
@@ -1671,7 +1660,7 @@ static void rq_online_dl(struct rq *rq)
1671 1660
1672 cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); 1661 cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
1673 if (rq->dl.dl_nr_running > 0) 1662 if (rq->dl.dl_nr_running > 0)
1674 cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); 1663 cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr);
1675} 1664}
1676 1665
1677/* Assumes rq->lock is held */ 1666/* Assumes rq->lock is held */
@@ -1680,7 +1669,7 @@ static void rq_offline_dl(struct rq *rq)
1680 if (rq->dl.overloaded) 1669 if (rq->dl.overloaded)
1681 dl_clear_overload(rq); 1670 dl_clear_overload(rq);
1682 1671
1683 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); 1672 cpudl_clear(&rq->rd->cpudl, rq->cpu);
1684 cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); 1673 cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
1685} 1674}
1686 1675
@@ -1723,10 +1712,20 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
1723 */ 1712 */
1724static void switched_to_dl(struct rq *rq, struct task_struct *p) 1713static void switched_to_dl(struct rq *rq, struct task_struct *p)
1725{ 1714{
1715
1716 /* If p is not queued we will update its parameters at next wakeup. */
1717 if (!task_on_rq_queued(p))
1718 return;
1719
1720 /*
1721 * If p is boosted we already updated its params in
1722 * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
1723 * p's deadline being now already after rq_clock(rq).
1724 */
1726 if (dl_time_before(p->dl.deadline, rq_clock(rq))) 1725 if (dl_time_before(p->dl.deadline, rq_clock(rq)))
1727 setup_new_dl_entity(&p->dl, &p->dl); 1726 setup_new_dl_entity(&p->dl);
1728 1727
1729 if (task_on_rq_queued(p) && rq->curr != p) { 1728 if (rq->curr != p) {
1730#ifdef CONFIG_SMP 1729#ifdef CONFIG_SMP
1731 if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) 1730 if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
1732 queue_push_tasks(rq); 1731 queue_push_tasks(rq);
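
The deadline.c hunks above repeatedly use dl_time_before() and now re-arm a task from its own dl_deadline/dl_runtime rather than from a priority-inheritance donor. A small standalone sketch of those two pieces, illustrative only, with the structure trimmed to the fields the logic needs:

#include <stdint.h>
#include <stdbool.h>

struct dl_sketch {
	uint64_t deadline;	/* absolute deadline, ns */
	uint64_t runtime;	/* remaining budget, ns */
	uint64_t dl_deadline;	/* relative deadline parameter */
	uint64_t dl_runtime;	/* per-period budget parameter */
};

/* Wrap-safe "a is before b", the same trick dl_time_before() uses. */
bool dl_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

/* Re-arm a stale entity from its own parameters, relative to now. */
void setup_new_dl_sketch(struct dl_sketch *dl, uint64_t now)
{
	dl->deadline = now + dl->dl_deadline;
	dl->runtime = dl->dl_runtime;
}

/* The switched_to_dl() policy above: only touch parameters that are stale. */
void switched_to_dl_sketch(struct dl_sketch *dl, uint64_t now)
{
	if (dl_before(dl->deadline, now))
		setup_new_dl_sketch(dl, now);
}

The boosted case never reaches this point, which is what the new WARN_ON(dl_se->dl_boosted) in setup_new_dl_entity() documents.
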
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2a0a9995256d..fa178b62ea79 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -369,8 +369,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
369 369
370#define P(F) \ 370#define P(F) \
371 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) 371 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
372#define P_SCHEDSTAT(F) \
373 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
372#define PN(F) \ 374#define PN(F) \
373 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) 375 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
376#define PN_SCHEDSTAT(F) \
377 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
374 378
375 if (!se) 379 if (!se)
376 return; 380 return;
@@ -378,26 +382,27 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
378 PN(se->exec_start); 382 PN(se->exec_start);
379 PN(se->vruntime); 383 PN(se->vruntime);
380 PN(se->sum_exec_runtime); 384 PN(se->sum_exec_runtime);
381#ifdef CONFIG_SCHEDSTATS
382 if (schedstat_enabled()) { 385 if (schedstat_enabled()) {
383 PN(se->statistics.wait_start); 386 PN_SCHEDSTAT(se->statistics.wait_start);
384 PN(se->statistics.sleep_start); 387 PN_SCHEDSTAT(se->statistics.sleep_start);
385 PN(se->statistics.block_start); 388 PN_SCHEDSTAT(se->statistics.block_start);
386 PN(se->statistics.sleep_max); 389 PN_SCHEDSTAT(se->statistics.sleep_max);
387 PN(se->statistics.block_max); 390 PN_SCHEDSTAT(se->statistics.block_max);
388 PN(se->statistics.exec_max); 391 PN_SCHEDSTAT(se->statistics.exec_max);
389 PN(se->statistics.slice_max); 392 PN_SCHEDSTAT(se->statistics.slice_max);
390 PN(se->statistics.wait_max); 393 PN_SCHEDSTAT(se->statistics.wait_max);
391 PN(se->statistics.wait_sum); 394 PN_SCHEDSTAT(se->statistics.wait_sum);
392 P(se->statistics.wait_count); 395 P_SCHEDSTAT(se->statistics.wait_count);
393 } 396 }
394#endif
395 P(se->load.weight); 397 P(se->load.weight);
396#ifdef CONFIG_SMP 398#ifdef CONFIG_SMP
397 P(se->avg.load_avg); 399 P(se->avg.load_avg);
398 P(se->avg.util_avg); 400 P(se->avg.util_avg);
399#endif 401#endif
402
403#undef PN_SCHEDSTAT
400#undef PN 404#undef PN
405#undef P_SCHEDSTAT
401#undef P 406#undef P
402} 407}
403#endif 408#endif
@@ -410,7 +415,8 @@ static char *task_group_path(struct task_group *tg)
410 if (autogroup_path(tg, group_path, PATH_MAX)) 415 if (autogroup_path(tg, group_path, PATH_MAX))
411 return group_path; 416 return group_path;
412 417
413 return cgroup_path(tg->css.cgroup, group_path, PATH_MAX); 418 cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
419 return group_path;
414} 420}
415#endif 421#endif
416 422
@@ -429,9 +435,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
429 p->prio); 435 p->prio);
430 436
431 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 437 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
432 SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)), 438 SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)),
433 SPLIT_NS(p->se.sum_exec_runtime), 439 SPLIT_NS(p->se.sum_exec_runtime),
434 SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime))); 440 SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime)));
435 441
436#ifdef CONFIG_NUMA_BALANCING 442#ifdef CONFIG_NUMA_BALANCING
437 SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); 443 SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
@@ -626,9 +632,7 @@ do { \
626#undef P64 632#undef P64
627#endif 633#endif
628 634
629#ifdef CONFIG_SCHEDSTATS 635#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n));
630#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
631
632 if (schedstat_enabled()) { 636 if (schedstat_enabled()) {
633 P(yld_count); 637 P(yld_count);
634 P(sched_count); 638 P(sched_count);
@@ -636,9 +640,8 @@ do { \
636 P(ttwu_count); 640 P(ttwu_count);
637 P(ttwu_local); 641 P(ttwu_local);
638 } 642 }
639
640#undef P 643#undef P
641#endif 644
642 spin_lock_irqsave(&sched_debug_lock, flags); 645 spin_lock_irqsave(&sched_debug_lock, flags);
643 print_cfs_stats(m, cpu); 646 print_cfs_stats(m, cpu);
644 print_rt_stats(m, cpu); 647 print_rt_stats(m, cpu);
@@ -868,10 +871,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
868 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) 871 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
869#define P(F) \ 872#define P(F) \
870 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) 873 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
874#define P_SCHEDSTAT(F) \
875 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F))
871#define __PN(F) \ 876#define __PN(F) \
872 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) 877 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
873#define PN(F) \ 878#define PN(F) \
874 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) 879 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
880#define PN_SCHEDSTAT(F) \
881 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F)))
875 882
876 PN(se.exec_start); 883 PN(se.exec_start);
877 PN(se.vruntime); 884 PN(se.vruntime);
@@ -881,37 +888,36 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
881 888
882 P(se.nr_migrations); 889 P(se.nr_migrations);
883 890
884#ifdef CONFIG_SCHEDSTATS
885 if (schedstat_enabled()) { 891 if (schedstat_enabled()) {
886 u64 avg_atom, avg_per_cpu; 892 u64 avg_atom, avg_per_cpu;
887 893
888 PN(se.statistics.sum_sleep_runtime); 894 PN_SCHEDSTAT(se.statistics.sum_sleep_runtime);
889 PN(se.statistics.wait_start); 895 PN_SCHEDSTAT(se.statistics.wait_start);
890 PN(se.statistics.sleep_start); 896 PN_SCHEDSTAT(se.statistics.sleep_start);
891 PN(se.statistics.block_start); 897 PN_SCHEDSTAT(se.statistics.block_start);
892 PN(se.statistics.sleep_max); 898 PN_SCHEDSTAT(se.statistics.sleep_max);
893 PN(se.statistics.block_max); 899 PN_SCHEDSTAT(se.statistics.block_max);
894 PN(se.statistics.exec_max); 900 PN_SCHEDSTAT(se.statistics.exec_max);
895 PN(se.statistics.slice_max); 901 PN_SCHEDSTAT(se.statistics.slice_max);
896 PN(se.statistics.wait_max); 902 PN_SCHEDSTAT(se.statistics.wait_max);
897 PN(se.statistics.wait_sum); 903 PN_SCHEDSTAT(se.statistics.wait_sum);
898 P(se.statistics.wait_count); 904 P_SCHEDSTAT(se.statistics.wait_count);
899 PN(se.statistics.iowait_sum); 905 PN_SCHEDSTAT(se.statistics.iowait_sum);
900 P(se.statistics.iowait_count); 906 P_SCHEDSTAT(se.statistics.iowait_count);
901 P(se.statistics.nr_migrations_cold); 907 P_SCHEDSTAT(se.statistics.nr_migrations_cold);
902 P(se.statistics.nr_failed_migrations_affine); 908 P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine);
903 P(se.statistics.nr_failed_migrations_running); 909 P_SCHEDSTAT(se.statistics.nr_failed_migrations_running);
904 P(se.statistics.nr_failed_migrations_hot); 910 P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot);
905 P(se.statistics.nr_forced_migrations); 911 P_SCHEDSTAT(se.statistics.nr_forced_migrations);
906 P(se.statistics.nr_wakeups); 912 P_SCHEDSTAT(se.statistics.nr_wakeups);
907 P(se.statistics.nr_wakeups_sync); 913 P_SCHEDSTAT(se.statistics.nr_wakeups_sync);
908 P(se.statistics.nr_wakeups_migrate); 914 P_SCHEDSTAT(se.statistics.nr_wakeups_migrate);
909 P(se.statistics.nr_wakeups_local); 915 P_SCHEDSTAT(se.statistics.nr_wakeups_local);
910 P(se.statistics.nr_wakeups_remote); 916 P_SCHEDSTAT(se.statistics.nr_wakeups_remote);
911 P(se.statistics.nr_wakeups_affine); 917 P_SCHEDSTAT(se.statistics.nr_wakeups_affine);
912 P(se.statistics.nr_wakeups_affine_attempts); 918 P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts);
913 P(se.statistics.nr_wakeups_passive); 919 P_SCHEDSTAT(se.statistics.nr_wakeups_passive);
914 P(se.statistics.nr_wakeups_idle); 920 P_SCHEDSTAT(se.statistics.nr_wakeups_idle);
915 921
916 avg_atom = p->se.sum_exec_runtime; 922 avg_atom = p->se.sum_exec_runtime;
917 if (nr_switches) 923 if (nr_switches)
@@ -930,7 +936,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
930 __PN(avg_atom); 936 __PN(avg_atom);
931 __PN(avg_per_cpu); 937 __PN(avg_per_cpu);
932 } 938 }
933#endif 939
934 __P(nr_switches); 940 __P(nr_switches);
935 SEQ_printf(m, "%-45s:%21Ld\n", 941 SEQ_printf(m, "%-45s:%21Ld\n",
936 "nr_voluntary_switches", (long long)p->nvcsw); 942 "nr_voluntary_switches", (long long)p->nvcsw);
@@ -947,8 +953,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
947#endif 953#endif
948 P(policy); 954 P(policy);
949 P(prio); 955 P(prio);
956#undef PN_SCHEDSTAT
950#undef PN 957#undef PN
951#undef __PN 958#undef __PN
959#undef P_SCHEDSTAT
952#undef P 960#undef P
953#undef __P 961#undef __P
954 962
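
The debug.c conversion above works because the schedstat accessors expand to either a real field access or a constant, so the CONFIG_SCHEDSTATS #ifdef blocks can disappear from the callers. A compile-time sketch of that idiom; the exact kernel definitions live in kernel/sched/stats.h and differ in detail, so the names here carry a _sketch suffix.

#include <stdio.h>

#define SCHEDSTATS_SKETCH 1	/* flip to 0 to watch the macros compile away */

#if SCHEDSTATS_SKETCH
# define schedstat_val_sketch(var)	(var)
# define schedstat_set_sketch(var, val)	((var) = (val))
# define schedstat_inc_sketch(var)	((var)++)
#else
# define schedstat_val_sketch(var)	0
# define schedstat_set_sketch(var, val)	do { } while (0)
# define schedstat_inc_sketch(var)	do { } while (0)
#endif

struct wait_stats_sketch {
	unsigned long long wait_sum;
	unsigned long long wait_count;
};

int main(void)
{
	struct wait_stats_sketch st = { 0, 0 };

	schedstat_set_sketch(st.wait_sum, 42);
	schedstat_inc_sketch(st.wait_count);
	printf("wait_sum=%llu wait_count=%llu\n",
	       (unsigned long long)schedstat_val_sketch(st.wait_sum),
	       (unsigned long long)schedstat_val_sketch(st.wait_count));
	return 0;
}

The schedstat_val_or_zero() variant used in print_task() additionally guards the access with the runtime schedstat_enabled() check, so a kernel with stats compiled in but disabled reports zeroes rather than stale values.
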
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 039de34f1521..c242944f5cbd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -114,6 +114,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
114unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; 114unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
115#endif 115#endif
116 116
117/*
118 * The margin used when comparing utilization with CPU capacity:
119 * util * 1024 < capacity * margin
120 */
121unsigned int capacity_margin = 1280; /* ~20% */
122
117static inline void update_load_add(struct load_weight *lw, unsigned long inc) 123static inline void update_load_add(struct load_weight *lw, unsigned long inc)
118{ 124{
119 lw->weight += inc; 125 lw->weight += inc;
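
The capacity_margin constant added above is a fixed-point factor: the comparison in the comment, util * 1024 < capacity * margin, is an integer-only way of testing util < capacity * (1280/1024) = capacity * 1.25, and 1/1.25 = 0.8 is where the "~20%" annotation comes from. A tiny illustration of the arithmetic, not the kernel's use site (which is outside this hunk):

#include <stdbool.h>
#include <stdio.h>

#define SCALE_SKETCH 1024UL				/* capacity-scale unit */

static const unsigned long capacity_margin_sketch = 1280;	/* 1.25 in fixed point */

bool within_margin_sketch(unsigned long util, unsigned long capacity)
{
	/* util * 1024 < capacity * 1280  <=>  util < 1.25 * capacity */
	return util * SCALE_SKETCH < capacity * capacity_margin_sketch;
}

int main(void)
{
	printf("%d %d\n",
	       within_margin_sketch(800, 1024),	/* 819200 < 1310720 -> true */
	       within_margin_sketch(1300, 1024));	/* 1331200 >= 1310720 -> false */
	return 0;
}
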
@@ -256,9 +262,7 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
256 262
257static inline struct task_struct *task_of(struct sched_entity *se) 263static inline struct task_struct *task_of(struct sched_entity *se)
258{ 264{
259#ifdef CONFIG_SCHED_DEBUG 265 SCHED_WARN_ON(!entity_is_task(se));
260 WARN_ON_ONCE(!entity_is_task(se));
261#endif
262 return container_of(se, struct task_struct, se); 266 return container_of(se, struct task_struct, se);
263} 267}
264 268
@@ -456,17 +460,23 @@ static inline int entity_before(struct sched_entity *a,
456 460
457static void update_min_vruntime(struct cfs_rq *cfs_rq) 461static void update_min_vruntime(struct cfs_rq *cfs_rq)
458{ 462{
463 struct sched_entity *curr = cfs_rq->curr;
464
459 u64 vruntime = cfs_rq->min_vruntime; 465 u64 vruntime = cfs_rq->min_vruntime;
460 466
461 if (cfs_rq->curr) 467 if (curr) {
462 vruntime = cfs_rq->curr->vruntime; 468 if (curr->on_rq)
469 vruntime = curr->vruntime;
470 else
471 curr = NULL;
472 }
463 473
464 if (cfs_rq->rb_leftmost) { 474 if (cfs_rq->rb_leftmost) {
465 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, 475 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
466 struct sched_entity, 476 struct sched_entity,
467 run_node); 477 run_node);
468 478
469 if (!cfs_rq->curr) 479 if (!curr)
470 vruntime = se->vruntime; 480 vruntime = se->vruntime;
471 else 481 else
472 vruntime = min_vruntime(vruntime, se->vruntime); 482 vruntime = min_vruntime(vruntime, se->vruntime);
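
The reworked update_min_vruntime() above only lets the running entity contribute while it is still on the runqueue. Condensed into a standalone helper (userspace sketch; the tail of the kernel function, which keeps min_vruntime monotonic, lies outside this hunk and is omitted):

#include <stdint.h>
#include <stddef.h>

struct se_sketch {
	uint64_t vruntime;
	int on_rq;
};

uint64_t pick_min_vruntime(uint64_t min_vruntime,
			   const struct se_sketch *curr,
			   const struct se_sketch *leftmost)
{
	uint64_t vruntime = min_vruntime;

	/* A current entity that was just dequeued no longer counts. */
	if (curr && !curr->on_rq)
		curr = NULL;

	if (curr)
		vruntime = curr->vruntime;

	if (leftmost) {
		if (!curr)
			vruntime = leftmost->vruntime;
		else if ((int64_t)(leftmost->vruntime - vruntime) < 0)
			vruntime = leftmost->vruntime;	/* wrap-safe min */
	}

	return vruntime;
}
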
@@ -656,7 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
656} 666}
657 667
658#ifdef CONFIG_SMP 668#ifdef CONFIG_SMP
659static int select_idle_sibling(struct task_struct *p, int cpu); 669static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
660static unsigned long task_h_load(struct task_struct *p); 670static unsigned long task_h_load(struct task_struct *p);
661 671
662/* 672/*
@@ -680,7 +690,14 @@ void init_entity_runnable_average(struct sched_entity *se)
680 * will definitely be update (after enqueue). 690 * will definitely be update (after enqueue).
681 */ 691 */
682 sa->period_contrib = 1023; 692 sa->period_contrib = 1023;
683 sa->load_avg = scale_load_down(se->load.weight); 693 /*
694 * Tasks are initialized with full load to be seen as heavy tasks until
695 * they get a chance to stabilize to their real load level.
696 * Group entities are initialized with zero load to reflect the fact that
697 * nothing has been attached to the task group yet.
698 */
699 if (entity_is_task(se))
700 sa->load_avg = scale_load_down(se->load.weight);
684 sa->load_sum = sa->load_avg * LOAD_AVG_MAX; 701 sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
685 /* 702 /*
686 * At this point, util_avg won't be used in select_task_rq_fair anyway 703 * At this point, util_avg won't be used in select_task_rq_fair anyway
@@ -726,7 +743,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
726 struct sched_avg *sa = &se->avg; 743 struct sched_avg *sa = &se->avg;
727 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; 744 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
728 u64 now = cfs_rq_clock_task(cfs_rq); 745 u64 now = cfs_rq_clock_task(cfs_rq);
729 int tg_update;
730 746
731 if (cap > 0) { 747 if (cap > 0) {
732 if (cfs_rq->avg.util_avg != 0) { 748 if (cfs_rq->avg.util_avg != 0) {
@@ -759,10 +775,9 @@ void post_init_entity_util_avg(struct sched_entity *se)
759 } 775 }
760 } 776 }
761 777
762 tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); 778 update_cfs_rq_load_avg(now, cfs_rq, false);
763 attach_entity_load_avg(cfs_rq, se); 779 attach_entity_load_avg(cfs_rq, se);
764 if (tg_update) 780 update_tg_load_avg(cfs_rq, false);
765 update_tg_load_avg(cfs_rq, false);
766} 781}
767 782
768#else /* !CONFIG_SMP */ 783#else /* !CONFIG_SMP */
@@ -799,7 +814,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
799 max(delta_exec, curr->statistics.exec_max)); 814 max(delta_exec, curr->statistics.exec_max));
800 815
801 curr->sum_exec_runtime += delta_exec; 816 curr->sum_exec_runtime += delta_exec;
802 schedstat_add(cfs_rq, exec_clock, delta_exec); 817 schedstat_add(cfs_rq->exec_clock, delta_exec);
803 818
804 curr->vruntime += calc_delta_fair(delta_exec, curr); 819 curr->vruntime += calc_delta_fair(delta_exec, curr);
805 update_min_vruntime(cfs_rq); 820 update_min_vruntime(cfs_rq);
@@ -820,26 +835,34 @@ static void update_curr_fair(struct rq *rq)
820 update_curr(cfs_rq_of(&rq->curr->se)); 835 update_curr(cfs_rq_of(&rq->curr->se));
821} 836}
822 837
823#ifdef CONFIG_SCHEDSTATS
824static inline void 838static inline void
825update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 839update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
826{ 840{
827 u64 wait_start = rq_clock(rq_of(cfs_rq)); 841 u64 wait_start, prev_wait_start;
842
843 if (!schedstat_enabled())
844 return;
845
846 wait_start = rq_clock(rq_of(cfs_rq));
847 prev_wait_start = schedstat_val(se->statistics.wait_start);
828 848
829 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && 849 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
830 likely(wait_start > se->statistics.wait_start)) 850 likely(wait_start > prev_wait_start))
831 wait_start -= se->statistics.wait_start; 851 wait_start -= prev_wait_start;
832 852
833 se->statistics.wait_start = wait_start; 853 schedstat_set(se->statistics.wait_start, wait_start);
834} 854}
835 855
836static void 856static inline void
837update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 857update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
838{ 858{
839 struct task_struct *p; 859 struct task_struct *p;
840 u64 delta; 860 u64 delta;
841 861
842 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; 862 if (!schedstat_enabled())
863 return;
864
865 delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
843 866
844 if (entity_is_task(se)) { 867 if (entity_is_task(se)) {
845 p = task_of(se); 868 p = task_of(se);
@@ -849,35 +872,114 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
849 * time stamp can be adjusted to accumulate wait time 872 * time stamp can be adjusted to accumulate wait time
850 * prior to migration. 873 * prior to migration.
851 */ 874 */
852 se->statistics.wait_start = delta; 875 schedstat_set(se->statistics.wait_start, delta);
853 return; 876 return;
854 } 877 }
855 trace_sched_stat_wait(p, delta); 878 trace_sched_stat_wait(p, delta);
856 } 879 }
857 880
858 se->statistics.wait_max = max(se->statistics.wait_max, delta); 881 schedstat_set(se->statistics.wait_max,
859 se->statistics.wait_count++; 882 max(schedstat_val(se->statistics.wait_max), delta));
860 se->statistics.wait_sum += delta; 883 schedstat_inc(se->statistics.wait_count);
861 se->statistics.wait_start = 0; 884 schedstat_add(se->statistics.wait_sum, delta);
885 schedstat_set(se->statistics.wait_start, 0);
886}
887
888static inline void
889update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
890{
891 struct task_struct *tsk = NULL;
892 u64 sleep_start, block_start;
893
894 if (!schedstat_enabled())
895 return;
896
897 sleep_start = schedstat_val(se->statistics.sleep_start);
898 block_start = schedstat_val(se->statistics.block_start);
899
900 if (entity_is_task(se))
901 tsk = task_of(se);
902
903 if (sleep_start) {
904 u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
905
906 if ((s64)delta < 0)
907 delta = 0;
908
909 if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
910 schedstat_set(se->statistics.sleep_max, delta);
911
912 schedstat_set(se->statistics.sleep_start, 0);
913 schedstat_add(se->statistics.sum_sleep_runtime, delta);
914
915 if (tsk) {
916 account_scheduler_latency(tsk, delta >> 10, 1);
917 trace_sched_stat_sleep(tsk, delta);
918 }
919 }
920 if (block_start) {
921 u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
922
923 if ((s64)delta < 0)
924 delta = 0;
925
926 if (unlikely(delta > schedstat_val(se->statistics.block_max)))
927 schedstat_set(se->statistics.block_max, delta);
928
929 schedstat_set(se->statistics.block_start, 0);
930 schedstat_add(se->statistics.sum_sleep_runtime, delta);
931
932 if (tsk) {
933 if (tsk->in_iowait) {
934 schedstat_add(se->statistics.iowait_sum, delta);
935 schedstat_inc(se->statistics.iowait_count);
936 trace_sched_stat_iowait(tsk, delta);
937 }
938
939 trace_sched_stat_blocked(tsk, delta);
940
941 /*
942 * Blocking time is in units of nanosecs, so shift by
943 * 20 to get a milliseconds-range estimation of the
944 * amount of time that the task spent sleeping:
945 */
946 if (unlikely(prof_on == SLEEP_PROFILING)) {
947 profile_hits(SLEEP_PROFILING,
948 (void *)get_wchan(tsk),
949 delta >> 20);
950 }
951 account_scheduler_latency(tsk, delta >> 10, 0);
952 }
953 }
862} 954}
863 955
864/* 956/*
865 * Task is being enqueued - update stats: 957 * Task is being enqueued - update stats:
866 */ 958 */
867static inline void 959static inline void
868update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 960update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
869{ 961{
962 if (!schedstat_enabled())
963 return;
964
870 /* 965 /*
871 * Are we enqueueing a waiting task? (for current tasks 966 * Are we enqueueing a waiting task? (for current tasks
872 * a dequeue/enqueue event is a NOP) 967 * a dequeue/enqueue event is a NOP)
873 */ 968 */
874 if (se != cfs_rq->curr) 969 if (se != cfs_rq->curr)
875 update_stats_wait_start(cfs_rq, se); 970 update_stats_wait_start(cfs_rq, se);
971
972 if (flags & ENQUEUE_WAKEUP)
973 update_stats_enqueue_sleeper(cfs_rq, se);
876} 974}
877 975
878static inline void 976static inline void
879update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 977update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
880{ 978{
979
980 if (!schedstat_enabled())
981 return;
982
881 /* 983 /*
882 * Mark the end of the wait period if dequeueing a 984 * Mark the end of the wait period if dequeueing a
883 * waiting task: 985 * waiting task:
@@ -885,40 +987,18 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
885 if (se != cfs_rq->curr) 987 if (se != cfs_rq->curr)
886 update_stats_wait_end(cfs_rq, se); 988 update_stats_wait_end(cfs_rq, se);
887 989
888 if (flags & DEQUEUE_SLEEP) { 990 if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
889 if (entity_is_task(se)) { 991 struct task_struct *tsk = task_of(se);
890 struct task_struct *tsk = task_of(se);
891 992
892 if (tsk->state & TASK_INTERRUPTIBLE) 993 if (tsk->state & TASK_INTERRUPTIBLE)
893 se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); 994 schedstat_set(se->statistics.sleep_start,
894 if (tsk->state & TASK_UNINTERRUPTIBLE) 995 rq_clock(rq_of(cfs_rq)));
895 se->statistics.block_start = rq_clock(rq_of(cfs_rq)); 996 if (tsk->state & TASK_UNINTERRUPTIBLE)
896 } 997 schedstat_set(se->statistics.block_start,
998 rq_clock(rq_of(cfs_rq)));
897 } 999 }
898
899}
900#else
901static inline void
902update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
903{
904} 1000}
905 1001
906static inline void
907update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
908{
909}
910
911static inline void
912update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
913{
914}
915
916static inline void
917update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
918{
919}
920#endif
921
922/* 1002/*
923 * We are picking a new current task - update its stats: 1003 * We are picking a new current task - update its stats:
924 */ 1004 */
@@ -1513,8 +1593,16 @@ balance:
1513 * One idle CPU per node is evaluated for a task numa move. 1593 * One idle CPU per node is evaluated for a task numa move.
1514 * Call select_idle_sibling to maybe find a better one. 1594 * Call select_idle_sibling to maybe find a better one.
1515 */ 1595 */
1516 if (!cur) 1596 if (!cur) {
1517 env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); 1597 /*
1598 * select_idle_sibling() uses a per-CPU cpumask that can
1599 * also be used from IRQ context, so call it with IRQs disabled.
1600 */
1601 local_irq_disable();
1602 env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1603 env->dst_cpu);
1604 local_irq_enable();
1605 }
1518 1606
1519assign: 1607assign:
1520 task_numa_assign(env, cur, imp); 1608 task_numa_assign(env, cur, imp);
@@ -2292,7 +2380,7 @@ void task_numa_work(struct callback_head *work)
2292 unsigned long nr_pte_updates = 0; 2380 unsigned long nr_pte_updates = 0;
2293 long pages, virtpages; 2381 long pages, virtpages;
2294 2382
2295 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); 2383 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
2296 2384
2297 work->next = work; /* protect against double add */ 2385 work->next = work; /* protect against double add */
2298 /* 2386 /*
@@ -2803,9 +2891,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2803} 2891}
2804 2892
2805#ifdef CONFIG_FAIR_GROUP_SCHED 2893#ifdef CONFIG_FAIR_GROUP_SCHED
2806/* 2894/**
2807 * Updating tg's load_avg is necessary before update_cfs_share (which is done) 2895 * update_tg_load_avg - update the tg's load avg
2808 * and effective_load (which is not done because it is too costly). 2896 * @cfs_rq: the cfs_rq whose avg changed
2897 * @force: update regardless of how small the difference
2898 *
2899 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
2900 * However, because tg->load_avg is a global value there are performance
2901 * considerations.
2902 *
2903 * In order to avoid having to look at the other cfs_rq's, we use a
2904 * differential update where we store the last value we propagated. This in
2905 * turn allows skipping updates if the differential is 'small'.
2906 *
2907 * Updating tg's load_avg is necessary before update_cfs_share() (which is
2908 * done) and effective_load() (which is not done because it is too costly).
2809 */ 2909 */
2810static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) 2910static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
2811{ 2911{
@@ -2875,12 +2975,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
2875 2975
2876static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) 2976static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
2877{ 2977{
2878 struct rq *rq = rq_of(cfs_rq); 2978 if (&this_rq()->cfs == cfs_rq) {
2879 int cpu = cpu_of(rq);
2880
2881 if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
2882 unsigned long max = rq->cpu_capacity_orig;
2883
2884 /* 2979 /*
2885 * There are a few boundary cases this might miss but it should 2980 * There are a few boundary cases this might miss but it should
2886 * get called often enough that that should (hopefully) not be 2981 * get called often enough that that should (hopefully) not be
@@ -2897,8 +2992,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
2897 * 2992 *
2898 * See cpu_util(). 2993 * See cpu_util().
2899 */ 2994 */
2900 cpufreq_update_util(rq_clock(rq), 2995 cpufreq_update_util(rq_of(cfs_rq), 0);
2901 min(cfs_rq->avg.util_avg, max), max);
2902 } 2996 }
2903} 2997}
2904 2998
@@ -2931,10 +3025,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
2931 * 3025 *
2932 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. 3026 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
2933 * 3027 *
2934 * Returns true if the load decayed or we removed utilization. It is expected 3028 * Returns true if the load decayed or we removed load.
2935 * that one calls update_tg_load_avg() on this condition, but after you've 3029 *
2936 * modified the cfs_rq avg (attach/detach), such that we propagate the new 3030 * Since both these conditions indicate a changed cfs_rq->avg.load we should
2937 * avg up. 3031 * call update_tg_load_avg() when this function returns true.
2938 */ 3032 */
2939static inline int 3033static inline int
2940update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) 3034update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
@@ -3159,10 +3253,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
3159 3253
3160static inline void update_load_avg(struct sched_entity *se, int not_used) 3254static inline void update_load_avg(struct sched_entity *se, int not_used)
3161{ 3255{
3162 struct cfs_rq *cfs_rq = cfs_rq_of(se); 3256 cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
3163 struct rq *rq = rq_of(cfs_rq);
3164
3165 cpufreq_trigger_update(rq_clock(rq));
3166} 3257}
3167 3258
3168static inline void 3259static inline void
@@ -3183,68 +3274,6 @@ static inline int idle_balance(struct rq *rq)
3183 3274
3184#endif /* CONFIG_SMP */ 3275#endif /* CONFIG_SMP */
3185 3276
3186static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
3187{
3188#ifdef CONFIG_SCHEDSTATS
3189 struct task_struct *tsk = NULL;
3190
3191 if (entity_is_task(se))
3192 tsk = task_of(se);
3193
3194 if (se->statistics.sleep_start) {
3195 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
3196
3197 if ((s64)delta < 0)
3198 delta = 0;
3199
3200 if (unlikely(delta > se->statistics.sleep_max))
3201 se->statistics.sleep_max = delta;
3202
3203 se->statistics.sleep_start = 0;
3204 se->statistics.sum_sleep_runtime += delta;
3205
3206 if (tsk) {
3207 account_scheduler_latency(tsk, delta >> 10, 1);
3208 trace_sched_stat_sleep(tsk, delta);
3209 }
3210 }
3211 if (se->statistics.block_start) {
3212 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
3213
3214 if ((s64)delta < 0)
3215 delta = 0;
3216
3217 if (unlikely(delta > se->statistics.block_max))
3218 se->statistics.block_max = delta;
3219
3220 se->statistics.block_start = 0;
3221 se->statistics.sum_sleep_runtime += delta;
3222
3223 if (tsk) {
3224 if (tsk->in_iowait) {
3225 se->statistics.iowait_sum += delta;
3226 se->statistics.iowait_count++;
3227 trace_sched_stat_iowait(tsk, delta);
3228 }
3229
3230 trace_sched_stat_blocked(tsk, delta);
3231
3232 /*
3233 * Blocking time is in units of nanosecs, so shift by
3234 * 20 to get a milliseconds-range estimation of the
3235 * amount of time that the task spent sleeping:
3236 */
3237 if (unlikely(prof_on == SLEEP_PROFILING)) {
3238 profile_hits(SLEEP_PROFILING,
3239 (void *)get_wchan(tsk),
3240 delta >> 20);
3241 }
3242 account_scheduler_latency(tsk, delta >> 10, 0);
3243 }
3244 }
3245#endif
3246}
3247
3248static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) 3277static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3249{ 3278{
3250#ifdef CONFIG_SCHED_DEBUG 3279#ifdef CONFIG_SCHED_DEBUG
@@ -3254,7 +3283,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3254 d = -d; 3283 d = -d;
3255 3284
3256 if (d > 3*sysctl_sched_latency) 3285 if (d > 3*sysctl_sched_latency)
3257 schedstat_inc(cfs_rq, nr_spread_over); 3286 schedstat_inc(cfs_rq->nr_spread_over);
3258#endif 3287#endif
3259} 3288}
3260 3289
@@ -3371,17 +3400,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3371 account_entity_enqueue(cfs_rq, se); 3400 account_entity_enqueue(cfs_rq, se);
3372 update_cfs_shares(cfs_rq); 3401 update_cfs_shares(cfs_rq);
3373 3402
3374 if (flags & ENQUEUE_WAKEUP) { 3403 if (flags & ENQUEUE_WAKEUP)
3375 place_entity(cfs_rq, se, 0); 3404 place_entity(cfs_rq, se, 0);
3376 if (schedstat_enabled())
3377 enqueue_sleeper(cfs_rq, se);
3378 }
3379 3405
3380 check_schedstat_required(); 3406 check_schedstat_required();
3381 if (schedstat_enabled()) { 3407 update_stats_enqueue(cfs_rq, se, flags);
3382 update_stats_enqueue(cfs_rq, se); 3408 check_spread(cfs_rq, se);
3383 check_spread(cfs_rq, se);
3384 }
3385 if (!curr) 3409 if (!curr)
3386 __enqueue_entity(cfs_rq, se); 3410 __enqueue_entity(cfs_rq, se);
3387 se->on_rq = 1; 3411 se->on_rq = 1;
@@ -3448,8 +3472,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3448 update_curr(cfs_rq); 3472 update_curr(cfs_rq);
3449 dequeue_entity_load_avg(cfs_rq, se); 3473 dequeue_entity_load_avg(cfs_rq, se);
3450 3474
3451 if (schedstat_enabled()) 3475 update_stats_dequeue(cfs_rq, se, flags);
3452 update_stats_dequeue(cfs_rq, se, flags);
3453 3476
3454 clear_buddies(cfs_rq, se); 3477 clear_buddies(cfs_rq, se);
3455 3478
@@ -3459,9 +3482,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3459 account_entity_dequeue(cfs_rq, se); 3482 account_entity_dequeue(cfs_rq, se);
3460 3483
3461 /* 3484 /*
3462 * Normalize the entity after updating the min_vruntime because the 3485 * Normalize after update_curr(); which will also have moved
3463 * update can refer to the ->curr item and we need to reflect this 3486 * min_vruntime if @se is the one holding it back. But before doing
3464 * movement in our normalized position. 3487 * update_min_vruntime() again, which will discount @se's position and
3488 * can move min_vruntime forward still more.
3465 */ 3489 */
3466 if (!(flags & DEQUEUE_SLEEP)) 3490 if (!(flags & DEQUEUE_SLEEP))
3467 se->vruntime -= cfs_rq->min_vruntime; 3491 se->vruntime -= cfs_rq->min_vruntime;
@@ -3469,8 +3493,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3469 /* return excess runtime on last dequeue */ 3493 /* return excess runtime on last dequeue */
3470 return_cfs_rq_runtime(cfs_rq); 3494 return_cfs_rq_runtime(cfs_rq);
3471 3495
3472 update_min_vruntime(cfs_rq);
3473 update_cfs_shares(cfs_rq); 3496 update_cfs_shares(cfs_rq);
3497
3498 /*
3499 * Now advance min_vruntime if @se was the entity holding it back,
3500 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
3501 * put back on, and if we advance min_vruntime, we'll be placed back
3502 * further than we started -- ie. we'll be penalized.
3503 */
3504 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
3505 update_min_vruntime(cfs_rq);
3474} 3506}
3475 3507
3476/* 3508/*
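
The new condition at the end of dequeue_entity() is a plain bit-mask idiom: masking flags with (DEQUEUE_SAVE | DEQUEUE_MOVE) and comparing the result to DEQUEUE_SAVE is true only when SAVE is set and MOVE is clear, i.e. the entity is parked temporarily and will be put straight back. Spelled out with made-up flag values:

#include <stdbool.h>

#define DEQUEUE_SAVE_SKETCH	0x02	/* stand-in values, not the kernel's */
#define DEQUEUE_MOVE_SKETCH	0x04

bool save_without_move(unsigned int flags)
{
	return (flags & (DEQUEUE_SAVE_SKETCH | DEQUEUE_MOVE_SKETCH)) ==
	       DEQUEUE_SAVE_SKETCH;
}
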
@@ -3523,25 +3555,25 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3523 * a CPU. So account for the time it spent waiting on the 3555 * a CPU. So account for the time it spent waiting on the
3524 * runqueue. 3556 * runqueue.
3525 */ 3557 */
3526 if (schedstat_enabled()) 3558 update_stats_wait_end(cfs_rq, se);
3527 update_stats_wait_end(cfs_rq, se);
3528 __dequeue_entity(cfs_rq, se); 3559 __dequeue_entity(cfs_rq, se);
3529 update_load_avg(se, 1); 3560 update_load_avg(se, 1);
3530 } 3561 }
3531 3562
3532 update_stats_curr_start(cfs_rq, se); 3563 update_stats_curr_start(cfs_rq, se);
3533 cfs_rq->curr = se; 3564 cfs_rq->curr = se;
3534#ifdef CONFIG_SCHEDSTATS 3565
3535 /* 3566 /*
3536 * Track our maximum slice length, if the CPU's load is at 3567 * Track our maximum slice length, if the CPU's load is at
3537 * least twice that of our own weight (i.e. dont track it 3568 * least twice that of our own weight (i.e. dont track it
3538 * when there are only lesser-weight tasks around): 3569 * when there are only lesser-weight tasks around):
3539 */ 3570 */
3540 if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { 3571 if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
3541 se->statistics.slice_max = max(se->statistics.slice_max, 3572 schedstat_set(se->statistics.slice_max,
3542 se->sum_exec_runtime - se->prev_sum_exec_runtime); 3573 max((u64)schedstat_val(se->statistics.slice_max),
3574 se->sum_exec_runtime - se->prev_sum_exec_runtime));
3543 } 3575 }
3544#endif 3576
3545 se->prev_sum_exec_runtime = se->sum_exec_runtime; 3577 se->prev_sum_exec_runtime = se->sum_exec_runtime;
3546} 3578}
3547 3579
@@ -3620,13 +3652,10 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
3620 /* throttle cfs_rqs exceeding runtime */ 3652 /* throttle cfs_rqs exceeding runtime */
3621 check_cfs_rq_runtime(cfs_rq); 3653 check_cfs_rq_runtime(cfs_rq);
3622 3654
3623 if (schedstat_enabled()) { 3655 check_spread(cfs_rq, prev);
3624 check_spread(cfs_rq, prev);
3625 if (prev->on_rq)
3626 update_stats_wait_start(cfs_rq, prev);
3627 }
3628 3656
3629 if (prev->on_rq) { 3657 if (prev->on_rq) {
3658 update_stats_wait_start(cfs_rq, prev);
3630 /* Put 'current' back into the tree. */ 3659 /* Put 'current' back into the tree. */
3631 __enqueue_entity(cfs_rq, prev); 3660 __enqueue_entity(cfs_rq, prev);
3632 /* in !on_rq case, update occurred at dequeue */ 3661 /* in !on_rq case, update occurred at dequeue */
@@ -4456,9 +4485,9 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
4456 struct sched_entity *se = &p->se; 4485 struct sched_entity *se = &p->se;
4457 struct cfs_rq *cfs_rq = cfs_rq_of(se); 4486 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4458 4487
4459 WARN_ON(task_rq(p) != rq); 4488 SCHED_WARN_ON(task_rq(p) != rq);
4460 4489
4461 if (cfs_rq->nr_running > 1) { 4490 if (rq->cfs.h_nr_running > 1) {
4462 u64 slice = sched_slice(cfs_rq, se); 4491 u64 slice = sched_slice(cfs_rq, se);
4463 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; 4492 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
4464 s64 delta = slice - ran; 4493 s64 delta = slice - ran;
@@ -4509,6 +4538,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4509 struct cfs_rq *cfs_rq; 4538 struct cfs_rq *cfs_rq;
4510 struct sched_entity *se = &p->se; 4539 struct sched_entity *se = &p->se;
4511 4540
4541 /*
4542 * If in_iowait is set, the code below may not trigger any cpufreq
4543 * utilization updates, so do it here explicitly with the IOWAIT flag
4544 * passed.
4545 */
4546 if (p->in_iowait)
4547 cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
4548
4512 for_each_sched_entity(se) { 4549 for_each_sched_entity(se) {
4513 if (se->on_rq) 4550 if (se->on_rq)
4514 break; 4551 break;
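The in_iowait hunk above gives the cpufreq hook an explicit SCHED_CPUFREQ_IOWAIT hint when an I/O-blocked task is enqueued, because such a task's decayed utilization alone would under-request frequency. A stand-alone C sketch of that idea follows; cpufreq_update(), enqueue_task() and the flag value are illustrative stand-ins, not kernel API.

#include <stdbool.h>
#include <stdio.h>

#define SCHED_CPUFREQ_IOWAIT (1U << 0)	/* illustrative value, not the kernel's */

static void cpufreq_update(unsigned int flags)
{
	if (flags & SCHED_CPUFREQ_IOWAIT)
		printf("governor: task woke from I/O wait, jump to a high frequency\n");
	else
		printf("governor: recompute frequency from current utilization\n");
}

static void enqueue_task(bool in_iowait)
{
	/*
	 * A task that slept in I/O wait had its utilization decay while it
	 * was blocked, so a plain update would pick too low a frequency;
	 * send the IOWAIT hint first so the governor can boost.
	 */
	if (in_iowait)
		cpufreq_update(SCHED_CPUFREQ_IOWAIT);

	/* ...normal enqueue work and utilization updates follow... */
	cpufreq_update(0);
}

int main(void)
{
	enqueue_task(true);
	enqueue_task(false);
	return 0;
}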
@@ -4605,6 +4642,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4605} 4642}
4606 4643
4607#ifdef CONFIG_SMP 4644#ifdef CONFIG_SMP
4645
4646/* Working cpumask for: load_balance, load_balance_newidle. */
4647DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
4648DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
4649
4608#ifdef CONFIG_NO_HZ_COMMON 4650#ifdef CONFIG_NO_HZ_COMMON
4609/* 4651/*
4610 * per rq 'load' arrray crap; XXX kill this. 4652 * per rq 'load' arrray crap; XXX kill this.
@@ -5006,9 +5048,9 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
5006 * wl = S * s'_i; see (2) 5048 * wl = S * s'_i; see (2)
5007 */ 5049 */
5008 if (W > 0 && w < W) 5050 if (W > 0 && w < W)
5009 wl = (w * (long)tg->shares) / W; 5051 wl = (w * (long)scale_load_down(tg->shares)) / W;
5010 else 5052 else
5011 wl = tg->shares; 5053 wl = scale_load_down(tg->shares);
5012 5054
5013 /* 5055 /*
5014 * Per the above, wl is the new se->load.weight value; since 5056 * Per the above, wl is the new se->load.weight value; since
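The scale_load_down() conversion above matters because, on 64-bit kernels, group shares carry extra fixed-point resolution; mixing them with unscaled weights without shifting back down would skew the result. A minimal sketch of that convention, where the 10-bit shift and the sample weights are assumptions chosen for illustration:

#include <stdio.h>

#define SCHED_RES_SHIFT 10	/* extra fixed-point resolution, assumed here */
#define scale_load(w)		((unsigned long)(w) << SCHED_RES_SHIFT)
#define scale_load_down(w)	((unsigned long)(w) >> SCHED_RES_SHIFT)

int main(void)
{
	unsigned long nice0  = 1024;			/* external NICE_0-style weight */
	unsigned long shares = scale_load(nice0);	/* how group shares are stored */
	unsigned long w = 512, W = 2048;		/* entity weight and group total */

	/* wl = (w * shares) / W, computed in the external (unscaled) unit: */
	unsigned long wl = (w * scale_load_down(shares)) / W;

	printf("wl = %lu\n", wl);	/* 512 * 1024 / 2048 = 256 */
	return 0;
}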
@@ -5091,18 +5133,18 @@ static int wake_wide(struct task_struct *p)
5091 return 1; 5133 return 1;
5092} 5134}
5093 5135
5094static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 5136static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5137 int prev_cpu, int sync)
5095{ 5138{
5096 s64 this_load, load; 5139 s64 this_load, load;
5097 s64 this_eff_load, prev_eff_load; 5140 s64 this_eff_load, prev_eff_load;
5098 int idx, this_cpu, prev_cpu; 5141 int idx, this_cpu;
5099 struct task_group *tg; 5142 struct task_group *tg;
5100 unsigned long weight; 5143 unsigned long weight;
5101 int balanced; 5144 int balanced;
5102 5145
5103 idx = sd->wake_idx; 5146 idx = sd->wake_idx;
5104 this_cpu = smp_processor_id(); 5147 this_cpu = smp_processor_id();
5105 prev_cpu = task_cpu(p);
5106 load = source_load(prev_cpu, idx); 5148 load = source_load(prev_cpu, idx);
5107 this_load = target_load(this_cpu, idx); 5149 this_load = target_load(this_cpu, idx);
5108 5150
@@ -5146,13 +5188,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
5146 5188
5147 balanced = this_eff_load <= prev_eff_load; 5189 balanced = this_eff_load <= prev_eff_load;
5148 5190
5149 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); 5191 schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
5150 5192
5151 if (!balanced) 5193 if (!balanced)
5152 return 0; 5194 return 0;
5153 5195
5154 schedstat_inc(sd, ttwu_move_affine); 5196 schedstat_inc(sd->ttwu_move_affine);
5155 schedstat_inc(p, se.statistics.nr_wakeups_affine); 5197 schedstat_inc(p->se.statistics.nr_wakeups_affine);
5156 5198
5157 return 1; 5199 return 1;
5158} 5200}
@@ -5228,6 +5270,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5228 int shallowest_idle_cpu = -1; 5270 int shallowest_idle_cpu = -1;
5229 int i; 5271 int i;
5230 5272
5273 /* Check if we have any choice: */
5274 if (group->group_weight == 1)
5275 return cpumask_first(sched_group_cpus(group));
5276
5231 /* Traverse only the allowed CPUs */ 5277 /* Traverse only the allowed CPUs */
5232 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { 5278 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
5233 if (idle_cpu(i)) { 5279 if (idle_cpu(i)) {
@@ -5265,64 +5311,242 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5265} 5311}
5266 5312
5267/* 5313/*
5268 * Try and locate an idle CPU in the sched_domain. 5314 * Implement a for_each_cpu() variant that starts the scan at a given cpu
5315 * (@start), and wraps around.
5316 *
5317 * This is used to scan for idle CPUs; such that not all CPUs looking for an
5318 * idle CPU find the same CPU. The down-side is that tasks tend to cycle
5319 * through the LLC domain.
5320 *
5321 * Especially tbench is found sensitive to this.
5322 */
5323
5324static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
5325{
5326 int next;
5327
5328again:
5329 next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
5330
5331 if (*wrapped) {
5332 if (next >= start)
5333 return nr_cpumask_bits;
5334 } else {
5335 if (next >= nr_cpumask_bits) {
5336 *wrapped = 1;
5337 n = -1;
5338 goto again;
5339 }
5340 }
5341
5342 return next;
5343}
5344
5345#define for_each_cpu_wrap(cpu, mask, start, wrap) \
5346 for ((wrap) = 0, (cpu) = (start)-1; \
5347 (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \
5348 (cpu) < nr_cpumask_bits; )
5349
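The cpumask_next_wrap()/for_each_cpu_wrap() pair defined above is easiest to see with a plain bit mask: start the scan at a chosen CPU, walk upward, wrap to bit 0, and stop before revisiting the starting point. In the sketch below a 64-bit word stands in for struct cpumask, and next_set_bit()/scan_wrap() are invented helper names.

#include <stdio.h>

#define NR_CPUS 64

static int next_set_bit(unsigned long long mask, int from)
{
	for (int i = from; i < NR_CPUS; i++)
		if (mask & (1ULL << i))
			return i;
	return NR_CPUS;
}

static void scan_wrap(unsigned long long mask, int start)
{
	int cpu, wrapped = 0;

	for (cpu = next_set_bit(mask, start); ; cpu = next_set_bit(mask, cpu + 1)) {
		if (cpu >= NR_CPUS) {
			if (wrapped)
				break;			/* fell off the end twice: done */
			wrapped = 1;
			cpu = next_set_bit(mask, 0);	/* wrap around to the bottom */
		}
		if (wrapped && cpu >= start)
			break;				/* back at the starting point */
		printf("visit cpu %d\n", cpu);
	}
}

int main(void)
{
	/* CPUs 1, 3, 5 and 60 are in the mask; start scanning at CPU 4. */
	scan_wrap((1ULL << 1) | (1ULL << 3) | (1ULL << 5) | (1ULL << 60), 4);
	return 0;
}

Starting different wakers at different CPUs is what keeps concurrent wakeups from all converging on the same idle CPU, at the cost (noted above) of tasks tending to cycle through the LLC.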
5350#ifdef CONFIG_SCHED_SMT
5351
5352static inline void set_idle_cores(int cpu, int val)
5353{
5354 struct sched_domain_shared *sds;
5355
5356 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5357 if (sds)
5358 WRITE_ONCE(sds->has_idle_cores, val);
5359}
5360
5361static inline bool test_idle_cores(int cpu, bool def)
5362{
5363 struct sched_domain_shared *sds;
5364
5365 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5366 if (sds)
5367 return READ_ONCE(sds->has_idle_cores);
5368
5369 return def;
5370}
5371
5372/*
5373 * Scans the local SMT mask to see if the entire core is idle, and records this
5374 * information in sd_llc_shared->has_idle_cores.
5375 *
5376 * Since SMT siblings share all cache levels, inspecting this limited remote
5377 * state should be fairly cheap.
5378 */
5379void __update_idle_core(struct rq *rq)
5380{
5381 int core = cpu_of(rq);
5382 int cpu;
5383
5384 rcu_read_lock();
5385 if (test_idle_cores(core, true))
5386 goto unlock;
5387
5388 for_each_cpu(cpu, cpu_smt_mask(core)) {
5389 if (cpu == core)
5390 continue;
5391
5392 if (!idle_cpu(cpu))
5393 goto unlock;
5394 }
5395
5396 set_idle_cores(core, 1);
5397unlock:
5398 rcu_read_unlock();
5399}
5400
5401/*
5402 * Scan the entire LLC domain for idle cores; this dynamically switches off if
5403 * there are no idle cores left in the system; tracked through
5404 * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
5405 */
5406static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
5407{
5408 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
5409 int core, cpu, wrap;
5410
5411 if (!static_branch_likely(&sched_smt_present))
5412 return -1;
5413
5414 if (!test_idle_cores(target, false))
5415 return -1;
5416
5417 cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
5418
5419 for_each_cpu_wrap(core, cpus, target, wrap) {
5420 bool idle = true;
5421
5422 for_each_cpu(cpu, cpu_smt_mask(core)) {
5423 cpumask_clear_cpu(cpu, cpus);
5424 if (!idle_cpu(cpu))
5425 idle = false;
5426 }
5427
5428 if (idle)
5429 return core;
5430 }
5431
5432 /*
5433 * Failed to find an idle core; stop looking for one.
5434 */
5435 set_idle_cores(target, 0);
5436
5437 return -1;
5438}
5439
5440/*
5441 * Scan the local SMT mask for idle CPUs.
5442 */
5443static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
5444{
5445 int cpu;
5446
5447 if (!static_branch_likely(&sched_smt_present))
5448 return -1;
5449
5450 for_each_cpu(cpu, cpu_smt_mask(target)) {
5451 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
5452 continue;
5453 if (idle_cpu(cpu))
5454 return cpu;
5455 }
5456
5457 return -1;
5458}
5459
5460#else /* CONFIG_SCHED_SMT */
5461
5462static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
5463{
5464 return -1;
5465}
5466
5467static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
5468{
5469 return -1;
5470}
5471
5472#endif /* CONFIG_SCHED_SMT */
5473
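The SMT block above hangs off one per-LLC hint, has_idle_cores: the expensive whole-core scan only runs while the hint is set, and a full scan that finds nothing clears it. The toy model below keeps the hint in a global flag and uses fixed arrays instead of cpumasks and domain pointers; the 2-way SMT layout and all names are assumptions, and the __update_idle_core() side that re-arms the hint when a core goes idle is left out.

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS   8
#define SMT_WIDTH 2	/* two hardware threads per core */

static bool cpu_is_idle[NR_CPUS] = { false, false, true, false, true, true, false, false };
static bool llc_has_idle_cores = true;	/* the sd_llc_shared->has_idle_cores analogue */

static int find_idle_core(void)
{
	if (!llc_has_idle_cores)
		return -1;		/* cheap exit: the last scan found nothing */

	for (int core = 0; core < NR_CPUS; core += SMT_WIDTH) {
		bool idle = true;

		for (int cpu = core; cpu < core + SMT_WIDTH; cpu++)
			if (!cpu_is_idle[cpu])
				idle = false;
		if (idle)
			return core;	/* every sibling of this core is idle */
	}

	llc_has_idle_cores = false;	/* remember the failure for later wakeups */
	return -1;
}

int main(void)
{
	printf("idle core: %d\n", find_idle_core());	/* CPUs 4+5 idle -> 4 */
	cpu_is_idle[5] = false;
	printf("idle core: %d\n", find_idle_core());	/* full scan fails -> -1 */
	printf("idle core: %d\n", find_idle_core());	/* -1 again, no scan this time */
	return 0;
}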
5474/*
5475 * Scan the LLC domain for idle CPUs; this is dynamically regulated by
5476 * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
5477 * average idle time for this rq (as found in rq->avg_idle).
5478 */
5479static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
5480{
5481 struct sched_domain *this_sd;
5482 u64 avg_cost, avg_idle = this_rq()->avg_idle;
5483 u64 time, cost;
5484 s64 delta;
5485 int cpu, wrap;
5486
5487 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
5488 if (!this_sd)
5489 return -1;
5490
5491 avg_cost = this_sd->avg_scan_cost;
5492
5493 /*
5494 * Due to large variance we need a large fuzz factor; hackbench in
5495 * particularly is sensitive here.
5496 */
5497 if ((avg_idle / 512) < avg_cost)
5498 return -1;
5499
5500 time = local_clock();
5501
5502 for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
5503 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
5504 continue;
5505 if (idle_cpu(cpu))
5506 break;
5507 }
5508
5509 time = local_clock() - time;
5510 cost = this_sd->avg_scan_cost;
5511 delta = (s64)(time - cost) / 8;
5512 this_sd->avg_scan_cost += delta;
5513
5514 return cpu;
5515}
5516
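select_idle_cpu() above throttles itself: the scan only runs when the CPU's average idle time comfortably exceeds a running estimate of the scan's own cost, and that estimate is updated as a 1/8-weight moving average. The arithmetic in isolation, where maybe_scan() is an invented wrapper and the /512 fuzz factor and /8 weight are taken from the hunk:

#include <stdio.h>

static long long avg_scan_cost;	/* ns, exponential average over previous scans */

static int maybe_scan(long long avg_idle_ns, long long this_scan_ns)
{
	/* Expected idle time nowhere near the scan cost: skip the scan. */
	if (avg_idle_ns / 512 < avg_scan_cost)
		return 0;

	/* ...the real code walks the LLC span here and times it... */

	/* avg += (sample - avg) / 8 */
	avg_scan_cost += (this_scan_ns - avg_scan_cost) / 8;
	return 1;
}

int main(void)
{
	long long samples[] = { 4000, 6000, 3000, 8000 };

	for (int i = 0; i < 4; i++) {
		int scanned = maybe_scan(3000000, samples[i]);

		printf("scanned=%d avg_scan_cost=%lld\n", scanned, avg_scan_cost);
	}
	return 0;
}

With a 3 ms average idle period every scan runs and the estimate closes one eighth of the gap to each sample; shrink avg_idle_ns to a few microseconds and maybe_scan() starts returning 0 as soon as the estimate exceeds avg_idle/512.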
5517/*
5518 * Try and locate an idle core/thread in the LLC cache domain.
5269 */ 5519 */
5270static int select_idle_sibling(struct task_struct *p, int target) 5520static int select_idle_sibling(struct task_struct *p, int prev, int target)
5271{ 5521{
5272 struct sched_domain *sd; 5522 struct sched_domain *sd;
5273 struct sched_group *sg; 5523 int i;
5274 int i = task_cpu(p);
5275 5524
5276 if (idle_cpu(target)) 5525 if (idle_cpu(target))
5277 return target; 5526 return target;
5278 5527
5279 /* 5528 /*
5280 * If the prevous cpu is cache affine and idle, don't be stupid. 5529 * If the previous cpu is cache affine and idle, don't be stupid.
5281 */ 5530 */
5282 if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) 5531 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
5283 return i; 5532 return prev;
5284 5533
5285 /*
5286 * Otherwise, iterate the domains and find an eligible idle cpu.
5287 *
5288 * A completely idle sched group at higher domains is more
5289 * desirable than an idle group at a lower level, because lower
5290 * domains have smaller groups and usually share hardware
5291 * resources which causes tasks to contend on them, e.g. x86
5292 * hyperthread siblings in the lowest domain (SMT) can contend
5293 * on the shared cpu pipeline.
5294 *
5295 * However, while we prefer idle groups at higher domains
5296 * finding an idle cpu at the lowest domain is still better than
5297 * returning 'target', which we've already established, isn't
5298 * idle.
5299 */
5300 sd = rcu_dereference(per_cpu(sd_llc, target)); 5534 sd = rcu_dereference(per_cpu(sd_llc, target));
5301 for_each_lower_domain(sd) { 5535 if (!sd)
5302 sg = sd->groups; 5536 return target;
5303 do { 5537
5304 if (!cpumask_intersects(sched_group_cpus(sg), 5538 i = select_idle_core(p, sd, target);
5305 tsk_cpus_allowed(p))) 5539 if ((unsigned)i < nr_cpumask_bits)
5306 goto next; 5540 return i;
5307 5541
5308 /* Ensure the entire group is idle */ 5542 i = select_idle_cpu(p, sd, target);
5309 for_each_cpu(i, sched_group_cpus(sg)) { 5543 if ((unsigned)i < nr_cpumask_bits)
5310 if (i == target || !idle_cpu(i)) 5544 return i;
5311 goto next; 5545
5312 } 5546 i = select_idle_smt(p, sd, target);
5547 if ((unsigned)i < nr_cpumask_bits)
5548 return i;
5313 5549
5314 /*
5315 * It doesn't matter which cpu we pick, the
5316 * whole group is idle.
5317 */
5318 target = cpumask_first_and(sched_group_cpus(sg),
5319 tsk_cpus_allowed(p));
5320 goto done;
5321next:
5322 sg = sg->next;
5323 } while (sg != sd->groups);
5324 }
5325done:
5326 return target; 5550 return target;
5327} 5551}
5328 5552
@@ -5360,6 +5584,32 @@ static int cpu_util(int cpu)
5360 return (util >= capacity) ? capacity : util; 5584 return (util >= capacity) ? capacity : util;
5361} 5585}
5362 5586
5587static inline int task_util(struct task_struct *p)
5588{
5589 return p->se.avg.util_avg;
5590}
5591
5592/*
5593 * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
5594 * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
5595 *
5596 * In that case WAKE_AFFINE doesn't make sense and we'll let
5597 * BALANCE_WAKE sort things out.
5598 */
5599static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
5600{
5601 long min_cap, max_cap;
5602
5603 min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
5604 max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
5605
5606 /* Minimum capacity is close to max, no need to abort wake_affine */
5607 if (max_cap - min_cap < max_cap >> 3)
5608 return 0;
5609
5610 return min_cap * 1024 < task_util(p) * capacity_margin;
5611}
5612
5363/* 5613/*
5364 * select_task_rq_fair: Select target runqueue for the waking task in domains 5614 * select_task_rq_fair: Select target runqueue for the waking task in domains
5365 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, 5615 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
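wake_cap() above is plain integer arithmetic: on an asymmetric system, abort the affine fast path whenever the smaller of the two candidate CPUs cannot hold the task's utilization with roughly 25% headroom. The sketch below compresses it into one function; using the larger of the two CPUs instead of the root domain's maximum capacity, the example capacities, and capacity_margin = 1280 against a 1024 scale are all simplifying assumptions.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024
#define CAPACITY_MARGIN		1280	/* ~1.25x headroom, assumed value */

static int wake_cap(long task_util, long cap_prev, long cap_this)
{
	long min_cap = cap_prev < cap_this ? cap_prev : cap_this;
	long max_cap = cap_prev > cap_this ? cap_prev : cap_this;

	/* Roughly symmetric capacities (within 12.5%): never abort wake_affine. */
	if (max_cap - min_cap < max_cap >> 3)
		return 0;

	/* Abort if util * 1.25 does not fit into the smaller CPU's capacity. */
	return min_cap * SCHED_CAPACITY_SCALE < task_util * CAPACITY_MARGIN;
}

int main(void)
{
	/* big.LITTLE-like pair: little CPU capacity 430, big CPU 1024. */
	printf("util 200: %s\n", wake_cap(200, 430, 1024) ? "abort affine" : "affine ok");
	printf("util 500: %s\n", wake_cap(500, 430, 1024) ? "abort affine" : "affine ok");
	return 0;
}

For the 430-vs-1024 pair a task with utilization 200 keeps the affine path (440320 >= 256000) while one at 500 does not (440320 < 640000), which is exactly when letting BALANCE_WAKE pick a bigger CPU pays off.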
@@ -5383,7 +5633,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
5383 5633
5384 if (sd_flag & SD_BALANCE_WAKE) { 5634 if (sd_flag & SD_BALANCE_WAKE) {
5385 record_wakee(p); 5635 record_wakee(p);
5386 want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); 5636 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
5637 && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
5387 } 5638 }
5388 5639
5389 rcu_read_lock(); 5640 rcu_read_lock();
@@ -5409,13 +5660,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
5409 5660
5410 if (affine_sd) { 5661 if (affine_sd) {
5411 sd = NULL; /* Prefer wake_affine over balance flags */ 5662 sd = NULL; /* Prefer wake_affine over balance flags */
5412 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) 5663 if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
5413 new_cpu = cpu; 5664 new_cpu = cpu;
5414 } 5665 }
5415 5666
5416 if (!sd) { 5667 if (!sd) {
5417 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ 5668 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
5418 new_cpu = select_idle_sibling(p, new_cpu); 5669 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
5419 5670
5420 } else while (sd) { 5671 } else while (sd) {
5421 struct sched_group *group; 5672 struct sched_group *group;
@@ -5939,7 +6190,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
5939 * 6190 *
5940 * The adjacency matrix of the resulting graph is given by: 6191 * The adjacency matrix of the resulting graph is given by:
5941 * 6192 *
5942 * log_2 n 6193 * log_2 n
5943 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) 6194 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
5944 * k = 0 6195 * k = 0
5945 * 6196 *
@@ -5985,7 +6236,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
5985 * 6236 *
5986 * [XXX write more on how we solve this.. _after_ merging pjt's patches that 6237 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
5987 * rewrite all of this once again.] 6238 * rewrite all of this once again.]
5988 */ 6239 */
5989 6240
5990static unsigned long __read_mostly max_load_balance_interval = HZ/10; 6241static unsigned long __read_mostly max_load_balance_interval = HZ/10;
5991 6242
@@ -6133,7 +6384,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
6133 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { 6384 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
6134 int cpu; 6385 int cpu;
6135 6386
6136 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 6387 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
6137 6388
6138 env->flags |= LBF_SOME_PINNED; 6389 env->flags |= LBF_SOME_PINNED;
6139 6390
@@ -6164,7 +6415,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
6164 env->flags &= ~LBF_ALL_PINNED; 6415 env->flags &= ~LBF_ALL_PINNED;
6165 6416
6166 if (task_running(env->src_rq, p)) { 6417 if (task_running(env->src_rq, p)) {
6167 schedstat_inc(p, se.statistics.nr_failed_migrations_running); 6418 schedstat_inc(p->se.statistics.nr_failed_migrations_running);
6168 return 0; 6419 return 0;
6169 } 6420 }
6170 6421
@@ -6181,13 +6432,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
6181 if (tsk_cache_hot <= 0 || 6432 if (tsk_cache_hot <= 0 ||
6182 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 6433 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
6183 if (tsk_cache_hot == 1) { 6434 if (tsk_cache_hot == 1) {
6184 schedstat_inc(env->sd, lb_hot_gained[env->idle]); 6435 schedstat_inc(env->sd->lb_hot_gained[env->idle]);
6185 schedstat_inc(p, se.statistics.nr_forced_migrations); 6436 schedstat_inc(p->se.statistics.nr_forced_migrations);
6186 } 6437 }
6187 return 1; 6438 return 1;
6188 } 6439 }
6189 6440
6190 schedstat_inc(p, se.statistics.nr_failed_migrations_hot); 6441 schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
6191 return 0; 6442 return 0;
6192} 6443}
6193 6444
@@ -6227,7 +6478,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
6227 * so we can safely collect stats here rather than 6478 * so we can safely collect stats here rather than
6228 * inside detach_tasks(). 6479 * inside detach_tasks().
6229 */ 6480 */
6230 schedstat_inc(env->sd, lb_gained[env->idle]); 6481 schedstat_inc(env->sd->lb_gained[env->idle]);
6231 return p; 6482 return p;
6232 } 6483 }
6233 return NULL; 6484 return NULL;
@@ -6319,7 +6570,7 @@ next:
6319 * so we can safely collect detach_one_task() stats here rather 6570 * so we can safely collect detach_one_task() stats here rather
6320 * than inside detach_one_task(). 6571 * than inside detach_one_task().
6321 */ 6572 */
6322 schedstat_add(env->sd, lb_gained[env->idle], detached); 6573 schedstat_add(env->sd->lb_gained[env->idle], detached);
6323 6574
6324 return detached; 6575 return detached;
6325} 6576}
@@ -6647,7 +6898,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
6647 /* 6898 /*
6648 * !SD_OVERLAP domains can assume that child groups 6899 * !SD_OVERLAP domains can assume that child groups
6649 * span the current group. 6900 * span the current group.
6650 */ 6901 */
6651 6902
6652 group = child->groups; 6903 group = child->groups;
6653 do { 6904 do {
@@ -7147,7 +7398,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
7147 load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; 7398 load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
7148 if (load_above_capacity > busiest->group_capacity) { 7399 if (load_above_capacity > busiest->group_capacity) {
7149 load_above_capacity -= busiest->group_capacity; 7400 load_above_capacity -= busiest->group_capacity;
7150 load_above_capacity *= NICE_0_LOAD; 7401 load_above_capacity *= scale_load_down(NICE_0_LOAD);
7151 load_above_capacity /= busiest->group_capacity; 7402 load_above_capacity /= busiest->group_capacity;
7152 } else 7403 } else
7153 load_above_capacity = ~0UL; 7404 load_above_capacity = ~0UL;
@@ -7354,9 +7605,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
7354 */ 7605 */
7355#define MAX_PINNED_INTERVAL 512 7606#define MAX_PINNED_INTERVAL 512
7356 7607
7357/* Working cpumask for load_balance and load_balance_newidle. */
7358DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
7359
7360static int need_active_balance(struct lb_env *env) 7608static int need_active_balance(struct lb_env *env)
7361{ 7609{
7362 struct sched_domain *sd = env->sd; 7610 struct sched_domain *sd = env->sd;
@@ -7460,7 +7708,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
7460 7708
7461 cpumask_copy(cpus, cpu_active_mask); 7709 cpumask_copy(cpus, cpu_active_mask);
7462 7710
7463 schedstat_inc(sd, lb_count[idle]); 7711 schedstat_inc(sd->lb_count[idle]);
7464 7712
7465redo: 7713redo:
7466 if (!should_we_balance(&env)) { 7714 if (!should_we_balance(&env)) {
@@ -7470,19 +7718,19 @@ redo:
7470 7718
7471 group = find_busiest_group(&env); 7719 group = find_busiest_group(&env);
7472 if (!group) { 7720 if (!group) {
7473 schedstat_inc(sd, lb_nobusyg[idle]); 7721 schedstat_inc(sd->lb_nobusyg[idle]);
7474 goto out_balanced; 7722 goto out_balanced;
7475 } 7723 }
7476 7724
7477 busiest = find_busiest_queue(&env, group); 7725 busiest = find_busiest_queue(&env, group);
7478 if (!busiest) { 7726 if (!busiest) {
7479 schedstat_inc(sd, lb_nobusyq[idle]); 7727 schedstat_inc(sd->lb_nobusyq[idle]);
7480 goto out_balanced; 7728 goto out_balanced;
7481 } 7729 }
7482 7730
7483 BUG_ON(busiest == env.dst_rq); 7731 BUG_ON(busiest == env.dst_rq);
7484 7732
7485 schedstat_add(sd, lb_imbalance[idle], env.imbalance); 7733 schedstat_add(sd->lb_imbalance[idle], env.imbalance);
7486 7734
7487 env.src_cpu = busiest->cpu; 7735 env.src_cpu = busiest->cpu;
7488 env.src_rq = busiest; 7736 env.src_rq = busiest;
@@ -7589,7 +7837,7 @@ more_balance:
7589 } 7837 }
7590 7838
7591 if (!ld_moved) { 7839 if (!ld_moved) {
7592 schedstat_inc(sd, lb_failed[idle]); 7840 schedstat_inc(sd->lb_failed[idle]);
7593 /* 7841 /*
7594 * Increment the failure counter only on periodic balance. 7842 * Increment the failure counter only on periodic balance.
7595 * We do not want newidle balance, which can be very 7843 * We do not want newidle balance, which can be very
@@ -7672,7 +7920,7 @@ out_all_pinned:
7672 * we can't migrate them. Let the imbalance flag set so parent level 7920 * we can't migrate them. Let the imbalance flag set so parent level
7673 * can try to migrate them. 7921 * can try to migrate them.
7674 */ 7922 */
7675 schedstat_inc(sd, lb_balanced[idle]); 7923 schedstat_inc(sd->lb_balanced[idle]);
7676 7924
7677 sd->nr_balance_failed = 0; 7925 sd->nr_balance_failed = 0;
7678 7926
@@ -7704,11 +7952,12 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
7704} 7952}
7705 7953
7706static inline void 7954static inline void
7707update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) 7955update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
7708{ 7956{
7709 unsigned long interval, next; 7957 unsigned long interval, next;
7710 7958
7711 interval = get_sd_balance_interval(sd, cpu_busy); 7959 /* used by idle balance, so cpu_busy = 0 */
7960 interval = get_sd_balance_interval(sd, 0);
7712 next = sd->last_balance + interval; 7961 next = sd->last_balance + interval;
7713 7962
7714 if (time_after(*next_balance, next)) 7963 if (time_after(*next_balance, next))
@@ -7738,7 +7987,7 @@ static int idle_balance(struct rq *this_rq)
7738 rcu_read_lock(); 7987 rcu_read_lock();
7739 sd = rcu_dereference_check_sched_domain(this_rq->sd); 7988 sd = rcu_dereference_check_sched_domain(this_rq->sd);
7740 if (sd) 7989 if (sd)
7741 update_next_balance(sd, 0, &next_balance); 7990 update_next_balance(sd, &next_balance);
7742 rcu_read_unlock(); 7991 rcu_read_unlock();
7743 7992
7744 goto out; 7993 goto out;
@@ -7756,7 +8005,7 @@ static int idle_balance(struct rq *this_rq)
7756 continue; 8005 continue;
7757 8006
7758 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { 8007 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
7759 update_next_balance(sd, 0, &next_balance); 8008 update_next_balance(sd, &next_balance);
7760 break; 8009 break;
7761 } 8010 }
7762 8011
@@ -7774,7 +8023,7 @@ static int idle_balance(struct rq *this_rq)
7774 curr_cost += domain_cost; 8023 curr_cost += domain_cost;
7775 } 8024 }
7776 8025
7777 update_next_balance(sd, 0, &next_balance); 8026 update_next_balance(sd, &next_balance);
7778 8027
7779 /* 8028 /*
7780 * Stop searching for tasks to pull if there are 8029 * Stop searching for tasks to pull if there are
@@ -7864,15 +8113,15 @@ static int active_load_balance_cpu_stop(void *data)
7864 .idle = CPU_IDLE, 8113 .idle = CPU_IDLE,
7865 }; 8114 };
7866 8115
7867 schedstat_inc(sd, alb_count); 8116 schedstat_inc(sd->alb_count);
7868 8117
7869 p = detach_one_task(&env); 8118 p = detach_one_task(&env);
7870 if (p) { 8119 if (p) {
7871 schedstat_inc(sd, alb_pushed); 8120 schedstat_inc(sd->alb_pushed);
7872 /* Active balancing done, reset the failure counter. */ 8121 /* Active balancing done, reset the failure counter. */
7873 sd->nr_balance_failed = 0; 8122 sd->nr_balance_failed = 0;
7874 } else { 8123 } else {
7875 schedstat_inc(sd, alb_failed); 8124 schedstat_inc(sd->alb_failed);
7876 } 8125 }
7877 } 8126 }
7878 rcu_read_unlock(); 8127 rcu_read_unlock();
@@ -7964,13 +8213,13 @@ static inline void set_cpu_sd_state_busy(void)
7964 int cpu = smp_processor_id(); 8213 int cpu = smp_processor_id();
7965 8214
7966 rcu_read_lock(); 8215 rcu_read_lock();
7967 sd = rcu_dereference(per_cpu(sd_busy, cpu)); 8216 sd = rcu_dereference(per_cpu(sd_llc, cpu));
7968 8217
7969 if (!sd || !sd->nohz_idle) 8218 if (!sd || !sd->nohz_idle)
7970 goto unlock; 8219 goto unlock;
7971 sd->nohz_idle = 0; 8220 sd->nohz_idle = 0;
7972 8221
7973 atomic_inc(&sd->groups->sgc->nr_busy_cpus); 8222 atomic_inc(&sd->shared->nr_busy_cpus);
7974unlock: 8223unlock:
7975 rcu_read_unlock(); 8224 rcu_read_unlock();
7976} 8225}
@@ -7981,13 +8230,13 @@ void set_cpu_sd_state_idle(void)
7981 int cpu = smp_processor_id(); 8230 int cpu = smp_processor_id();
7982 8231
7983 rcu_read_lock(); 8232 rcu_read_lock();
7984 sd = rcu_dereference(per_cpu(sd_busy, cpu)); 8233 sd = rcu_dereference(per_cpu(sd_llc, cpu));
7985 8234
7986 if (!sd || sd->nohz_idle) 8235 if (!sd || sd->nohz_idle)
7987 goto unlock; 8236 goto unlock;
7988 sd->nohz_idle = 1; 8237 sd->nohz_idle = 1;
7989 8238
7990 atomic_dec(&sd->groups->sgc->nr_busy_cpus); 8239 atomic_dec(&sd->shared->nr_busy_cpus);
7991unlock: 8240unlock:
7992 rcu_read_unlock(); 8241 rcu_read_unlock();
7993} 8242}
@@ -8214,8 +8463,8 @@ end:
8214static inline bool nohz_kick_needed(struct rq *rq) 8463static inline bool nohz_kick_needed(struct rq *rq)
8215{ 8464{
8216 unsigned long now = jiffies; 8465 unsigned long now = jiffies;
8466 struct sched_domain_shared *sds;
8217 struct sched_domain *sd; 8467 struct sched_domain *sd;
8218 struct sched_group_capacity *sgc;
8219 int nr_busy, cpu = rq->cpu; 8468 int nr_busy, cpu = rq->cpu;
8220 bool kick = false; 8469 bool kick = false;
8221 8470
@@ -8243,11 +8492,13 @@ static inline bool nohz_kick_needed(struct rq *rq)
8243 return true; 8492 return true;
8244 8493
8245 rcu_read_lock(); 8494 rcu_read_lock();
8246 sd = rcu_dereference(per_cpu(sd_busy, cpu)); 8495 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
8247 if (sd) { 8496 if (sds) {
8248 sgc = sd->groups->sgc; 8497 /*
8249 nr_busy = atomic_read(&sgc->nr_busy_cpus); 8498 * XXX: write a coherent comment on why we do this.
8250 8499 * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
8500 */
8501 nr_busy = atomic_read(&sds->nr_busy_cpus);
8251 if (nr_busy > 1) { 8502 if (nr_busy > 1) {
8252 kick = true; 8503 kick = true;
8253 goto unlock; 8504 goto unlock;
@@ -8283,7 +8534,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
8283 * run_rebalance_domains is triggered when needed from the scheduler tick. 8534 * run_rebalance_domains is triggered when needed from the scheduler tick.
8284 * Also triggered for nohz idle balancing (with nohz_balancing_kick set). 8535 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
8285 */ 8536 */
8286static void run_rebalance_domains(struct softirq_action *h) 8537static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
8287{ 8538{
8288 struct rq *this_rq = this_rq(); 8539 struct rq *this_rq = this_rq();
8289 enum cpu_idle_type idle = this_rq->idle_balance ? 8540 enum cpu_idle_type idle = this_rq->idle_balance ?
@@ -8441,7 +8692,6 @@ static void detach_task_cfs_rq(struct task_struct *p)
8441 struct sched_entity *se = &p->se; 8692 struct sched_entity *se = &p->se;
8442 struct cfs_rq *cfs_rq = cfs_rq_of(se); 8693 struct cfs_rq *cfs_rq = cfs_rq_of(se);
8443 u64 now = cfs_rq_clock_task(cfs_rq); 8694 u64 now = cfs_rq_clock_task(cfs_rq);
8444 int tg_update;
8445 8695
8446 if (!vruntime_normalized(p)) { 8696 if (!vruntime_normalized(p)) {
8447 /* 8697 /*
@@ -8453,10 +8703,9 @@ static void detach_task_cfs_rq(struct task_struct *p)
8453 } 8703 }
8454 8704
8455 /* Catch up with the cfs_rq and remove our load when we leave */ 8705 /* Catch up with the cfs_rq and remove our load when we leave */
8456 tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); 8706 update_cfs_rq_load_avg(now, cfs_rq, false);
8457 detach_entity_load_avg(cfs_rq, se); 8707 detach_entity_load_avg(cfs_rq, se);
8458 if (tg_update) 8708 update_tg_load_avg(cfs_rq, false);
8459 update_tg_load_avg(cfs_rq, false);
8460} 8709}
8461 8710
8462static void attach_task_cfs_rq(struct task_struct *p) 8711static void attach_task_cfs_rq(struct task_struct *p)
@@ -8464,7 +8713,6 @@ static void attach_task_cfs_rq(struct task_struct *p)
8464 struct sched_entity *se = &p->se; 8713 struct sched_entity *se = &p->se;
8465 struct cfs_rq *cfs_rq = cfs_rq_of(se); 8714 struct cfs_rq *cfs_rq = cfs_rq_of(se);
8466 u64 now = cfs_rq_clock_task(cfs_rq); 8715 u64 now = cfs_rq_clock_task(cfs_rq);
8467 int tg_update;
8468 8716
8469#ifdef CONFIG_FAIR_GROUP_SCHED 8717#ifdef CONFIG_FAIR_GROUP_SCHED
8470 /* 8718 /*
@@ -8475,10 +8723,9 @@ static void attach_task_cfs_rq(struct task_struct *p)
8475#endif 8723#endif
8476 8724
8477 /* Synchronize task with its cfs_rq */ 8725 /* Synchronize task with its cfs_rq */
8478 tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); 8726 update_cfs_rq_load_avg(now, cfs_rq, false);
8479 attach_entity_load_avg(cfs_rq, se); 8727 attach_entity_load_avg(cfs_rq, se);
8480 if (tg_update) 8728 update_tg_load_avg(cfs_rq, false);
8481 update_tg_load_avg(cfs_rq, false);
8482 8729
8483 if (!vruntime_normalized(p)) 8730 if (!vruntime_normalized(p))
8484 se->vruntime += cfs_rq->min_vruntime; 8731 se->vruntime += cfs_rq->min_vruntime;
@@ -8592,7 +8839,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8592{ 8839{
8593 struct sched_entity *se; 8840 struct sched_entity *se;
8594 struct cfs_rq *cfs_rq; 8841 struct cfs_rq *cfs_rq;
8595 struct rq *rq;
8596 int i; 8842 int i;
8597 8843
8598 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 8844 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8607,8 +8853,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8607 init_cfs_bandwidth(tg_cfs_bandwidth(tg)); 8853 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8608 8854
8609 for_each_possible_cpu(i) { 8855 for_each_possible_cpu(i) {
8610 rq = cpu_rq(i);
8611
8612 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8856 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8613 GFP_KERNEL, cpu_to_node(i)); 8857 GFP_KERNEL, cpu_to_node(i));
8614 if (!cfs_rq) 8858 if (!cfs_rq)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 9fb873cfc75c..1d8718d5300d 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -16,6 +16,9 @@
16 16
17#include "sched.h" 17#include "sched.h"
18 18
19/* Linker adds these: start and end of __cpuidle functions */
20extern char __cpuidle_text_start[], __cpuidle_text_end[];
21
19/** 22/**
20 * sched_idle_set_state - Record idle state for the current CPU. 23 * sched_idle_set_state - Record idle state for the current CPU.
21 * @idle_state: State to record. 24 * @idle_state: State to record.
@@ -53,7 +56,7 @@ static int __init cpu_idle_nopoll_setup(char *__unused)
53__setup("hlt", cpu_idle_nopoll_setup); 56__setup("hlt", cpu_idle_nopoll_setup);
54#endif 57#endif
55 58
56static inline int cpu_idle_poll(void) 59static noinline int __cpuidle cpu_idle_poll(void)
57{ 60{
58 rcu_idle_enter(); 61 rcu_idle_enter();
59 trace_cpu_idle_rcuidle(0, smp_processor_id()); 62 trace_cpu_idle_rcuidle(0, smp_processor_id());
@@ -84,7 +87,7 @@ void __weak arch_cpu_idle(void)
84 * 87 *
85 * To use when the cpuidle framework cannot be used. 88 * To use when the cpuidle framework cannot be used.
86 */ 89 */
87void default_idle_call(void) 90void __cpuidle default_idle_call(void)
88{ 91{
89 if (current_clr_polling_and_test()) { 92 if (current_clr_polling_and_test()) {
90 local_irq_enable(); 93 local_irq_enable();
@@ -271,6 +274,12 @@ static void cpu_idle_loop(void)
271 } 274 }
272} 275}
273 276
277bool cpu_in_idle(unsigned long pc)
278{
279 return pc >= (unsigned long)__cpuidle_text_start &&
280 pc < (unsigned long)__cpuidle_text_end;
281}
282
274void cpu_startup_entry(enum cpuhp_state state) 283void cpu_startup_entry(enum cpuhp_state state)
275{ 284{
276 /* 285 /*
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 2ce5458bbe1d..5405d3feb112 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -27,8 +27,8 @@ static struct task_struct *
27pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) 27pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
28{ 28{
29 put_prev_task(rq, prev); 29 put_prev_task(rq, prev);
30 30 update_idle_core(rq);
31 schedstat_inc(rq, sched_goidle); 31 schedstat_inc(rq->sched_goidle);
32 return rq->idle; 32 return rq->idle;
33} 33}
34 34
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d5690b722691..2516b8df6dbb 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -957,9 +957,8 @@ static void update_curr_rt(struct rq *rq)
957 if (unlikely((s64)delta_exec <= 0)) 957 if (unlikely((s64)delta_exec <= 0))
958 return; 958 return;
959 959
960 /* Kick cpufreq (see the comment in linux/cpufreq.h). */ 960 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
961 if (cpu_of(rq) == smp_processor_id()) 961 cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
962 cpufreq_trigger_update(rq_clock(rq));
963 962
964 schedstat_set(curr->se.statistics.exec_max, 963 schedstat_set(curr->se.statistics.exec_max,
965 max(curr->se.statistics.exec_max, delta_exec)); 964 max(curr->se.statistics.exec_max, delta_exec));
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c64fc5114004..055f935d4421 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2,6 +2,7 @@
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/sched/sysctl.h> 3#include <linux/sched/sysctl.h>
4#include <linux/sched/rt.h> 4#include <linux/sched/rt.h>
5#include <linux/u64_stats_sync.h>
5#include <linux/sched/deadline.h> 6#include <linux/sched/deadline.h>
6#include <linux/binfmts.h> 7#include <linux/binfmts.h>
7#include <linux/mutex.h> 8#include <linux/mutex.h>
@@ -15,6 +16,12 @@
15#include "cpudeadline.h" 16#include "cpudeadline.h"
16#include "cpuacct.h" 17#include "cpuacct.h"
17 18
19#ifdef CONFIG_SCHED_DEBUG
20#define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
21#else
22#define SCHED_WARN_ON(x) ((void)(x))
23#endif
24
18struct rq; 25struct rq;
19struct cpuidle_state; 26struct cpuidle_state;
20 27
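SCHED_WARN_ON() above only produces a warning under CONFIG_SCHED_DEBUG, yet the non-debug variant still evaluates its argument, so a condition with side effects behaves the same in both builds. A userspace sketch of the same shape; MY_DEBUG and MY_WARN_ON are invented names, the once-only latch mimics WARN_ONCE(), and the ({ ... }) statement expression is a GNU C extension the kernel itself relies on.

#include <stdio.h>

#define MY_DEBUG 1

#if MY_DEBUG
#define MY_WARN_ON(x)						\
	({							\
		static int __warned;				\
		int __ret = !!(x);				\
		if (__ret && !__warned) {			\
			__warned = 1;				\
			fprintf(stderr, "warning: %s\n", #x);	\
		}						\
		__ret;						\
	})
#else
#define MY_WARN_ON(x) ((void)(x))	/* still evaluates x, emits nothing */
#endif

int main(void)
{
	int calls = 0;

	for (int i = 0; i < 3; i++)
		MY_WARN_ON(++calls > 1);	/* warns once, but always increments */

	printf("calls = %d\n", calls);		/* 3 in both build flavours */
	return 0;
}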
@@ -565,6 +572,8 @@ struct root_domain {
565 */ 572 */
566 cpumask_var_t rto_mask; 573 cpumask_var_t rto_mask;
567 struct cpupri cpupri; 574 struct cpupri cpupri;
575
576 unsigned long max_cpu_capacity;
568}; 577};
569 578
570extern struct root_domain def_root_domain; 579extern struct root_domain def_root_domain;
@@ -597,7 +606,6 @@ struct rq {
597#ifdef CONFIG_SMP 606#ifdef CONFIG_SMP
598 unsigned long last_load_update_tick; 607 unsigned long last_load_update_tick;
599#endif /* CONFIG_SMP */ 608#endif /* CONFIG_SMP */
600 u64 nohz_stamp;
601 unsigned long nohz_flags; 609 unsigned long nohz_flags;
602#endif /* CONFIG_NO_HZ_COMMON */ 610#endif /* CONFIG_NO_HZ_COMMON */
603#ifdef CONFIG_NO_HZ_FULL 611#ifdef CONFIG_NO_HZ_FULL
@@ -723,6 +731,23 @@ static inline int cpu_of(struct rq *rq)
723#endif 731#endif
724} 732}
725 733
734
735#ifdef CONFIG_SCHED_SMT
736
737extern struct static_key_false sched_smt_present;
738
739extern void __update_idle_core(struct rq *rq);
740
741static inline void update_idle_core(struct rq *rq)
742{
743 if (static_branch_unlikely(&sched_smt_present))
744 __update_idle_core(rq);
745}
746
747#else
748static inline void update_idle_core(struct rq *rq) { }
749#endif
750
726DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 751DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
727 752
728#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 753#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
@@ -857,8 +882,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
857DECLARE_PER_CPU(struct sched_domain *, sd_llc); 882DECLARE_PER_CPU(struct sched_domain *, sd_llc);
858DECLARE_PER_CPU(int, sd_llc_size); 883DECLARE_PER_CPU(int, sd_llc_size);
859DECLARE_PER_CPU(int, sd_llc_id); 884DECLARE_PER_CPU(int, sd_llc_id);
885DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
860DECLARE_PER_CPU(struct sched_domain *, sd_numa); 886DECLARE_PER_CPU(struct sched_domain *, sd_numa);
861DECLARE_PER_CPU(struct sched_domain *, sd_busy);
862DECLARE_PER_CPU(struct sched_domain *, sd_asym); 887DECLARE_PER_CPU(struct sched_domain *, sd_asym);
863 888
864struct sched_group_capacity { 889struct sched_group_capacity {
@@ -870,10 +895,6 @@ struct sched_group_capacity {
870 unsigned int capacity; 895 unsigned int capacity;
871 unsigned long next_update; 896 unsigned long next_update;
872 int imbalance; /* XXX unrelated to capacity but shared group state */ 897 int imbalance; /* XXX unrelated to capacity but shared group state */
873 /*
874 * Number of busy cpus in this group.
875 */
876 atomic_t nr_busy_cpus;
877 898
878 unsigned long cpumask[0]; /* iteration mask */ 899 unsigned long cpumask[0]; /* iteration mask */
879}; 900};
@@ -1000,7 +1021,11 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1000 * per-task data have been completed by this moment. 1021 * per-task data have been completed by this moment.
1001 */ 1022 */
1002 smp_wmb(); 1023 smp_wmb();
1024#ifdef CONFIG_THREAD_INFO_IN_TASK
1025 p->cpu = cpu;
1026#else
1003 task_thread_info(p)->cpu = cpu; 1027 task_thread_info(p)->cpu = cpu;
1028#endif
1004 p->wake_cpu = cpu; 1029 p->wake_cpu = cpu;
1005#endif 1030#endif
1006} 1031}
@@ -1260,6 +1285,11 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
1260 prev->sched_class->put_prev_task(rq, prev); 1285 prev->sched_class->put_prev_task(rq, prev);
1261} 1286}
1262 1287
1288static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
1289{
1290 curr->sched_class->set_curr_task(rq);
1291}
1292
1263#define sched_class_highest (&stop_sched_class) 1293#define sched_class_highest (&stop_sched_class)
1264#define for_each_class(class) \ 1294#define for_each_class(class) \
1265 for (class = sched_class_highest; class; class = class->next) 1295 for (class = sched_class_highest; class; class = class->next)
@@ -1290,7 +1320,7 @@ static inline void idle_set_state(struct rq *rq,
1290 1320
1291static inline struct cpuidle_state *idle_get_state(struct rq *rq) 1321static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1292{ 1322{
1293 WARN_ON(!rcu_read_lock_held()); 1323 SCHED_WARN_ON(!rcu_read_lock_held());
1294 return rq->idle_state; 1324 return rq->idle_state;
1295} 1325}
1296#else 1326#else
@@ -1710,52 +1740,28 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
1710#endif 1740#endif
1711 1741
1712#ifdef CONFIG_IRQ_TIME_ACCOUNTING 1742#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1743struct irqtime {
1744 u64 hardirq_time;
1745 u64 softirq_time;
1746 u64 irq_start_time;
1747 struct u64_stats_sync sync;
1748};
1713 1749
1714DECLARE_PER_CPU(u64, cpu_hardirq_time); 1750DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
1715DECLARE_PER_CPU(u64, cpu_softirq_time);
1716
1717#ifndef CONFIG_64BIT
1718DECLARE_PER_CPU(seqcount_t, irq_time_seq);
1719
1720static inline void irq_time_write_begin(void)
1721{
1722 __this_cpu_inc(irq_time_seq.sequence);
1723 smp_wmb();
1724}
1725
1726static inline void irq_time_write_end(void)
1727{
1728 smp_wmb();
1729 __this_cpu_inc(irq_time_seq.sequence);
1730}
1731 1751
1732static inline u64 irq_time_read(int cpu) 1752static inline u64 irq_time_read(int cpu)
1733{ 1753{
1734 u64 irq_time; 1754 struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
1735 unsigned seq; 1755 unsigned int seq;
1756 u64 total;
1736 1757
1737 do { 1758 do {
1738 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); 1759 seq = __u64_stats_fetch_begin(&irqtime->sync);
1739 irq_time = per_cpu(cpu_softirq_time, cpu) + 1760 total = irqtime->softirq_time + irqtime->hardirq_time;
1740 per_cpu(cpu_hardirq_time, cpu); 1761 } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
1741 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1742
1743 return irq_time;
1744}
1745#else /* CONFIG_64BIT */
1746static inline void irq_time_write_begin(void)
1747{
1748}
1749
1750static inline void irq_time_write_end(void)
1751{
1752}
1753 1762
1754static inline u64 irq_time_read(int cpu) 1763 return total;
1755{
1756 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1757} 1764}
1758#endif /* CONFIG_64BIT */
1759#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 1765#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1760 1766
1761#ifdef CONFIG_CPU_FREQ 1767#ifdef CONFIG_CPU_FREQ
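The irqtime rewrite above swaps the open-coded per-CPU seqcount for u64_stats_sync, but the reader-side idea is unchanged: on 32-bit, retry the pair of 64-bit reads until the writer's sequence number is even and has not moved (on 64-bit the synchronization compiles away). A single-threaded model of just that protocol, with no real barriers or atomics; struct irqtime_model and both helpers are invented for the sketch.

#include <stdio.h>

struct irqtime_model {
	unsigned int seq;		/* even: stable, odd: write in progress */
	unsigned long long hardirq_time;
	unsigned long long softirq_time;
};

static void account_hardirq(struct irqtime_model *t, unsigned long long delta)
{
	t->seq++;			/* begin write: sequence goes odd */
	t->hardirq_time += delta;
	t->seq++;			/* end write: sequence is even again */
}

static unsigned long long irq_time_read(const struct irqtime_model *t)
{
	unsigned int seq;
	unsigned long long total;

	do {
		seq = t->seq;
		total = t->softirq_time + t->hardirq_time;
	} while ((seq & 1) || seq != t->seq);	/* retry torn or concurrent reads */

	return total;
}

int main(void)
{
	struct irqtime_model t = { 0, 0, 0 };

	account_hardirq(&t, 1200);
	account_hardirq(&t, 300);
	printf("irq time: %llu\n", irq_time_read(&t));	/* 1500 */
	return 0;
}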
@@ -1763,27 +1769,13 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
1763 1769
1764/** 1770/**
1765 * cpufreq_update_util - Take a note about CPU utilization changes. 1771 * cpufreq_update_util - Take a note about CPU utilization changes.
1766 * @time: Current time. 1772 * @rq: Runqueue to carry out the update for.
1767 * @util: Current utilization. 1773 * @flags: Update reason flags.
1768 * @max: Utilization ceiling.
1769 * 1774 *
1770 * This function is called by the scheduler on every invocation of 1775 * This function is called by the scheduler on the CPU whose utilization is
1771 * update_load_avg() on the CPU whose utilization is being updated. 1776 * being updated.
1772 * 1777 *
1773 * It can only be called from RCU-sched read-side critical sections. 1778 * It can only be called from RCU-sched read-side critical sections.
1774 */
1775static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max)
1776{
1777 struct update_util_data *data;
1778
1779 data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
1780 if (data)
1781 data->func(data, time, util, max);
1782}
1783
1784/**
1785 * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed.
1786 * @time: Current time.
1787 * 1779 *
1788 * The way cpufreq is currently arranged requires it to evaluate the CPU 1780 * The way cpufreq is currently arranged requires it to evaluate the CPU
1789 * performance state (frequency/voltage) on a regular basis to prevent it from 1781 * performance state (frequency/voltage) on a regular basis to prevent it from
@@ -1797,13 +1789,23 @@ static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned lo
1797 * but that really is a band-aid. Going forward it should be replaced with 1789 * but that really is a band-aid. Going forward it should be replaced with
1798 * solutions targeted more specifically at RT and DL tasks. 1790 * solutions targeted more specifically at RT and DL tasks.
1799 */ 1791 */
1800static inline void cpufreq_trigger_update(u64 time) 1792static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
1793{
1794 struct update_util_data *data;
1795
1796 data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
1797 if (data)
1798 data->func(data, rq_clock(rq), flags);
1799}
1800
1801static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags)
1801{ 1802{
1802 cpufreq_update_util(time, ULONG_MAX, 0); 1803 if (cpu_of(rq) == smp_processor_id())
1804 cpufreq_update_util(rq, flags);
1803} 1805}
1804#else 1806#else
1805static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {} 1807static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
1806static inline void cpufreq_trigger_update(u64 time) {} 1808static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {}
1807#endif /* CONFIG_CPU_FREQ */ 1809#endif /* CONFIG_CPU_FREQ */
1808 1810
1809#ifdef arch_scale_freq_capacity 1811#ifdef arch_scale_freq_capacity
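The reworked hook above hands governors a timestamp plus reason flags through a registered per-CPU callback, instead of pre-digested util/max numbers, and cpufreq_update_this_cpu() absorbs the "only on the local CPU" check that callers such as update_curr_rt() used to open-code. The shape of that interface, modelled with an ordinary function pointer; the flag values and governor_cb() are illustrative, only the SCHED_CPUFREQ_* names come from this diff.

#include <stdio.h>

#define SCHED_CPUFREQ_RT	(1U << 1)	/* values are illustrative */
#define SCHED_CPUFREQ_IOWAIT	(1U << 2)

struct update_util_data {
	void (*func)(struct update_util_data *data, unsigned long long time,
		     unsigned int flags);
};

static struct update_util_data *hook;	/* per-CPU pointer in the kernel */

static void governor_cb(struct update_util_data *data, unsigned long long time,
			unsigned int flags)
{
	(void)data;
	if (flags & (SCHED_CPUFREQ_RT | SCHED_CPUFREQ_IOWAIT))
		printf("t=%llu: go straight to a high frequency (flags=%#x)\n", time, flags);
	else
		printf("t=%llu: recompute frequency from utilization\n", time);
}

static void cpufreq_update_util(unsigned long long now, unsigned int flags)
{
	if (hook)
		hook->func(hook, now, flags);
}

int main(void)
{
	static struct update_util_data gov = { .func = governor_cb };

	hook = &gov;
	cpufreq_update_util(100, 0);
	cpufreq_update_util(200, SCHED_CPUFREQ_RT);
	return 0;
}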
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 78955cbea31c..34659a853505 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -29,11 +29,12 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
29 if (rq) 29 if (rq)
30 rq->rq_sched_info.run_delay += delta; 30 rq->rq_sched_info.run_delay += delta;
31} 31}
32# define schedstat_enabled() static_branch_unlikely(&sched_schedstats) 32#define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
33# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) 33#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0)
34# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) 34#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
35# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) 35#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
36# define schedstat_val(rq, field) ((schedstat_enabled()) ? (rq)->field : 0) 36#define schedstat_val(var) (var)
37#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
37 38
38#else /* !CONFIG_SCHEDSTATS */ 39#else /* !CONFIG_SCHEDSTATS */
39static inline void 40static inline void
@@ -45,12 +46,13 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
45static inline void 46static inline void
46rq_sched_info_depart(struct rq *rq, unsigned long long delta) 47rq_sched_info_depart(struct rq *rq, unsigned long long delta)
47{} 48{}
48# define schedstat_enabled() 0 49#define schedstat_enabled() 0
49# define schedstat_inc(rq, field) do { } while (0) 50#define schedstat_inc(var) do { } while (0)
50# define schedstat_add(rq, field, amt) do { } while (0) 51#define schedstat_add(var, amt) do { } while (0)
51# define schedstat_set(var, val) do { } while (0) 52#define schedstat_set(var, val) do { } while (0)
52# define schedstat_val(rq, field) 0 53#define schedstat_val(var) 0
53#endif 54#define schedstat_val_or_zero(var) 0
55#endif /* CONFIG_SCHEDSTATS */
54 56
55#ifdef CONFIG_SCHED_INFO 57#ifdef CONFIG_SCHED_INFO
56static inline void sched_info_reset_dequeued(struct task_struct *t) 58static inline void sched_info_reset_dequeued(struct task_struct *t)
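The stats.h change above makes every schedstat macro take the counter l-value directly (schedstat_inc(p->se.statistics.x) rather than schedstat_inc(p, se.statistics.x)), which is what the call-site churn in fair.c and idle_task.c earlier in this diff is about. A userspace sketch of the enabled-side macros, with a plain global standing in for the sched_schedstats static branch:

#include <stdio.h>

static int schedstats_on = 1;	/* stand-in for static_branch_unlikely(&sched_schedstats) */

#define schedstat_enabled()	(schedstats_on)
#define schedstat_inc(var)	do { if (schedstat_enabled()) { (var)++; } } while (0)
#define schedstat_add(var, amt)	do { if (schedstat_enabled()) { (var) += (amt); } } while (0)
#define schedstat_set(var, val)	do { if (schedstat_enabled()) { (var) = (val); } } while (0)
#define schedstat_val(var)	(var)

struct stats {
	unsigned long long wait_count;
	unsigned long long slice_max;
};

int main(void)
{
	struct stats st = { 0, 0 };
	unsigned long long slice = 1500;

	schedstat_inc(st.wait_count);
	schedstat_add(st.wait_count, 2);
	if (slice > schedstat_val(st.slice_max))
		schedstat_set(st.slice_max, slice);
	printf("wait_count=%llu slice_max=%llu\n", st.wait_count, st.slice_max);

	schedstats_on = 0;			/* disabled: the macros become no-ops */
	schedstat_inc(st.wait_count);
	printf("wait_count=%llu\n", st.wait_count);	/* still 3 */
	return 0;
}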
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index f15d6b6a538a..9453efe9b25a 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -196,27 +196,48 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
196} 196}
197EXPORT_SYMBOL(prepare_to_wait_exclusive); 197EXPORT_SYMBOL(prepare_to_wait_exclusive);
198 198
199long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) 199void init_wait_entry(wait_queue_t *wait, int flags)
200{ 200{
201 unsigned long flags; 201 wait->flags = flags;
202
203 if (signal_pending_state(state, current))
204 return -ERESTARTSYS;
205
206 wait->private = current; 202 wait->private = current;
207 wait->func = autoremove_wake_function; 203 wait->func = autoremove_wake_function;
204 INIT_LIST_HEAD(&wait->task_list);
205}
206EXPORT_SYMBOL(init_wait_entry);
207
208long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
209{
210 unsigned long flags;
211 long ret = 0;
208 212
209 spin_lock_irqsave(&q->lock, flags); 213 spin_lock_irqsave(&q->lock, flags);
210 if (list_empty(&wait->task_list)) { 214 if (unlikely(signal_pending_state(state, current))) {
211 if (wait->flags & WQ_FLAG_EXCLUSIVE) 215 /*
212 __add_wait_queue_tail(q, wait); 216 * Exclusive waiter must not fail if it was selected by wakeup,
213 else 217 * it should "consume" the condition we were waiting for.
214 __add_wait_queue(q, wait); 218 *
219 * The caller will recheck the condition and return success if
220 * we were already woken up, we can not miss the event because
221 * wakeup locks/unlocks the same q->lock.
222 *
223 * But we need to ensure that set-condition + wakeup after that
224 * can't see us, it should wake up another exclusive waiter if
225 * we fail.
226 */
227 list_del_init(&wait->task_list);
228 ret = -ERESTARTSYS;
229 } else {
230 if (list_empty(&wait->task_list)) {
231 if (wait->flags & WQ_FLAG_EXCLUSIVE)
232 __add_wait_queue_tail(q, wait);
233 else
234 __add_wait_queue(q, wait);
235 }
236 set_current_state(state);
215 } 237 }
216 set_current_state(state);
217 spin_unlock_irqrestore(&q->lock, flags); 238 spin_unlock_irqrestore(&q->lock, flags);
218 239
219 return 0; 240 return ret;
220} 241}
221EXPORT_SYMBOL(prepare_to_wait_event); 242EXPORT_SYMBOL(prepare_to_wait_event);
222 243
@@ -255,39 +276,6 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
255} 276}
256EXPORT_SYMBOL(finish_wait); 277EXPORT_SYMBOL(finish_wait);
257 278
258/**
259 * abort_exclusive_wait - abort exclusive waiting in a queue
260 * @q: waitqueue waited on
261 * @wait: wait descriptor
262 * @mode: runstate of the waiter to be woken
263 * @key: key to identify a wait bit queue or %NULL
264 *
265 * Sets current thread back to running state and removes
266 * the wait descriptor from the given waitqueue if still
267 * queued.
268 *
269 * Wakes up the next waiter if the caller is concurrently
270 * woken up through the queue.
271 *
272 * This prevents waiter starvation where an exclusive waiter
273 * aborts and is woken up concurrently and no one wakes up
274 * the next waiter.
275 */
276void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
277 unsigned int mode, void *key)
278{
279 unsigned long flags;
280
281 __set_current_state(TASK_RUNNING);
282 spin_lock_irqsave(&q->lock, flags);
283 if (!list_empty(&wait->task_list))
284 list_del_init(&wait->task_list);
285 else if (waitqueue_active(q))
286 __wake_up_locked_key(q, mode, key);
287 spin_unlock_irqrestore(&q->lock, flags);
288}
289EXPORT_SYMBOL(abort_exclusive_wait);
290
291int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) 279int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
292{ 280{
293 int ret = default_wake_function(wait, mode, sync, key); 281 int ret = default_wake_function(wait, mode, sync, key);
@@ -425,20 +413,29 @@ int __sched
425__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, 413__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
426 wait_bit_action_f *action, unsigned mode) 414 wait_bit_action_f *action, unsigned mode)
427{ 415{
428 do { 416 int ret = 0;
429 int ret;
430 417
418 for (;;) {
431 prepare_to_wait_exclusive(wq, &q->wait, mode); 419 prepare_to_wait_exclusive(wq, &q->wait, mode);
432 if (!test_bit(q->key.bit_nr, q->key.flags)) 420 if (test_bit(q->key.bit_nr, q->key.flags)) {
433 continue; 421 ret = action(&q->key, mode);
434 ret = action(&q->key, mode); 422 /*
435 if (!ret) 423 * See the comment in prepare_to_wait_event().
436 continue; 424 * finish_wait() does not necessarily takes wq->lock,
437 abort_exclusive_wait(wq, &q->wait, mode, &q->key); 425 * finish_wait() does not necessarily take wq->lock,
438 return ret; 426 * smp_mb__after_atomic() before wake_up_page().
439 } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); 427 */
440 finish_wait(wq, &q->wait); 428 if (ret)
441 return 0; 429 finish_wait(wq, &q->wait);
430 }
431 if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) {
432 if (!ret)
433 finish_wait(wq, &q->wait);
434 return 0;
435 } else if (ret) {
436 return ret;
437 }
438 }
442} 439}
443EXPORT_SYMBOL(__wait_on_bit_lock); 440EXPORT_SYMBOL(__wait_on_bit_lock);
444 441
@@ -483,16 +480,6 @@ void wake_up_bit(void *word, int bit)
483} 480}
484EXPORT_SYMBOL(wake_up_bit); 481EXPORT_SYMBOL(wake_up_bit);
485 482
486wait_queue_head_t *bit_waitqueue(void *word, int bit)
487{
488 const int shift = BITS_PER_LONG == 32 ? 5 : 6;
489 const struct zone *zone = page_zone(virt_to_page(word));
490 unsigned long val = (unsigned long)word << shift | bit;
491
492 return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
493}
494EXPORT_SYMBOL(bit_waitqueue);
495
496/* 483/*
497 * Manipulate the atomic_t address to produce a better bit waitqueue table hash 484 * Manipulate the atomic_t address to produce a better bit waitqueue table hash
498 * index (we're keying off bit -1, but that would produce a horrible hash 485 * index (we're keying off bit -1, but that would produce a horrible hash