Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/auto_group.c        |  36
-rw-r--r--  kernel/sched/core.c              | 362
-rw-r--r--  kernel/sched/cpudeadline.c       | 153
-rw-r--r--  kernel/sched/cpudeadline.h       |   3
-rw-r--r--  kernel/sched/cpufreq.c           |   2
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 122
-rw-r--r--  kernel/sched/cputime.c           |  87
-rw-r--r--  kernel/sched/deadline.c          |  83
-rw-r--r--  kernel/sched/debug.c             | 106
-rw-r--r--  kernel/sched/fair.c              | 794
-rw-r--r--  kernel/sched/idle.c              |  13
-rw-r--r--  kernel/sched/idle_task.c         |   4
-rw-r--r--  kernel/sched/rt.c                |   5
-rw-r--r--  kernel/sched/sched.h             | 136
-rw-r--r--  kernel/sched/stats.h             |  24
-rw-r--r--  kernel/sched/wait.c              | 123
16 files changed, 1286 insertions, 767 deletions
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index a5d966cb8891..f1c8fd566246 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -111,10 +111,13 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) | |||
111 | { | 111 | { |
112 | if (tg != &root_task_group) | 112 | if (tg != &root_task_group) |
113 | return false; | 113 | return false; |
114 | |||
115 | /* | 114 | /* |
116 | * We can only assume the task group can't go away on us if | 115 | * If we race with autogroup_move_group() the caller can use the old |
117 | * autogroup_move_group() can see us on ->thread_group list. | 116 | * value of signal->autogroup but in this case sched_move_task() will |
117 | * be called again before autogroup_kref_put(). | ||
118 | * | ||
119 | * However, there is no way sched_autogroup_exit_task() could tell us | ||
120 | * to avoid autogroup->tg, so we abuse PF_EXITING flag for this case. | ||
118 | */ | 121 | */ |
119 | if (p->flags & PF_EXITING) | 122 | if (p->flags & PF_EXITING) |
120 | return false; | 123 | return false; |
@@ -122,6 +125,16 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) | |||
122 | return true; | 125 | return true; |
123 | } | 126 | } |
124 | 127 | ||
128 | void sched_autogroup_exit_task(struct task_struct *p) | ||
129 | { | ||
130 | /* | ||
131 | * We are going to call exit_notify() and autogroup_move_group() can't | ||
132 | * see this thread after that: we can no longer use signal->autogroup. | ||
133 | * See the PF_EXITING check in task_wants_autogroup(). | ||
134 | */ | ||
135 | sched_move_task(p); | ||
136 | } | ||
137 | |||
125 | static void | 138 | static void |
126 | autogroup_move_group(struct task_struct *p, struct autogroup *ag) | 139 | autogroup_move_group(struct task_struct *p, struct autogroup *ag) |
127 | { | 140 | { |
@@ -138,13 +151,20 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) | |||
138 | } | 151 | } |
139 | 152 | ||
140 | p->signal->autogroup = autogroup_kref_get(ag); | 153 | p->signal->autogroup = autogroup_kref_get(ag); |
141 | 154 | /* | |
142 | if (!READ_ONCE(sysctl_sched_autogroup_enabled)) | 155 | * We can't avoid sched_move_task() after we changed signal->autogroup, |
143 | goto out; | 156 | * this process can already run with task_group() == prev->tg or we can |
144 | 157 | * race with cgroup code which can read autogroup = prev under rq->lock. | |
158 | * In the latter case for_each_thread() can not miss a migrating thread, | ||
159 | * cpu_cgroup_attach() must not be possible after cgroup_exit() and it | ||
160 | * can't be removed from thread list, we hold ->siglock. | ||
161 | * | ||
162 | * If an exiting thread was already removed from thread list we rely on | ||
163 | * sched_autogroup_exit_task(). | ||
164 | */ | ||
145 | for_each_thread(p, t) | 165 | for_each_thread(p, t) |
146 | sched_move_task(t); | 166 | sched_move_task(t); |
147 | out: | 167 | |
148 | unlock_task_sighand(p, &flags); | 168 | unlock_task_sighand(p, &flags); |
149 | autogroup_kref_put(prev); | 169 | autogroup_kref_put(prev); |
150 | } | 170 | } |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 44817c640e99..154fd689fe02 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -581,6 +581,8 @@ static bool wake_up_full_nohz_cpu(int cpu) | |||
581 | * If needed we can still optimize that later with an | 581 | * If needed we can still optimize that later with an |
582 | * empty IRQ. | 582 | * empty IRQ. |
583 | */ | 583 | */ |
584 | if (cpu_is_offline(cpu)) | ||
585 | return true; /* Don't try to wake offline CPUs. */ | ||
584 | if (tick_nohz_full_cpu(cpu)) { | 586 | if (tick_nohz_full_cpu(cpu)) { |
585 | if (cpu != smp_processor_id() || | 587 | if (cpu != smp_processor_id() || |
586 | tick_nohz_tick_stopped()) | 588 | tick_nohz_tick_stopped()) |
@@ -591,6 +593,11 @@ static bool wake_up_full_nohz_cpu(int cpu) | |||
591 | return false; | 593 | return false; |
592 | } | 594 | } |
593 | 595 | ||
596 | /* | ||
597 | * Wake up the specified CPU. If the CPU is going offline, it is the | ||
598 | * caller's responsibility to deal with the lost wakeup, for example, | ||
599 | * by hooking into the CPU_DEAD notifier like timers and hrtimers do. | ||
600 | */ | ||
594 | void wake_up_nohz_cpu(int cpu) | 601 | void wake_up_nohz_cpu(int cpu) |
595 | { | 602 | { |
596 | if (!wake_up_full_nohz_cpu(cpu)) | 603 | if (!wake_up_full_nohz_cpu(cpu)) |
@@ -1063,8 +1070,12 @@ static int migration_cpu_stop(void *data) | |||
1063 | * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because | 1070 | * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because |
1064 | * we're holding p->pi_lock. | 1071 | * we're holding p->pi_lock. |
1065 | */ | 1072 | */ |
1066 | if (task_rq(p) == rq && task_on_rq_queued(p)) | 1073 | if (task_rq(p) == rq) { |
1067 | rq = __migrate_task(rq, p, arg->dest_cpu); | 1074 | if (task_on_rq_queued(p)) |
1075 | rq = __migrate_task(rq, p, arg->dest_cpu); | ||
1076 | else | ||
1077 | p->wake_cpu = arg->dest_cpu; | ||
1078 | } | ||
1068 | raw_spin_unlock(&rq->lock); | 1079 | raw_spin_unlock(&rq->lock); |
1069 | raw_spin_unlock(&p->pi_lock); | 1080 | raw_spin_unlock(&p->pi_lock); |
1070 | 1081 | ||
@@ -1105,10 +1116,10 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | |||
1105 | 1116 | ||
1106 | p->sched_class->set_cpus_allowed(p, new_mask); | 1117 | p->sched_class->set_cpus_allowed(p, new_mask); |
1107 | 1118 | ||
1108 | if (running) | ||
1109 | p->sched_class->set_curr_task(rq); | ||
1110 | if (queued) | 1119 | if (queued) |
1111 | enqueue_task(rq, p, ENQUEUE_RESTORE); | 1120 | enqueue_task(rq, p, ENQUEUE_RESTORE); |
1121 | if (running) | ||
1122 | set_curr_task(rq, p); | ||
1112 | } | 1123 | } |
1113 | 1124 | ||
1114 | /* | 1125 | /* |
@@ -1265,7 +1276,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) | |||
1265 | /* | 1276 | /* |
1266 | * Task isn't running anymore; make it appear like we migrated | 1277 | * Task isn't running anymore; make it appear like we migrated |
1267 | * it before it went to sleep. This means on wakeup we make the | 1278 | * it before it went to sleep. This means on wakeup we make the |
1268 | * previous cpu our targer instead of where it really is. | 1279 | * previous cpu our target instead of where it really is. |
1269 | */ | 1280 | */ |
1270 | p->wake_cpu = cpu; | 1281 | p->wake_cpu = cpu; |
1271 | } | 1282 | } |
@@ -1629,23 +1640,25 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p, | |||
1629 | static void | 1640 | static void |
1630 | ttwu_stat(struct task_struct *p, int cpu, int wake_flags) | 1641 | ttwu_stat(struct task_struct *p, int cpu, int wake_flags) |
1631 | { | 1642 | { |
1632 | #ifdef CONFIG_SCHEDSTATS | 1643 | struct rq *rq; |
1633 | struct rq *rq = this_rq(); | ||
1634 | 1644 | ||
1635 | #ifdef CONFIG_SMP | 1645 | if (!schedstat_enabled()) |
1636 | int this_cpu = smp_processor_id(); | 1646 | return; |
1637 | 1647 | ||
1638 | if (cpu == this_cpu) { | 1648 | rq = this_rq(); |
1639 | schedstat_inc(rq, ttwu_local); | 1649 | |
1640 | schedstat_inc(p, se.statistics.nr_wakeups_local); | 1650 | #ifdef CONFIG_SMP |
1651 | if (cpu == rq->cpu) { | ||
1652 | schedstat_inc(rq->ttwu_local); | ||
1653 | schedstat_inc(p->se.statistics.nr_wakeups_local); | ||
1641 | } else { | 1654 | } else { |
1642 | struct sched_domain *sd; | 1655 | struct sched_domain *sd; |
1643 | 1656 | ||
1644 | schedstat_inc(p, se.statistics.nr_wakeups_remote); | 1657 | schedstat_inc(p->se.statistics.nr_wakeups_remote); |
1645 | rcu_read_lock(); | 1658 | rcu_read_lock(); |
1646 | for_each_domain(this_cpu, sd) { | 1659 | for_each_domain(rq->cpu, sd) { |
1647 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | 1660 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { |
1648 | schedstat_inc(sd, ttwu_wake_remote); | 1661 | schedstat_inc(sd->ttwu_wake_remote); |
1649 | break; | 1662 | break; |
1650 | } | 1663 | } |
1651 | } | 1664 | } |
@@ -1653,17 +1666,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) | |||
1653 | } | 1666 | } |
1654 | 1667 | ||
1655 | if (wake_flags & WF_MIGRATED) | 1668 | if (wake_flags & WF_MIGRATED) |
1656 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | 1669 | schedstat_inc(p->se.statistics.nr_wakeups_migrate); |
1657 | |||
1658 | #endif /* CONFIG_SMP */ | 1670 | #endif /* CONFIG_SMP */ |
1659 | 1671 | ||
1660 | schedstat_inc(rq, ttwu_count); | 1672 | schedstat_inc(rq->ttwu_count); |
1661 | schedstat_inc(p, se.statistics.nr_wakeups); | 1673 | schedstat_inc(p->se.statistics.nr_wakeups); |
1662 | 1674 | ||
1663 | if (wake_flags & WF_SYNC) | 1675 | if (wake_flags & WF_SYNC) |
1664 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | 1676 | schedstat_inc(p->se.statistics.nr_wakeups_sync); |
1665 | |||
1666 | #endif /* CONFIG_SCHEDSTATS */ | ||
1667 | } | 1677 | } |
1668 | 1678 | ||
1669 | static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) | 1679 | static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) |
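[Editor's note] The ttwu_stat() rewrite above bails out early when schedstats are off and switches the helpers from the old (object, field) pair to a single lvalue argument, e.g. schedstat_inc(rq->ttwu_count). A rough user-space sketch of that macro shape, with stats_enabled and stat_inc() made up for illustration rather than the kernel's actual definitions:

    #include <stdbool.h>
    #include <stdio.h>

    static bool stats_enabled = true;	/* stand-in for schedstat_enabled() */

    /* Increment the given counter lvalue only when stats are on. */
    #define stat_inc(var)	do { if (stats_enabled) (var)++; } while (0)

    struct runqueue_stats {
    	unsigned long ttwu_count;
    	unsigned long ttwu_local;
    };

    int main(void)
    {
    	struct runqueue_stats rq = { 0, 0 };

    	stat_inc(rq.ttwu_count);	/* single-argument form: pass the lvalue */
    	stat_inc(rq.ttwu_local);

    	stats_enabled = false;
    	stat_inc(rq.ttwu_count);	/* no-op while disabled */

    	printf("ttwu_count=%lu ttwu_local=%lu\n", rq.ttwu_count, rq.ttwu_local);
    	return 0;
    }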
@@ -2084,8 +2094,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
2084 | 2094 | ||
2085 | ttwu_queue(p, cpu, wake_flags); | 2095 | ttwu_queue(p, cpu, wake_flags); |
2086 | stat: | 2096 | stat: |
2087 | if (schedstat_enabled()) | 2097 | ttwu_stat(p, cpu, wake_flags); |
2088 | ttwu_stat(p, cpu, wake_flags); | ||
2089 | out: | 2098 | out: |
2090 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 2099 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
2091 | 2100 | ||
@@ -2095,6 +2104,7 @@ out: | |||
2095 | /** | 2104 | /** |
2096 | * try_to_wake_up_local - try to wake up a local task with rq lock held | 2105 | * try_to_wake_up_local - try to wake up a local task with rq lock held |
2097 | * @p: the thread to be awakened | 2106 | * @p: the thread to be awakened |
2107 | * @cookie: context's cookie for pinning | ||
2098 | * | 2108 | * |
2099 | * Put @p on the run-queue if it's not already there. The caller must | 2109 | * Put @p on the run-queue if it's not already there. The caller must |
2100 | * ensure that this_rq() is locked, @p is bound to this_rq() and not | 2110 | * ensure that this_rq() is locked, @p is bound to this_rq() and not |
@@ -2133,8 +2143,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie | |||
2133 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | 2143 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); |
2134 | 2144 | ||
2135 | ttwu_do_wakeup(rq, p, 0, cookie); | 2145 | ttwu_do_wakeup(rq, p, 0, cookie); |
2136 | if (schedstat_enabled()) | 2146 | ttwu_stat(p, smp_processor_id(), 0); |
2137 | ttwu_stat(p, smp_processor_id(), 0); | ||
2138 | out: | 2147 | out: |
2139 | raw_spin_unlock(&p->pi_lock); | 2148 | raw_spin_unlock(&p->pi_lock); |
2140 | } | 2149 | } |
@@ -2772,6 +2781,10 @@ static struct rq *finish_task_switch(struct task_struct *prev) | |||
2772 | * task and put them back on the free list. | 2781 | * task and put them back on the free list. |
2773 | */ | 2782 | */ |
2774 | kprobe_flush_task(prev); | 2783 | kprobe_flush_task(prev); |
2784 | |||
2785 | /* Task is done with its stack. */ | ||
2786 | put_task_stack(prev); | ||
2787 | |||
2775 | put_task_struct(prev); | 2788 | put_task_struct(prev); |
2776 | } | 2789 | } |
2777 | 2790 | ||
@@ -3192,6 +3205,9 @@ static inline void preempt_latency_stop(int val) { } | |||
3192 | */ | 3205 | */ |
3193 | static noinline void __schedule_bug(struct task_struct *prev) | 3206 | static noinline void __schedule_bug(struct task_struct *prev) |
3194 | { | 3207 | { |
3208 | /* Save this before calling printk(), since that will clobber it */ | ||
3209 | unsigned long preempt_disable_ip = get_preempt_disable_ip(current); | ||
3210 | |||
3195 | if (oops_in_progress) | 3211 | if (oops_in_progress) |
3196 | return; | 3212 | return; |
3197 | 3213 | ||
@@ -3202,13 +3218,12 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
3202 | print_modules(); | 3218 | print_modules(); |
3203 | if (irqs_disabled()) | 3219 | if (irqs_disabled()) |
3204 | print_irqtrace_events(prev); | 3220 | print_irqtrace_events(prev); |
3205 | #ifdef CONFIG_DEBUG_PREEMPT | 3221 | if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) |
3206 | if (in_atomic_preempt_off()) { | 3222 | && in_atomic_preempt_off()) { |
3207 | pr_err("Preemption disabled at:"); | 3223 | pr_err("Preemption disabled at:"); |
3208 | print_ip_sym(current->preempt_disable_ip); | 3224 | print_ip_sym(preempt_disable_ip); |
3209 | pr_cont("\n"); | 3225 | pr_cont("\n"); |
3210 | } | 3226 | } |
3211 | #endif | ||
3212 | if (panic_on_warn) | 3227 | if (panic_on_warn) |
3213 | panic("scheduling while atomic\n"); | 3228 | panic("scheduling while atomic\n"); |
3214 | 3229 | ||
@@ -3234,7 +3249,7 @@ static inline void schedule_debug(struct task_struct *prev) | |||
3234 | 3249 | ||
3235 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 3250 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
3236 | 3251 | ||
3237 | schedstat_inc(this_rq(), sched_count); | 3252 | schedstat_inc(this_rq()->sched_count); |
3238 | } | 3253 | } |
3239 | 3254 | ||
3240 | /* | 3255 | /* |
@@ -3327,17 +3342,6 @@ static void __sched notrace __schedule(bool preempt) | |||
3327 | rq = cpu_rq(cpu); | 3342 | rq = cpu_rq(cpu); |
3328 | prev = rq->curr; | 3343 | prev = rq->curr; |
3329 | 3344 | ||
3330 | /* | ||
3331 | * do_exit() calls schedule() with preemption disabled as an exception; | ||
3332 | * however we must fix that up, otherwise the next task will see an | ||
3333 | * inconsistent (higher) preempt count. | ||
3334 | * | ||
3335 | * It also avoids the below schedule_debug() test from complaining | ||
3336 | * about this. | ||
3337 | */ | ||
3338 | if (unlikely(prev->state == TASK_DEAD)) | ||
3339 | preempt_enable_no_resched_notrace(); | ||
3340 | |||
3341 | schedule_debug(prev); | 3345 | schedule_debug(prev); |
3342 | 3346 | ||
3343 | if (sched_feat(HRTICK)) | 3347 | if (sched_feat(HRTICK)) |
@@ -3403,7 +3407,33 @@ static void __sched notrace __schedule(bool preempt) | |||
3403 | 3407 | ||
3404 | balance_callback(rq); | 3408 | balance_callback(rq); |
3405 | } | 3409 | } |
3406 | STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ | 3410 | |
3411 | void __noreturn do_task_dead(void) | ||
3412 | { | ||
3413 | /* | ||
3414 | * The setting of TASK_RUNNING by try_to_wake_up() may be delayed | ||
3415 | * when the following two conditions become true. | ||
3416 | * - There is race condition of mmap_sem (It is acquired by | ||
3417 | * exit_mm()), and | ||
3418 | * - SMI occurs before setting TASK_RUNINNG. | ||
3419 | * (or hypervisor of virtual machine switches to other guest) | ||
3420 | * As a result, we may become TASK_RUNNING after becoming TASK_DEAD | ||
3421 | * | ||
3422 | * To avoid it, we have to wait for releasing tsk->pi_lock which | ||
3423 | * is held by try_to_wake_up() | ||
3424 | */ | ||
3425 | smp_mb(); | ||
3426 | raw_spin_unlock_wait(¤t->pi_lock); | ||
3427 | |||
3428 | /* causes final put_task_struct in finish_task_switch(). */ | ||
3429 | __set_current_state(TASK_DEAD); | ||
3430 | current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ | ||
3431 | __schedule(false); | ||
3432 | BUG(); | ||
3433 | /* Avoid "noreturn function does return". */ | ||
3434 | for (;;) | ||
3435 | cpu_relax(); /* For when BUG is null */ | ||
3436 | } | ||
3407 | 3437 | ||
3408 | static inline void sched_submit_work(struct task_struct *tsk) | 3438 | static inline void sched_submit_work(struct task_struct *tsk) |
3409 | { | 3439 | { |
@@ -3687,10 +3717,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3687 | 3717 | ||
3688 | p->prio = prio; | 3718 | p->prio = prio; |
3689 | 3719 | ||
3690 | if (running) | ||
3691 | p->sched_class->set_curr_task(rq); | ||
3692 | if (queued) | 3720 | if (queued) |
3693 | enqueue_task(rq, p, queue_flag); | 3721 | enqueue_task(rq, p, queue_flag); |
3722 | if (running) | ||
3723 | set_curr_task(rq, p); | ||
3694 | 3724 | ||
3695 | check_class_changed(rq, p, prev_class, oldprio); | 3725 | check_class_changed(rq, p, prev_class, oldprio); |
3696 | out_unlock: | 3726 | out_unlock: |
@@ -3704,7 +3734,8 @@ out_unlock: | |||
3704 | 3734 | ||
3705 | void set_user_nice(struct task_struct *p, long nice) | 3735 | void set_user_nice(struct task_struct *p, long nice) |
3706 | { | 3736 | { |
3707 | int old_prio, delta, queued; | 3737 | bool queued, running; |
3738 | int old_prio, delta; | ||
3708 | struct rq_flags rf; | 3739 | struct rq_flags rf; |
3709 | struct rq *rq; | 3740 | struct rq *rq; |
3710 | 3741 | ||
@@ -3726,8 +3757,11 @@ void set_user_nice(struct task_struct *p, long nice) | |||
3726 | goto out_unlock; | 3757 | goto out_unlock; |
3727 | } | 3758 | } |
3728 | queued = task_on_rq_queued(p); | 3759 | queued = task_on_rq_queued(p); |
3760 | running = task_current(rq, p); | ||
3729 | if (queued) | 3761 | if (queued) |
3730 | dequeue_task(rq, p, DEQUEUE_SAVE); | 3762 | dequeue_task(rq, p, DEQUEUE_SAVE); |
3763 | if (running) | ||
3764 | put_prev_task(rq, p); | ||
3731 | 3765 | ||
3732 | p->static_prio = NICE_TO_PRIO(nice); | 3766 | p->static_prio = NICE_TO_PRIO(nice); |
3733 | set_load_weight(p); | 3767 | set_load_weight(p); |
@@ -3744,6 +3778,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
3744 | if (delta < 0 || (delta > 0 && task_running(rq, p))) | 3778 | if (delta < 0 || (delta > 0 && task_running(rq, p))) |
3745 | resched_curr(rq); | 3779 | resched_curr(rq); |
3746 | } | 3780 | } |
3781 | if (running) | ||
3782 | set_curr_task(rq, p); | ||
3747 | out_unlock: | 3783 | out_unlock: |
3748 | task_rq_unlock(rq, p, &rf); | 3784 | task_rq_unlock(rq, p, &rf); |
3749 | } | 3785 | } |
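[Editor's note] The set_user_nice() hunk above follows the save/restore shape used throughout this patch: remember whether the task was queued and whether it was current, take it out, change the attribute, then re-enqueue and only afterwards refresh the "current task" bookkeeping. A toy user-space version of that ordering (struct and helpers invented for the sketch, not scheduler code):

    #include <stdbool.h>
    #include <stdio.h>

    struct task {
    	int prio;
    	bool queued;	/* currently on the run list? */
    	bool running;	/* currently the CPU's task? */
    };

    /* Hypothetical queue operations, just enough for the sketch. */
    static void dequeue(struct task *t)  { t->queued = false; }
    static void enqueue(struct task *t)  { t->queued = true; }
    static void put_prev(struct task *t) { t->running = false; }
    static void set_curr(struct task *t) { t->running = true; }

    /* Change a task's priority using the dequeue/requeue ordering from the patch. */
    static void change_prio(struct task *t, int prio)
    {
    	bool queued = t->queued;
    	bool running = t->running;

    	if (queued)
    		dequeue(t);	/* take it out before touching the sort key */
    	if (running)
    		put_prev(t);

    	t->prio = prio;		/* safe: the task sits in no ordered structure */

    	if (queued)
    		enqueue(t);	/* re-insert where the new key belongs */
    	if (running)
    		set_curr(t);	/* restore "current" only after the requeue */
    }

    int main(void)
    {
    	struct task t = { .prio = 120, .queued = true, .running = true };

    	change_prio(&t, 100);
    	printf("prio=%d queued=%d running=%d\n", t.prio, t.queued, t.running);
    	return 0;
    }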
@@ -4243,8 +4279,6 @@ change: | |||
4243 | prev_class = p->sched_class; | 4279 | prev_class = p->sched_class; |
4244 | __setscheduler(rq, p, attr, pi); | 4280 | __setscheduler(rq, p, attr, pi); |
4245 | 4281 | ||
4246 | if (running) | ||
4247 | p->sched_class->set_curr_task(rq); | ||
4248 | if (queued) { | 4282 | if (queued) { |
4249 | /* | 4283 | /* |
4250 | * We enqueue to tail when the priority of a task is | 4284 | * We enqueue to tail when the priority of a task is |
@@ -4255,6 +4289,8 @@ change: | |||
4255 | 4289 | ||
4256 | enqueue_task(rq, p, queue_flags); | 4290 | enqueue_task(rq, p, queue_flags); |
4257 | } | 4291 | } |
4292 | if (running) | ||
4293 | set_curr_task(rq, p); | ||
4258 | 4294 | ||
4259 | check_class_changed(rq, p, prev_class, oldprio); | 4295 | check_class_changed(rq, p, prev_class, oldprio); |
4260 | preempt_disable(); /* avoid rq from going away on us */ | 4296 | preempt_disable(); /* avoid rq from going away on us */ |
@@ -4846,7 +4882,7 @@ SYSCALL_DEFINE0(sched_yield) | |||
4846 | { | 4882 | { |
4847 | struct rq *rq = this_rq_lock(); | 4883 | struct rq *rq = this_rq_lock(); |
4848 | 4884 | ||
4849 | schedstat_inc(rq, yld_count); | 4885 | schedstat_inc(rq->yld_count); |
4850 | current->sched_class->yield_task(rq); | 4886 | current->sched_class->yield_task(rq); |
4851 | 4887 | ||
4852 | /* | 4888 | /* |
@@ -4863,6 +4899,7 @@ SYSCALL_DEFINE0(sched_yield) | |||
4863 | return 0; | 4899 | return 0; |
4864 | } | 4900 | } |
4865 | 4901 | ||
4902 | #ifndef CONFIG_PREEMPT | ||
4866 | int __sched _cond_resched(void) | 4903 | int __sched _cond_resched(void) |
4867 | { | 4904 | { |
4868 | if (should_resched(0)) { | 4905 | if (should_resched(0)) { |
@@ -4872,6 +4909,7 @@ int __sched _cond_resched(void) | |||
4872 | return 0; | 4909 | return 0; |
4873 | } | 4910 | } |
4874 | EXPORT_SYMBOL(_cond_resched); | 4911 | EXPORT_SYMBOL(_cond_resched); |
4912 | #endif | ||
4875 | 4913 | ||
4876 | /* | 4914 | /* |
4877 | * __cond_resched_lock() - if a reschedule is pending, drop the given lock, | 4915 | * __cond_resched_lock() - if a reschedule is pending, drop the given lock, |
@@ -4997,7 +5035,7 @@ again: | |||
4997 | 5035 | ||
4998 | yielded = curr->sched_class->yield_to_task(rq, p, preempt); | 5036 | yielded = curr->sched_class->yield_to_task(rq, p, preempt); |
4999 | if (yielded) { | 5037 | if (yielded) { |
5000 | schedstat_inc(rq, yld_count); | 5038 | schedstat_inc(rq->yld_count); |
5001 | /* | 5039 | /* |
5002 | * Make p's CPU reschedule; pick_next_entity takes care of | 5040 | * Make p's CPU reschedule; pick_next_entity takes care of |
5003 | * fairness. | 5041 | * fairness. |
@@ -5154,21 +5192,14 @@ void sched_show_task(struct task_struct *p) | |||
5154 | int ppid; | 5192 | int ppid; |
5155 | unsigned long state = p->state; | 5193 | unsigned long state = p->state; |
5156 | 5194 | ||
5195 | if (!try_get_task_stack(p)) | ||
5196 | return; | ||
5157 | if (state) | 5197 | if (state) |
5158 | state = __ffs(state) + 1; | 5198 | state = __ffs(state) + 1; |
5159 | printk(KERN_INFO "%-15.15s %c", p->comm, | 5199 | printk(KERN_INFO "%-15.15s %c", p->comm, |
5160 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); | 5200 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); |
5161 | #if BITS_PER_LONG == 32 | ||
5162 | if (state == TASK_RUNNING) | ||
5163 | printk(KERN_CONT " running "); | ||
5164 | else | ||
5165 | printk(KERN_CONT " %08lx ", thread_saved_pc(p)); | ||
5166 | #else | ||
5167 | if (state == TASK_RUNNING) | 5201 | if (state == TASK_RUNNING) |
5168 | printk(KERN_CONT " running task "); | 5202 | printk(KERN_CONT " running task "); |
5169 | else | ||
5170 | printk(KERN_CONT " %016lx ", thread_saved_pc(p)); | ||
5171 | #endif | ||
5172 | #ifdef CONFIG_DEBUG_STACK_USAGE | 5203 | #ifdef CONFIG_DEBUG_STACK_USAGE |
5173 | free = stack_not_used(p); | 5204 | free = stack_not_used(p); |
5174 | #endif | 5205 | #endif |
@@ -5183,6 +5214,7 @@ void sched_show_task(struct task_struct *p) | |||
5183 | 5214 | ||
5184 | print_worker_info(KERN_INFO, p); | 5215 | print_worker_info(KERN_INFO, p); |
5185 | show_stack(p, NULL); | 5216 | show_stack(p, NULL); |
5217 | put_task_stack(p); | ||
5186 | } | 5218 | } |
5187 | 5219 | ||
5188 | void show_state_filter(unsigned long state_filter) | 5220 | void show_state_filter(unsigned long state_filter) |
@@ -5417,10 +5449,10 @@ void sched_setnuma(struct task_struct *p, int nid) | |||
5417 | 5449 | ||
5418 | p->numa_preferred_nid = nid; | 5450 | p->numa_preferred_nid = nid; |
5419 | 5451 | ||
5420 | if (running) | ||
5421 | p->sched_class->set_curr_task(rq); | ||
5422 | if (queued) | 5452 | if (queued) |
5423 | enqueue_task(rq, p, ENQUEUE_RESTORE); | 5453 | enqueue_task(rq, p, ENQUEUE_RESTORE); |
5454 | if (running) | ||
5455 | set_curr_task(rq, p); | ||
5424 | task_rq_unlock(rq, p, &rf); | 5456 | task_rq_unlock(rq, p, &rf); |
5425 | } | 5457 | } |
5426 | #endif /* CONFIG_NUMA_BALANCING */ | 5458 | #endif /* CONFIG_NUMA_BALANCING */ |
@@ -5717,6 +5749,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
5717 | } | 5749 | } |
5718 | } | 5750 | } |
5719 | #else /* !CONFIG_SCHED_DEBUG */ | 5751 | #else /* !CONFIG_SCHED_DEBUG */ |
5752 | |||
5753 | # define sched_debug_enabled 0 | ||
5720 | # define sched_domain_debug(sd, cpu) do { } while (0) | 5754 | # define sched_domain_debug(sd, cpu) do { } while (0) |
5721 | static inline bool sched_debug(void) | 5755 | static inline bool sched_debug(void) |
5722 | { | 5756 | { |
@@ -5735,6 +5769,7 @@ static int sd_degenerate(struct sched_domain *sd) | |||
5735 | SD_BALANCE_FORK | | 5769 | SD_BALANCE_FORK | |
5736 | SD_BALANCE_EXEC | | 5770 | SD_BALANCE_EXEC | |
5737 | SD_SHARE_CPUCAPACITY | | 5771 | SD_SHARE_CPUCAPACITY | |
5772 | SD_ASYM_CPUCAPACITY | | ||
5738 | SD_SHARE_PKG_RESOURCES | | 5773 | SD_SHARE_PKG_RESOURCES | |
5739 | SD_SHARE_POWERDOMAIN)) { | 5774 | SD_SHARE_POWERDOMAIN)) { |
5740 | if (sd->groups != sd->groups->next) | 5775 | if (sd->groups != sd->groups->next) |
@@ -5765,6 +5800,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
5765 | SD_BALANCE_NEWIDLE | | 5800 | SD_BALANCE_NEWIDLE | |
5766 | SD_BALANCE_FORK | | 5801 | SD_BALANCE_FORK | |
5767 | SD_BALANCE_EXEC | | 5802 | SD_BALANCE_EXEC | |
5803 | SD_ASYM_CPUCAPACITY | | ||
5768 | SD_SHARE_CPUCAPACITY | | 5804 | SD_SHARE_CPUCAPACITY | |
5769 | SD_SHARE_PKG_RESOURCES | | 5805 | SD_SHARE_PKG_RESOURCES | |
5770 | SD_PREFER_SIBLING | | 5806 | SD_PREFER_SIBLING | |
@@ -5909,10 +5945,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc) | |||
5909 | } while (sg != first); | 5945 | } while (sg != first); |
5910 | } | 5946 | } |
5911 | 5947 | ||
5912 | static void free_sched_domain(struct rcu_head *rcu) | 5948 | static void destroy_sched_domain(struct sched_domain *sd) |
5913 | { | 5949 | { |
5914 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); | ||
5915 | |||
5916 | /* | 5950 | /* |
5917 | * If its an overlapping domain it has private groups, iterate and | 5951 | * If its an overlapping domain it has private groups, iterate and |
5918 | * nuke them all. | 5952 | * nuke them all. |
@@ -5923,18 +5957,26 @@ static void free_sched_domain(struct rcu_head *rcu) | |||
5923 | kfree(sd->groups->sgc); | 5957 | kfree(sd->groups->sgc); |
5924 | kfree(sd->groups); | 5958 | kfree(sd->groups); |
5925 | } | 5959 | } |
5960 | if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) | ||
5961 | kfree(sd->shared); | ||
5926 | kfree(sd); | 5962 | kfree(sd); |
5927 | } | 5963 | } |
5928 | 5964 | ||
5929 | static void destroy_sched_domain(struct sched_domain *sd, int cpu) | 5965 | static void destroy_sched_domains_rcu(struct rcu_head *rcu) |
5930 | { | 5966 | { |
5931 | call_rcu(&sd->rcu, free_sched_domain); | 5967 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); |
5968 | |||
5969 | while (sd) { | ||
5970 | struct sched_domain *parent = sd->parent; | ||
5971 | destroy_sched_domain(sd); | ||
5972 | sd = parent; | ||
5973 | } | ||
5932 | } | 5974 | } |
5933 | 5975 | ||
5934 | static void destroy_sched_domains(struct sched_domain *sd, int cpu) | 5976 | static void destroy_sched_domains(struct sched_domain *sd) |
5935 | { | 5977 | { |
5936 | for (; sd; sd = sd->parent) | 5978 | if (sd) |
5937 | destroy_sched_domain(sd, cpu); | 5979 | call_rcu(&sd->rcu, destroy_sched_domains_rcu); |
5938 | } | 5980 | } |
5939 | 5981 | ||
5940 | /* | 5982 | /* |
@@ -5949,14 +5991,14 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) | |||
5949 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); | 5991 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); |
5950 | DEFINE_PER_CPU(int, sd_llc_size); | 5992 | DEFINE_PER_CPU(int, sd_llc_size); |
5951 | DEFINE_PER_CPU(int, sd_llc_id); | 5993 | DEFINE_PER_CPU(int, sd_llc_id); |
5994 | DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); | ||
5952 | DEFINE_PER_CPU(struct sched_domain *, sd_numa); | 5995 | DEFINE_PER_CPU(struct sched_domain *, sd_numa); |
5953 | DEFINE_PER_CPU(struct sched_domain *, sd_busy); | ||
5954 | DEFINE_PER_CPU(struct sched_domain *, sd_asym); | 5996 | DEFINE_PER_CPU(struct sched_domain *, sd_asym); |
5955 | 5997 | ||
5956 | static void update_top_cache_domain(int cpu) | 5998 | static void update_top_cache_domain(int cpu) |
5957 | { | 5999 | { |
6000 | struct sched_domain_shared *sds = NULL; | ||
5958 | struct sched_domain *sd; | 6001 | struct sched_domain *sd; |
5959 | struct sched_domain *busy_sd = NULL; | ||
5960 | int id = cpu; | 6002 | int id = cpu; |
5961 | int size = 1; | 6003 | int size = 1; |
5962 | 6004 | ||
@@ -5964,13 +6006,13 @@ static void update_top_cache_domain(int cpu) | |||
5964 | if (sd) { | 6006 | if (sd) { |
5965 | id = cpumask_first(sched_domain_span(sd)); | 6007 | id = cpumask_first(sched_domain_span(sd)); |
5966 | size = cpumask_weight(sched_domain_span(sd)); | 6008 | size = cpumask_weight(sched_domain_span(sd)); |
5967 | busy_sd = sd->parent; /* sd_busy */ | 6009 | sds = sd->shared; |
5968 | } | 6010 | } |
5969 | rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd); | ||
5970 | 6011 | ||
5971 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); | 6012 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); |
5972 | per_cpu(sd_llc_size, cpu) = size; | 6013 | per_cpu(sd_llc_size, cpu) = size; |
5973 | per_cpu(sd_llc_id, cpu) = id; | 6014 | per_cpu(sd_llc_id, cpu) = id; |
6015 | rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); | ||
5974 | 6016 | ||
5975 | sd = lowest_flag_domain(cpu, SD_NUMA); | 6017 | sd = lowest_flag_domain(cpu, SD_NUMA); |
5976 | rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); | 6018 | rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); |
@@ -6006,7 +6048,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6006 | */ | 6048 | */ |
6007 | if (parent->flags & SD_PREFER_SIBLING) | 6049 | if (parent->flags & SD_PREFER_SIBLING) |
6008 | tmp->flags |= SD_PREFER_SIBLING; | 6050 | tmp->flags |= SD_PREFER_SIBLING; |
6009 | destroy_sched_domain(parent, cpu); | 6051 | destroy_sched_domain(parent); |
6010 | } else | 6052 | } else |
6011 | tmp = tmp->parent; | 6053 | tmp = tmp->parent; |
6012 | } | 6054 | } |
@@ -6014,7 +6056,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6014 | if (sd && sd_degenerate(sd)) { | 6056 | if (sd && sd_degenerate(sd)) { |
6015 | tmp = sd; | 6057 | tmp = sd; |
6016 | sd = sd->parent; | 6058 | sd = sd->parent; |
6017 | destroy_sched_domain(tmp, cpu); | 6059 | destroy_sched_domain(tmp); |
6018 | if (sd) | 6060 | if (sd) |
6019 | sd->child = NULL; | 6061 | sd->child = NULL; |
6020 | } | 6062 | } |
@@ -6024,7 +6066,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6024 | rq_attach_root(rq, rd); | 6066 | rq_attach_root(rq, rd); |
6025 | tmp = rq->sd; | 6067 | tmp = rq->sd; |
6026 | rcu_assign_pointer(rq->sd, sd); | 6068 | rcu_assign_pointer(rq->sd, sd); |
6027 | destroy_sched_domains(tmp, cpu); | 6069 | destroy_sched_domains(tmp); |
6028 | 6070 | ||
6029 | update_top_cache_domain(cpu); | 6071 | update_top_cache_domain(cpu); |
6030 | } | 6072 | } |
@@ -6267,7 +6309,6 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) | |||
6267 | return; | 6309 | return; |
6268 | 6310 | ||
6269 | update_group_capacity(sd, cpu); | 6311 | update_group_capacity(sd, cpu); |
6270 | atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); | ||
6271 | } | 6312 | } |
6272 | 6313 | ||
6273 | /* | 6314 | /* |
@@ -6355,6 +6396,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd) | |||
6355 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); | 6396 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); |
6356 | *per_cpu_ptr(sdd->sd, cpu) = NULL; | 6397 | *per_cpu_ptr(sdd->sd, cpu) = NULL; |
6357 | 6398 | ||
6399 | if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) | ||
6400 | *per_cpu_ptr(sdd->sds, cpu) = NULL; | ||
6401 | |||
6358 | if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) | 6402 | if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) |
6359 | *per_cpu_ptr(sdd->sg, cpu) = NULL; | 6403 | *per_cpu_ptr(sdd->sg, cpu) = NULL; |
6360 | 6404 | ||
@@ -6374,26 +6418,37 @@ static int sched_domains_curr_level; | |||
6374 | /* | 6418 | /* |
6375 | * SD_flags allowed in topology descriptions. | 6419 | * SD_flags allowed in topology descriptions. |
6376 | * | 6420 | * |
6377 | * SD_SHARE_CPUCAPACITY - describes SMT topologies | 6421 | * These flags are purely descriptive of the topology and do not prescribe |
6378 | * SD_SHARE_PKG_RESOURCES - describes shared caches | 6422 | * behaviour. Behaviour is artificial and mapped in the below sd_init() |
6379 | * SD_NUMA - describes NUMA topologies | 6423 | * function: |
6380 | * SD_SHARE_POWERDOMAIN - describes shared power domain | 6424 | * |
6425 | * SD_SHARE_CPUCAPACITY - describes SMT topologies | ||
6426 | * SD_SHARE_PKG_RESOURCES - describes shared caches | ||
6427 | * SD_NUMA - describes NUMA topologies | ||
6428 | * SD_SHARE_POWERDOMAIN - describes shared power domain | ||
6429 | * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies | ||
6381 | * | 6430 | * |
6382 | * Odd one out: | 6431 | * Odd one out, which beside describing the topology has a quirk also |
6383 | * SD_ASYM_PACKING - describes SMT quirks | 6432 | * prescribes the desired behaviour that goes along with it: |
6433 | * | ||
6434 | * SD_ASYM_PACKING - describes SMT quirks | ||
6384 | */ | 6435 | */ |
6385 | #define TOPOLOGY_SD_FLAGS \ | 6436 | #define TOPOLOGY_SD_FLAGS \ |
6386 | (SD_SHARE_CPUCAPACITY | \ | 6437 | (SD_SHARE_CPUCAPACITY | \ |
6387 | SD_SHARE_PKG_RESOURCES | \ | 6438 | SD_SHARE_PKG_RESOURCES | \ |
6388 | SD_NUMA | \ | 6439 | SD_NUMA | \ |
6389 | SD_ASYM_PACKING | \ | 6440 | SD_ASYM_PACKING | \ |
6441 | SD_ASYM_CPUCAPACITY | \ | ||
6390 | SD_SHARE_POWERDOMAIN) | 6442 | SD_SHARE_POWERDOMAIN) |
6391 | 6443 | ||
6392 | static struct sched_domain * | 6444 | static struct sched_domain * |
6393 | sd_init(struct sched_domain_topology_level *tl, int cpu) | 6445 | sd_init(struct sched_domain_topology_level *tl, |
6446 | const struct cpumask *cpu_map, | ||
6447 | struct sched_domain *child, int cpu) | ||
6394 | { | 6448 | { |
6395 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); | 6449 | struct sd_data *sdd = &tl->data; |
6396 | int sd_weight, sd_flags = 0; | 6450 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); |
6451 | int sd_id, sd_weight, sd_flags = 0; | ||
6397 | 6452 | ||
6398 | #ifdef CONFIG_NUMA | 6453 | #ifdef CONFIG_NUMA |
6399 | /* | 6454 | /* |
@@ -6442,15 +6497,26 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) | |||
6442 | .smt_gain = 0, | 6497 | .smt_gain = 0, |
6443 | .max_newidle_lb_cost = 0, | 6498 | .max_newidle_lb_cost = 0, |
6444 | .next_decay_max_lb_cost = jiffies, | 6499 | .next_decay_max_lb_cost = jiffies, |
6500 | .child = child, | ||
6445 | #ifdef CONFIG_SCHED_DEBUG | 6501 | #ifdef CONFIG_SCHED_DEBUG |
6446 | .name = tl->name, | 6502 | .name = tl->name, |
6447 | #endif | 6503 | #endif |
6448 | }; | 6504 | }; |
6449 | 6505 | ||
6506 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | ||
6507 | sd_id = cpumask_first(sched_domain_span(sd)); | ||
6508 | |||
6450 | /* | 6509 | /* |
6451 | * Convert topological properties into behaviour. | 6510 | * Convert topological properties into behaviour. |
6452 | */ | 6511 | */ |
6453 | 6512 | ||
6513 | if (sd->flags & SD_ASYM_CPUCAPACITY) { | ||
6514 | struct sched_domain *t = sd; | ||
6515 | |||
6516 | for_each_lower_domain(t) | ||
6517 | t->flags |= SD_BALANCE_WAKE; | ||
6518 | } | ||
6519 | |||
6454 | if (sd->flags & SD_SHARE_CPUCAPACITY) { | 6520 | if (sd->flags & SD_SHARE_CPUCAPACITY) { |
6455 | sd->flags |= SD_PREFER_SIBLING; | 6521 | sd->flags |= SD_PREFER_SIBLING; |
6456 | sd->imbalance_pct = 110; | 6522 | sd->imbalance_pct = 110; |
@@ -6482,7 +6548,17 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) | |||
6482 | sd->idle_idx = 1; | 6548 | sd->idle_idx = 1; |
6483 | } | 6549 | } |
6484 | 6550 | ||
6485 | sd->private = &tl->data; | 6551 | /* |
6552 | * For all levels sharing cache; connect a sched_domain_shared | ||
6553 | * instance. | ||
6554 | */ | ||
6555 | if (sd->flags & SD_SHARE_PKG_RESOURCES) { | ||
6556 | sd->shared = *per_cpu_ptr(sdd->sds, sd_id); | ||
6557 | atomic_inc(&sd->shared->ref); | ||
6558 | atomic_set(&sd->shared->nr_busy_cpus, sd_weight); | ||
6559 | } | ||
6560 | |||
6561 | sd->private = sdd; | ||
6486 | 6562 | ||
6487 | return sd; | 6563 | return sd; |
6488 | } | 6564 | } |
@@ -6509,6 +6585,9 @@ static struct sched_domain_topology_level *sched_domain_topology = | |||
6509 | 6585 | ||
6510 | void set_sched_topology(struct sched_domain_topology_level *tl) | 6586 | void set_sched_topology(struct sched_domain_topology_level *tl) |
6511 | { | 6587 | { |
6588 | if (WARN_ON_ONCE(sched_smp_initialized)) | ||
6589 | return; | ||
6590 | |||
6512 | sched_domain_topology = tl; | 6591 | sched_domain_topology = tl; |
6513 | } | 6592 | } |
6514 | 6593 | ||
@@ -6789,6 +6868,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
6789 | if (!sdd->sd) | 6868 | if (!sdd->sd) |
6790 | return -ENOMEM; | 6869 | return -ENOMEM; |
6791 | 6870 | ||
6871 | sdd->sds = alloc_percpu(struct sched_domain_shared *); | ||
6872 | if (!sdd->sds) | ||
6873 | return -ENOMEM; | ||
6874 | |||
6792 | sdd->sg = alloc_percpu(struct sched_group *); | 6875 | sdd->sg = alloc_percpu(struct sched_group *); |
6793 | if (!sdd->sg) | 6876 | if (!sdd->sg) |
6794 | return -ENOMEM; | 6877 | return -ENOMEM; |
@@ -6799,6 +6882,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
6799 | 6882 | ||
6800 | for_each_cpu(j, cpu_map) { | 6883 | for_each_cpu(j, cpu_map) { |
6801 | struct sched_domain *sd; | 6884 | struct sched_domain *sd; |
6885 | struct sched_domain_shared *sds; | ||
6802 | struct sched_group *sg; | 6886 | struct sched_group *sg; |
6803 | struct sched_group_capacity *sgc; | 6887 | struct sched_group_capacity *sgc; |
6804 | 6888 | ||
@@ -6809,6 +6893,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
6809 | 6893 | ||
6810 | *per_cpu_ptr(sdd->sd, j) = sd; | 6894 | *per_cpu_ptr(sdd->sd, j) = sd; |
6811 | 6895 | ||
6896 | sds = kzalloc_node(sizeof(struct sched_domain_shared), | ||
6897 | GFP_KERNEL, cpu_to_node(j)); | ||
6898 | if (!sds) | ||
6899 | return -ENOMEM; | ||
6900 | |||
6901 | *per_cpu_ptr(sdd->sds, j) = sds; | ||
6902 | |||
6812 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | 6903 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), |
6813 | GFP_KERNEL, cpu_to_node(j)); | 6904 | GFP_KERNEL, cpu_to_node(j)); |
6814 | if (!sg) | 6905 | if (!sg) |
@@ -6848,6 +6939,8 @@ static void __sdt_free(const struct cpumask *cpu_map) | |||
6848 | kfree(*per_cpu_ptr(sdd->sd, j)); | 6939 | kfree(*per_cpu_ptr(sdd->sd, j)); |
6849 | } | 6940 | } |
6850 | 6941 | ||
6942 | if (sdd->sds) | ||
6943 | kfree(*per_cpu_ptr(sdd->sds, j)); | ||
6851 | if (sdd->sg) | 6944 | if (sdd->sg) |
6852 | kfree(*per_cpu_ptr(sdd->sg, j)); | 6945 | kfree(*per_cpu_ptr(sdd->sg, j)); |
6853 | if (sdd->sgc) | 6946 | if (sdd->sgc) |
@@ -6855,6 +6948,8 @@ static void __sdt_free(const struct cpumask *cpu_map) | |||
6855 | } | 6948 | } |
6856 | free_percpu(sdd->sd); | 6949 | free_percpu(sdd->sd); |
6857 | sdd->sd = NULL; | 6950 | sdd->sd = NULL; |
6951 | free_percpu(sdd->sds); | ||
6952 | sdd->sds = NULL; | ||
6858 | free_percpu(sdd->sg); | 6953 | free_percpu(sdd->sg); |
6859 | sdd->sg = NULL; | 6954 | sdd->sg = NULL; |
6860 | free_percpu(sdd->sgc); | 6955 | free_percpu(sdd->sgc); |
@@ -6866,16 +6961,12 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
6866 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 6961 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, |
6867 | struct sched_domain *child, int cpu) | 6962 | struct sched_domain *child, int cpu) |
6868 | { | 6963 | { |
6869 | struct sched_domain *sd = sd_init(tl, cpu); | 6964 | struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); |
6870 | if (!sd) | ||
6871 | return child; | ||
6872 | 6965 | ||
6873 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | ||
6874 | if (child) { | 6966 | if (child) { |
6875 | sd->level = child->level + 1; | 6967 | sd->level = child->level + 1; |
6876 | sched_domain_level_max = max(sched_domain_level_max, sd->level); | 6968 | sched_domain_level_max = max(sched_domain_level_max, sd->level); |
6877 | child->parent = sd; | 6969 | child->parent = sd; |
6878 | sd->child = child; | ||
6879 | 6970 | ||
6880 | if (!cpumask_subset(sched_domain_span(child), | 6971 | if (!cpumask_subset(sched_domain_span(child), |
6881 | sched_domain_span(sd))) { | 6972 | sched_domain_span(sd))) { |
@@ -6906,6 +6997,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, | |||
6906 | enum s_alloc alloc_state; | 6997 | enum s_alloc alloc_state; |
6907 | struct sched_domain *sd; | 6998 | struct sched_domain *sd; |
6908 | struct s_data d; | 6999 | struct s_data d; |
7000 | struct rq *rq = NULL; | ||
6909 | int i, ret = -ENOMEM; | 7001 | int i, ret = -ENOMEM; |
6910 | 7002 | ||
6911 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); | 7003 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); |
@@ -6956,11 +7048,22 @@ static int build_sched_domains(const struct cpumask *cpu_map, | |||
6956 | /* Attach the domains */ | 7048 | /* Attach the domains */ |
6957 | rcu_read_lock(); | 7049 | rcu_read_lock(); |
6958 | for_each_cpu(i, cpu_map) { | 7050 | for_each_cpu(i, cpu_map) { |
7051 | rq = cpu_rq(i); | ||
6959 | sd = *per_cpu_ptr(d.sd, i); | 7052 | sd = *per_cpu_ptr(d.sd, i); |
7053 | |||
7054 | /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ | ||
7055 | if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) | ||
7056 | WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); | ||
7057 | |||
6960 | cpu_attach_domain(sd, d.rd, i); | 7058 | cpu_attach_domain(sd, d.rd, i); |
6961 | } | 7059 | } |
6962 | rcu_read_unlock(); | 7060 | rcu_read_unlock(); |
6963 | 7061 | ||
7062 | if (rq && sched_debug_enabled) { | ||
7063 | pr_info("span: %*pbl (max cpu_capacity = %lu)\n", | ||
7064 | cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); | ||
7065 | } | ||
7066 | |||
6964 | ret = 0; | 7067 | ret = 0; |
6965 | error: | 7068 | error: |
6966 | __free_domain_allocs(&d, alloc_state, cpu_map); | 7069 | __free_domain_allocs(&d, alloc_state, cpu_map); |
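[Editor's note] The max_cpu_capacity update above uses READ_ONCE()/WRITE_ONCE() so the writer and any unlocked reader exchange whole word-sized values instead of torn halves. A stripped-down user-space imitation of the idiom (the volatile-cast macros here are a simplification shown only to illustrate the pattern, not the kernel's definitions):

    #include <stdio.h>

    /* Simplified stand-ins for the kernel's READ_ONCE()/WRITE_ONCE(). */
    #define READ_ONCE(x)		(*(volatile __typeof__(x) *)&(x))
    #define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))

    static unsigned long max_capacity;	/* shared, read without a lock elsewhere */

    /* Record a new maximum, forcing single whole-word loads and stores. */
    static void note_capacity(unsigned long cap)
    {
    	if (cap > READ_ONCE(max_capacity))
    		WRITE_ONCE(max_capacity, cap);
    }

    int main(void)
    {
    	unsigned long caps[] = { 446, 1024, 871 };

    	for (int i = 0; i < 3; i++)
    		note_capacity(caps[i]);

    	printf("max_cpu_capacity = %lu\n", READ_ONCE(max_capacity));
    	return 0;
    }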
@@ -7319,6 +7422,22 @@ int sched_cpu_dying(unsigned int cpu) | |||
7319 | } | 7422 | } |
7320 | #endif | 7423 | #endif |
7321 | 7424 | ||
7425 | #ifdef CONFIG_SCHED_SMT | ||
7426 | DEFINE_STATIC_KEY_FALSE(sched_smt_present); | ||
7427 | |||
7428 | static void sched_init_smt(void) | ||
7429 | { | ||
7430 | /* | ||
7431 | * We've enumerated all CPUs and will assume that if any CPU | ||
7432 | * has SMT siblings, CPU0 will too. | ||
7433 | */ | ||
7434 | if (cpumask_weight(cpu_smt_mask(0)) > 1) | ||
7435 | static_branch_enable(&sched_smt_present); | ||
7436 | } | ||
7437 | #else | ||
7438 | static inline void sched_init_smt(void) { } | ||
7439 | #endif | ||
7440 | |||
7322 | void __init sched_init_smp(void) | 7441 | void __init sched_init_smp(void) |
7323 | { | 7442 | { |
7324 | cpumask_var_t non_isolated_cpus; | 7443 | cpumask_var_t non_isolated_cpus; |
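[Editor's note] sched_init_smt() above flips a static key once CPU enumeration is done, so later fast paths can test "does this machine have SMT at all?" almost for free (static keys patch the branch in the instruction stream). A plain user-space analogue of the "decide once at init, branch on it later" shape; the branch patching itself has no portable C equivalent, so an ordinary bool and an assumed sibling count stand in:

    #include <stdbool.h>
    #include <stdio.h>

    static bool smt_present;	/* analogue of the sched_smt_present static key */

    /* Pretend topology enumeration: hardware threads sharing core 0 (assumed). */
    static int smt_siblings_of_cpu0(void)
    {
    	return 2;
    }

    static void sched_init_smt_like(void)
    {
    	/* Assume that if any CPU has SMT siblings, CPU0 does too. */
    	if (smt_siblings_of_cpu0() > 1)
    		smt_present = true;
    }

    int main(void)
    {
    	sched_init_smt_like();

    	/* Later fast paths can skip sibling scans entirely on non-SMT machines. */
    	if (smt_present)
    		printf("SMT present: idle-sibling search is worth doing\n");
    	else
    		printf("no SMT: skip the sibling scan\n");
    	return 0;
    }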
@@ -7348,6 +7467,9 @@ void __init sched_init_smp(void) | |||
7348 | 7467 | ||
7349 | init_sched_rt_class(); | 7468 | init_sched_rt_class(); |
7350 | init_sched_dl_class(); | 7469 | init_sched_dl_class(); |
7470 | |||
7471 | sched_init_smt(); | ||
7472 | |||
7351 | sched_smp_initialized = true; | 7473 | sched_smp_initialized = true; |
7352 | } | 7474 | } |
7353 | 7475 | ||
@@ -7385,12 +7507,29 @@ static struct kmem_cache *task_group_cache __read_mostly; | |||
7385 | #endif | 7507 | #endif |
7386 | 7508 | ||
7387 | DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); | 7509 | DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); |
7510 | DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); | ||
7511 | |||
7512 | #define WAIT_TABLE_BITS 8 | ||
7513 | #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) | ||
7514 | static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; | ||
7515 | |||
7516 | wait_queue_head_t *bit_waitqueue(void *word, int bit) | ||
7517 | { | ||
7518 | const int shift = BITS_PER_LONG == 32 ? 5 : 6; | ||
7519 | unsigned long val = (unsigned long)word << shift | bit; | ||
7520 | |||
7521 | return bit_wait_table + hash_long(val, WAIT_TABLE_BITS); | ||
7522 | } | ||
7523 | EXPORT_SYMBOL(bit_waitqueue); | ||
7388 | 7524 | ||
7389 | void __init sched_init(void) | 7525 | void __init sched_init(void) |
7390 | { | 7526 | { |
7391 | int i, j; | 7527 | int i, j; |
7392 | unsigned long alloc_size = 0, ptr; | 7528 | unsigned long alloc_size = 0, ptr; |
7393 | 7529 | ||
7530 | for (i = 0; i < WAIT_TABLE_SIZE; i++) | ||
7531 | init_waitqueue_head(bit_wait_table + i); | ||
7532 | |||
7394 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7533 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7395 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); | 7534 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); |
7396 | #endif | 7535 | #endif |
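[Editor's note] bit_waitqueue() above folds the word's address and the bit number into one value and hashes it into a fixed table of 256 wait queues, so unrelated waiters rarely collide. A small user-space sketch of that mapping; the multiplicative hash stands in for the kernel's hash_long(), and the table merely counts hits instead of holding wait queues:

    #include <stdint.h>
    #include <stdio.h>

    #define TABLE_BITS 8
    #define TABLE_SIZE (1 << TABLE_BITS)

    static unsigned int bucket_hits[TABLE_SIZE];	/* stand-in for the waitqueue array */

    /* Multiplicative hash standing in for the kernel's hash_long(). */
    static unsigned int hash64(uint64_t val, unsigned int bits)
    {
    	return (unsigned int)((val * UINT64_C(0x9E3779B97F4A7C15)) >> (64 - bits));
    }

    /* Fold (word address, bit index) into one bucket, as bit_waitqueue() does. */
    static unsigned int bit_bucket(void *word, int bit)
    {
    	const int shift = sizeof(long) == 4 ? 5 : 6;	/* log2(bits per long) */
    	uint64_t val = ((uint64_t)(uintptr_t)word << shift) | (uint64_t)bit;

    	return hash64(val, TABLE_BITS);
    }

    int main(void)
    {
    	static unsigned long flags[4];

    	for (int i = 0; i < 4; i++)
    		for (int bit = 0; bit < 3; bit++)
    			bucket_hits[bit_bucket(&flags[i], bit)]++;

    	for (int b = 0; b < TABLE_SIZE; b++)
    		if (bucket_hits[b])
    			printf("bucket %3d: %u waiter(s)\n", b, bucket_hits[b]);
    	return 0;
    }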
@@ -7421,6 +7560,8 @@ void __init sched_init(void) | |||
7421 | for_each_possible_cpu(i) { | 7560 | for_each_possible_cpu(i) { |
7422 | per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( | 7561 | per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( |
7423 | cpumask_size(), GFP_KERNEL, cpu_to_node(i)); | 7562 | cpumask_size(), GFP_KERNEL, cpu_to_node(i)); |
7563 | per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node( | ||
7564 | cpumask_size(), GFP_KERNEL, cpu_to_node(i)); | ||
7424 | } | 7565 | } |
7425 | #endif /* CONFIG_CPUMASK_OFFSTACK */ | 7566 | #endif /* CONFIG_CPUMASK_OFFSTACK */ |
7426 | 7567 | ||
@@ -7523,10 +7664,6 @@ void __init sched_init(void) | |||
7523 | 7664 | ||
7524 | set_load_weight(&init_task); | 7665 | set_load_weight(&init_task); |
7525 | 7666 | ||
7526 | #ifdef CONFIG_PREEMPT_NOTIFIERS | ||
7527 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); | ||
7528 | #endif | ||
7529 | |||
7530 | /* | 7667 | /* |
7531 | * The boot idle thread does lazy MMU switching as well: | 7668 | * The boot idle thread does lazy MMU switching as well: |
7532 | */ | 7669 | */ |
@@ -7534,11 +7671,6 @@ void __init sched_init(void) | |||
7534 | enter_lazy_tlb(&init_mm, current); | 7671 | enter_lazy_tlb(&init_mm, current); |
7535 | 7672 | ||
7536 | /* | 7673 | /* |
7537 | * During early bootup we pretend to be a normal task: | ||
7538 | */ | ||
7539 | current->sched_class = &fair_sched_class; | ||
7540 | |||
7541 | /* | ||
7542 | * Make us the idle thread. Technically, schedule() should not be | 7674 | * Make us the idle thread. Technically, schedule() should not be |
7543 | * called from this thread, however somewhere below it might be, | 7675 | * called from this thread, however somewhere below it might be, |
7544 | * but because we are the idle thread, we just pick up running again | 7676 | * but because we are the idle thread, we just pick up running again |
@@ -7592,6 +7724,7 @@ EXPORT_SYMBOL(__might_sleep); | |||
7592 | void ___might_sleep(const char *file, int line, int preempt_offset) | 7724 | void ___might_sleep(const char *file, int line, int preempt_offset) |
7593 | { | 7725 | { |
7594 | static unsigned long prev_jiffy; /* ratelimiting */ | 7726 | static unsigned long prev_jiffy; /* ratelimiting */ |
7727 | unsigned long preempt_disable_ip; | ||
7595 | 7728 | ||
7596 | rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ | 7729 | rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ |
7597 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && | 7730 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && |
@@ -7602,6 +7735,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset) | |||
7602 | return; | 7735 | return; |
7603 | prev_jiffy = jiffies; | 7736 | prev_jiffy = jiffies; |
7604 | 7737 | ||
7738 | /* Save this before calling printk(), since that will clobber it */ | ||
7739 | preempt_disable_ip = get_preempt_disable_ip(current); | ||
7740 | |||
7605 | printk(KERN_ERR | 7741 | printk(KERN_ERR |
7606 | "BUG: sleeping function called from invalid context at %s:%d\n", | 7742 | "BUG: sleeping function called from invalid context at %s:%d\n", |
7607 | file, line); | 7743 | file, line); |
@@ -7616,14 +7752,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset) | |||
7616 | debug_show_held_locks(current); | 7752 | debug_show_held_locks(current); |
7617 | if (irqs_disabled()) | 7753 | if (irqs_disabled()) |
7618 | print_irqtrace_events(current); | 7754 | print_irqtrace_events(current); |
7619 | #ifdef CONFIG_DEBUG_PREEMPT | 7755 | if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) |
7620 | if (!preempt_count_equals(preempt_offset)) { | 7756 | && !preempt_count_equals(preempt_offset)) { |
7621 | pr_err("Preemption disabled at:"); | 7757 | pr_err("Preemption disabled at:"); |
7622 | print_ip_sym(current->preempt_disable_ip); | 7758 | print_ip_sym(preempt_disable_ip); |
7623 | pr_cont("\n"); | 7759 | pr_cont("\n"); |
7624 | } | 7760 | } |
7625 | #endif | ||
7626 | dump_stack(); | 7761 | dump_stack(); |
7762 | add_taint(TAINT_WARN, LOCKDEP_STILL_OK); | ||
7627 | } | 7763 | } |
7628 | EXPORT_SYMBOL(___might_sleep); | 7764 | EXPORT_SYMBOL(___might_sleep); |
7629 | #endif | 7765 | #endif |
@@ -7644,12 +7780,10 @@ void normalize_rt_tasks(void) | |||
7644 | if (p->flags & PF_KTHREAD) | 7780 | if (p->flags & PF_KTHREAD) |
7645 | continue; | 7781 | continue; |
7646 | 7782 | ||
7647 | p->se.exec_start = 0; | 7783 | p->se.exec_start = 0; |
7648 | #ifdef CONFIG_SCHEDSTATS | 7784 | schedstat_set(p->se.statistics.wait_start, 0); |
7649 | p->se.statistics.wait_start = 0; | 7785 | schedstat_set(p->se.statistics.sleep_start, 0); |
7650 | p->se.statistics.sleep_start = 0; | 7786 | schedstat_set(p->se.statistics.block_start, 0); |
7651 | p->se.statistics.block_start = 0; | ||
7652 | #endif | ||
7653 | 7787 | ||
7654 | if (!dl_task(p) && !rt_task(p)) { | 7788 | if (!dl_task(p) && !rt_task(p)) { |
7655 | /* | 7789 | /* |
@@ -7710,7 +7844,7 @@ struct task_struct *curr_task(int cpu) | |||
7710 | * | 7844 | * |
7711 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 7845 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
7712 | */ | 7846 | */ |
7713 | void set_curr_task(int cpu, struct task_struct *p) | 7847 | void ia64_set_curr_task(int cpu, struct task_struct *p) |
7714 | { | 7848 | { |
7715 | cpu_curr(cpu) = p; | 7849 | cpu_curr(cpu) = p; |
7716 | } | 7850 | } |
@@ -7841,10 +7975,10 @@ void sched_move_task(struct task_struct *tsk) | |||
7841 | 7975 | ||
7842 | sched_change_group(tsk, TASK_MOVE_GROUP); | 7976 | sched_change_group(tsk, TASK_MOVE_GROUP); |
7843 | 7977 | ||
7844 | if (unlikely(running)) | ||
7845 | tsk->sched_class->set_curr_task(rq); | ||
7846 | if (queued) | 7978 | if (queued) |
7847 | enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); | 7979 | enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); |
7980 | if (unlikely(running)) | ||
7981 | set_curr_task(rq, tsk); | ||
7848 | 7982 | ||
7849 | task_rq_unlock(rq, tsk, &rf); | 7983 | task_rq_unlock(rq, tsk, &rf); |
7850 | } | 7984 | } |
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index d4184498c9f5..e73119013c53 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -31,56 +31,81 @@ static inline int right_child(int i) | |||
31 | return (i << 1) + 2; | 31 | return (i << 1) + 2; |
32 | } | 32 | } |
33 | 33 | ||
34 | static void cpudl_exchange(struct cpudl *cp, int a, int b) | 34 | static void cpudl_heapify_down(struct cpudl *cp, int idx) |
35 | { | 35 | { |
36 | int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; | 36 | int l, r, largest; |
37 | 37 | ||
38 | swap(cp->elements[a].cpu, cp->elements[b].cpu); | 38 | int orig_cpu = cp->elements[idx].cpu; |
39 | swap(cp->elements[a].dl , cp->elements[b].dl ); | 39 | u64 orig_dl = cp->elements[idx].dl; |
40 | 40 | ||
41 | swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx); | 41 | if (left_child(idx) >= cp->size) |
42 | } | 42 | return; |
43 | |||
44 | static void cpudl_heapify(struct cpudl *cp, int idx) | ||
45 | { | ||
46 | int l, r, largest; | ||
47 | 43 | ||
48 | /* adapted from lib/prio_heap.c */ | 44 | /* adapted from lib/prio_heap.c */ |
49 | while(1) { | 45 | while(1) { |
46 | u64 largest_dl; | ||
50 | l = left_child(idx); | 47 | l = left_child(idx); |
51 | r = right_child(idx); | 48 | r = right_child(idx); |
52 | largest = idx; | 49 | largest = idx; |
50 | largest_dl = orig_dl; | ||
53 | 51 | ||
54 | if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, | 52 | if ((l < cp->size) && dl_time_before(orig_dl, |
55 | cp->elements[l].dl)) | 53 | cp->elements[l].dl)) { |
56 | largest = l; | 54 | largest = l; |
57 | if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, | 55 | largest_dl = cp->elements[l].dl; |
58 | cp->elements[r].dl)) | 56 | } |
57 | if ((r < cp->size) && dl_time_before(largest_dl, | ||
58 | cp->elements[r].dl)) | ||
59 | largest = r; | 59 | largest = r; |
60 | |||
60 | if (largest == idx) | 61 | if (largest == idx) |
61 | break; | 62 | break; |
62 | 63 | ||
63 | /* Push idx down the heap one level and bump one up */ | 64 | /* pull largest child onto idx */ |
64 | cpudl_exchange(cp, largest, idx); | 65 | cp->elements[idx].cpu = cp->elements[largest].cpu; |
66 | cp->elements[idx].dl = cp->elements[largest].dl; | ||
67 | cp->elements[cp->elements[idx].cpu].idx = idx; | ||
65 | idx = largest; | 68 | idx = largest; |
66 | } | 69 | } |
70 | /* actual push down of saved original values orig_* */ | ||
71 | cp->elements[idx].cpu = orig_cpu; | ||
72 | cp->elements[idx].dl = orig_dl; | ||
73 | cp->elements[cp->elements[idx].cpu].idx = idx; | ||
67 | } | 74 | } |
68 | 75 | ||
69 | static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) | 76 | static void cpudl_heapify_up(struct cpudl *cp, int idx) |
70 | { | 77 | { |
71 | WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); | 78 | int p; |
72 | 79 | ||
73 | if (dl_time_before(new_dl, cp->elements[idx].dl)) { | 80 | int orig_cpu = cp->elements[idx].cpu; |
74 | cp->elements[idx].dl = new_dl; | 81 | u64 orig_dl = cp->elements[idx].dl; |
75 | cpudl_heapify(cp, idx); | 82 | |
76 | } else { | 83 | if (idx == 0) |
77 | cp->elements[idx].dl = new_dl; | 84 | return; |
78 | while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, | 85 | |
79 | cp->elements[idx].dl)) { | 86 | do { |
80 | cpudl_exchange(cp, idx, parent(idx)); | 87 | p = parent(idx); |
81 | idx = parent(idx); | 88 | if (dl_time_before(orig_dl, cp->elements[p].dl)) |
82 | } | 89 | break; |
83 | } | 90 | /* pull parent onto idx */ |
91 | cp->elements[idx].cpu = cp->elements[p].cpu; | ||
92 | cp->elements[idx].dl = cp->elements[p].dl; | ||
93 | cp->elements[cp->elements[idx].cpu].idx = idx; | ||
94 | idx = p; | ||
95 | } while (idx != 0); | ||
96 | /* actual push up of saved original values orig_* */ | ||
97 | cp->elements[idx].cpu = orig_cpu; | ||
98 | cp->elements[idx].dl = orig_dl; | ||
99 | cp->elements[cp->elements[idx].cpu].idx = idx; | ||
100 | } | ||
101 | |||
102 | static void cpudl_heapify(struct cpudl *cp, int idx) | ||
103 | { | ||
104 | if (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, | ||
105 | cp->elements[idx].dl)) | ||
106 | cpudl_heapify_up(cp, idx); | ||
107 | else | ||
108 | cpudl_heapify_down(cp, idx); | ||
84 | } | 109 | } |
85 | 110 | ||
86 | static inline int cpudl_maximum(struct cpudl *cp) | 111 | static inline int cpudl_maximum(struct cpudl *cp) |
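[Editor's note] The rewritten cpudl_heapify_down()/cpudl_heapify_up() above avoid the old swap-per-level cost: they save the displaced element once, slide children (or parents) into the hole, and write the saved element back with a single final store. The same trick on a plain array max-heap, as a user-space sketch (no cpu/idx back-pointers here, just the sift itself):

    #include <stdio.h>

    /* Sift the element at idx down a max-heap of 'size' elements, writing the
     * displaced value only once at the end instead of swapping at every level. */
    static void heapify_down(unsigned long heap[], int size, int idx)
    {
    	unsigned long orig = heap[idx];

    	for (;;) {
    		int l = 2 * idx + 1;
    		int r = 2 * idx + 2;
    		int largest = idx;
    		unsigned long largest_val = orig;

    		if (l < size && heap[l] > largest_val) {
    			largest = l;
    			largest_val = heap[l];
    		}
    		if (r < size && heap[r] > largest_val)
    			largest = r;
    		if (largest == idx)
    			break;

    		heap[idx] = heap[largest];	/* pull the bigger child up */
    		idx = largest;
    	}
    	heap[idx] = orig;			/* single final store */
    }

    int main(void)
    {
    	unsigned long heap[] = { 3, 90, 80, 50, 40, 70, 60 };	/* root out of place */
    	int size = sizeof(heap) / sizeof(heap[0]);

    	heapify_down(heap, size, 0);

    	for (int i = 0; i < size; i++)
    		printf("%lu ", heap[i]);
    	printf("\n");
    	return 0;
    }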
@@ -120,16 +145,15 @@ out: | |||
120 | } | 145 | } |
121 | 146 | ||
122 | /* | 147 | /* |
123 | * cpudl_set - update the cpudl max-heap | 148 | * cpudl_clear - remove a cpu from the cpudl max-heap |
124 | * @cp: the cpudl max-heap context | 149 | * @cp: the cpudl max-heap context |
125 | * @cpu: the target cpu | 150 | * @cpu: the target cpu |
126 | * @dl: the new earliest deadline for this cpu | ||
127 | * | 151 | * |
128 | * Notes: assumes cpu_rq(cpu)->lock is locked | 152 | * Notes: assumes cpu_rq(cpu)->lock is locked |
129 | * | 153 | * |
130 | * Returns: (void) | 154 | * Returns: (void) |
131 | */ | 155 | */ |
132 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) | 156 | void cpudl_clear(struct cpudl *cp, int cpu) |
133 | { | 157 | { |
134 | int old_idx, new_cpu; | 158 | int old_idx, new_cpu; |
135 | unsigned long flags; | 159 | unsigned long flags; |
@@ -137,47 +161,60 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) | |||
137 | WARN_ON(!cpu_present(cpu)); | 161 | WARN_ON(!cpu_present(cpu)); |
138 | 162 | ||
139 | raw_spin_lock_irqsave(&cp->lock, flags); | 163 | raw_spin_lock_irqsave(&cp->lock, flags); |
164 | |||
140 | old_idx = cp->elements[cpu].idx; | 165 | old_idx = cp->elements[cpu].idx; |
141 | if (!is_valid) { | 166 | if (old_idx == IDX_INVALID) { |
142 | /* remove item */ | 167 | /* |
143 | if (old_idx == IDX_INVALID) { | 168 | * Nothing to remove if old_idx was invalid. |
144 | /* | 169 | * This could happen if a rq_offline_dl is |
145 | * Nothing to remove if old_idx was invalid. | 170 | * called for a CPU without -dl tasks running. |
146 | * This could happen if a rq_offline_dl is | 171 | */ |
147 | * called for a CPU without -dl tasks running. | 172 | } else { |
148 | */ | ||
149 | goto out; | ||
150 | } | ||
151 | new_cpu = cp->elements[cp->size - 1].cpu; | 173 | new_cpu = cp->elements[cp->size - 1].cpu; |
152 | cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; | 174 | cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; |
153 | cp->elements[old_idx].cpu = new_cpu; | 175 | cp->elements[old_idx].cpu = new_cpu; |
154 | cp->size--; | 176 | cp->size--; |
155 | cp->elements[new_cpu].idx = old_idx; | 177 | cp->elements[new_cpu].idx = old_idx; |
156 | cp->elements[cpu].idx = IDX_INVALID; | 178 | cp->elements[cpu].idx = IDX_INVALID; |
157 | while (old_idx > 0 && dl_time_before( | 179 | cpudl_heapify(cp, old_idx); |
158 | cp->elements[parent(old_idx)].dl, | ||
159 | cp->elements[old_idx].dl)) { | ||
160 | cpudl_exchange(cp, old_idx, parent(old_idx)); | ||
161 | old_idx = parent(old_idx); | ||
162 | } | ||
163 | cpumask_set_cpu(cpu, cp->free_cpus); | ||
164 | cpudl_heapify(cp, old_idx); | ||
165 | 180 | ||
166 | goto out; | 181 | cpumask_set_cpu(cpu, cp->free_cpus); |
167 | } | 182 | } |
183 | raw_spin_unlock_irqrestore(&cp->lock, flags); | ||
184 | } | ||
185 | |||
186 | /* | ||
187 | * cpudl_set - update the cpudl max-heap | ||
188 | * @cp: the cpudl max-heap context | ||
189 | * @cpu: the target cpu | ||
190 | * @dl: the new earliest deadline for this cpu | ||
191 | * | ||
192 | * Notes: assumes cpu_rq(cpu)->lock is locked | ||
193 | * | ||
194 | * Returns: (void) | ||
195 | */ | ||
196 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl) | ||
197 | { | ||
198 | int old_idx; | ||
199 | unsigned long flags; | ||
168 | 200 | ||
201 | WARN_ON(!cpu_present(cpu)); | ||
202 | |||
203 | raw_spin_lock_irqsave(&cp->lock, flags); | ||
204 | |||
205 | old_idx = cp->elements[cpu].idx; | ||
169 | if (old_idx == IDX_INVALID) { | 206 | if (old_idx == IDX_INVALID) { |
170 | cp->size++; | 207 | int new_idx = cp->size++; |
171 | cp->elements[cp->size - 1].dl = dl; | 208 | cp->elements[new_idx].dl = dl; |
172 | cp->elements[cp->size - 1].cpu = cpu; | 209 | cp->elements[new_idx].cpu = cpu; |
173 | cp->elements[cpu].idx = cp->size - 1; | 210 | cp->elements[cpu].idx = new_idx; |
174 | cpudl_change_key(cp, cp->size - 1, dl); | 211 | cpudl_heapify_up(cp, new_idx); |
175 | cpumask_clear_cpu(cpu, cp->free_cpus); | 212 | cpumask_clear_cpu(cpu, cp->free_cpus); |
176 | } else { | 213 | } else { |
177 | cpudl_change_key(cp, old_idx, dl); | 214 | cp->elements[old_idx].dl = dl; |
215 | cpudl_heapify(cp, old_idx); | ||
178 | } | 216 | } |
179 | 217 | ||
180 | out: | ||
181 | raw_spin_unlock_irqrestore(&cp->lock, flags); | 218 | raw_spin_unlock_irqrestore(&cp->lock, flags); |
182 | } | 219 | } |
183 | 220 | ||
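With the is_valid parameter gone, callers now pick one of two entry points: cpudl_set() to insert or re-key a CPU's earliest deadline, and cpudl_clear() to drop the CPU from the heap. A hedged sketch of the resulting call pattern, mirroring the deadline.c hunks further down (rq->lock assumed held; the helper name is illustrative):

/* Sketch only: how a -deadline runqueue drives the heap after this change. */
static void example_refresh_cpudl(struct rq *rq)
{
        if (rq->dl.dl_nr_running)
                cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr);
        else
                cpudl_clear(&rq->rd->cpudl, rq->cpu);
}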
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index fcbdf83fed7e..f7da8c55bba0 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h | |||
@@ -23,7 +23,8 @@ struct cpudl { | |||
23 | #ifdef CONFIG_SMP | 23 | #ifdef CONFIG_SMP |
24 | int cpudl_find(struct cpudl *cp, struct task_struct *p, | 24 | int cpudl_find(struct cpudl *cp, struct task_struct *p, |
25 | struct cpumask *later_mask); | 25 | struct cpumask *later_mask); |
26 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); | 26 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl); |
27 | void cpudl_clear(struct cpudl *cp, int cpu); | ||
27 | int cpudl_init(struct cpudl *cp); | 28 | int cpudl_init(struct cpudl *cp); |
28 | void cpudl_set_freecpu(struct cpudl *cp, int cpu); | 29 | void cpudl_set_freecpu(struct cpudl *cp, int cpu); |
29 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu); | 30 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu); |
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 1141954e73b4..dbc51442ecbc 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c | |||
@@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); | |||
33 | */ | 33 | */ |
34 | void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, | 34 | void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, |
35 | void (*func)(struct update_util_data *data, u64 time, | 35 | void (*func)(struct update_util_data *data, u64 time, |
36 | unsigned long util, unsigned long max)) | 36 | unsigned int flags)) |
37 | { | 37 | { |
38 | if (WARN_ON(!data || !func)) | 38 | if (WARN_ON(!data || !func)) |
39 | return; | 39 | return; |
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index a84641b222c1..69e06898997d 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c | |||
@@ -12,7 +12,6 @@ | |||
12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
13 | 13 | ||
14 | #include <linux/cpufreq.h> | 14 | #include <linux/cpufreq.h> |
15 | #include <linux/module.h> | ||
16 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
17 | #include <trace/events/power.h> | 16 | #include <trace/events/power.h> |
18 | 17 | ||
@@ -48,11 +47,14 @@ struct sugov_cpu { | |||
48 | struct sugov_policy *sg_policy; | 47 | struct sugov_policy *sg_policy; |
49 | 48 | ||
50 | unsigned int cached_raw_freq; | 49 | unsigned int cached_raw_freq; |
50 | unsigned long iowait_boost; | ||
51 | unsigned long iowait_boost_max; | ||
52 | u64 last_update; | ||
51 | 53 | ||
52 | /* The fields below are only needed when sharing a policy. */ | 54 | /* The fields below are only needed when sharing a policy. */ |
53 | unsigned long util; | 55 | unsigned long util; |
54 | unsigned long max; | 56 | unsigned long max; |
55 | u64 last_update; | 57 | unsigned int flags; |
56 | }; | 58 | }; |
57 | 59 | ||
58 | static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); | 60 | static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); |
@@ -144,24 +146,75 @@ static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, | |||
144 | return cpufreq_driver_resolve_freq(policy, freq); | 146 | return cpufreq_driver_resolve_freq(policy, freq); |
145 | } | 147 | } |
146 | 148 | ||
149 | static void sugov_get_util(unsigned long *util, unsigned long *max) | ||
150 | { | ||
151 | struct rq *rq = this_rq(); | ||
152 | unsigned long cfs_max; | ||
153 | |||
154 | cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id()); | ||
155 | |||
156 | *util = min(rq->cfs.avg.util_avg, cfs_max); | ||
157 | *max = cfs_max; | ||
158 | } | ||
159 | |||
160 | static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, | ||
161 | unsigned int flags) | ||
162 | { | ||
163 | if (flags & SCHED_CPUFREQ_IOWAIT) { | ||
164 | sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; | ||
165 | } else if (sg_cpu->iowait_boost) { | ||
166 | s64 delta_ns = time - sg_cpu->last_update; | ||
167 | |||
168 | /* Clear iowait_boost if the CPU appears to have been idle. */ | ||
169 | if (delta_ns > TICK_NSEC) | ||
170 | sg_cpu->iowait_boost = 0; | ||
171 | } | ||
172 | } | ||
173 | |||
174 | static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, | ||
175 | unsigned long *max) | ||
176 | { | ||
177 | unsigned long boost_util = sg_cpu->iowait_boost; | ||
178 | unsigned long boost_max = sg_cpu->iowait_boost_max; | ||
179 | |||
180 | if (!boost_util) | ||
181 | return; | ||
182 | |||
183 | if (*util * boost_max < *max * boost_util) { | ||
184 | *util = boost_util; | ||
185 | *max = boost_max; | ||
186 | } | ||
187 | sg_cpu->iowait_boost >>= 1; | ||
188 | } | ||
189 | |||
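The two helpers above implement a simple decaying boost: a wakeup flagged SCHED_CPUFREQ_IOWAIT pins iowait_boost to iowait_boost_max, a tick's worth of apparent idleness clears it, and every use halves it. A rough toy model of that lifecycle (not the kernel implementation; a boost_max of 1024 is assumed):

/* Toy model of the iowait boost lifecycle. */
static unsigned long boost, boost_max = 1024;

static void note_update(int iowait_wakeup, int idle_for_a_tick)
{
        if (iowait_wakeup)
                boost = boost_max;      /* pin to the maximum on an iowait wakeup */
        else if (idle_for_a_tick)
                boost = 0;              /* CPU looked idle: drop the boost */
}

static unsigned long boosted_util(unsigned long util, unsigned long max)
{
        /* cross-multiplied compare of util/max against boost/boost_max */
        if (boost && util * boost_max < max * boost)
                util = (boost * max) / boost_max;
        boost >>= 1;                    /* decay: halve on every use */
        return util;
}

Absent fresh iowait wakeups, successive frequency evaluations therefore see the boost contribute max, max/2, max/4, ... until it decays away or is cleared.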
147 | static void sugov_update_single(struct update_util_data *hook, u64 time, | 190 | static void sugov_update_single(struct update_util_data *hook, u64 time, |
148 | unsigned long util, unsigned long max) | 191 | unsigned int flags) |
149 | { | 192 | { |
150 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); | 193 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); |
151 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; | 194 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; |
152 | struct cpufreq_policy *policy = sg_policy->policy; | 195 | struct cpufreq_policy *policy = sg_policy->policy; |
196 | unsigned long util, max; | ||
153 | unsigned int next_f; | 197 | unsigned int next_f; |
154 | 198 | ||
199 | sugov_set_iowait_boost(sg_cpu, time, flags); | ||
200 | sg_cpu->last_update = time; | ||
201 | |||
155 | if (!sugov_should_update_freq(sg_policy, time)) | 202 | if (!sugov_should_update_freq(sg_policy, time)) |
156 | return; | 203 | return; |
157 | 204 | ||
158 | next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq : | 205 | if (flags & SCHED_CPUFREQ_RT_DL) { |
159 | get_next_freq(sg_cpu, util, max); | 206 | next_f = policy->cpuinfo.max_freq; |
207 | } else { | ||
208 | sugov_get_util(&util, &max); | ||
209 | sugov_iowait_boost(sg_cpu, &util, &max); | ||
210 | next_f = get_next_freq(sg_cpu, util, max); | ||
211 | } | ||
160 | sugov_update_commit(sg_policy, time, next_f); | 212 | sugov_update_commit(sg_policy, time, next_f); |
161 | } | 213 | } |
162 | 214 | ||
163 | static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, | 215 | static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, |
164 | unsigned long util, unsigned long max) | 216 | unsigned long util, unsigned long max, |
217 | unsigned int flags) | ||
165 | { | 218 | { |
166 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; | 219 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; |
167 | struct cpufreq_policy *policy = sg_policy->policy; | 220 | struct cpufreq_policy *policy = sg_policy->policy; |
@@ -169,9 +222,11 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, | |||
169 | u64 last_freq_update_time = sg_policy->last_freq_update_time; | 222 | u64 last_freq_update_time = sg_policy->last_freq_update_time; |
170 | unsigned int j; | 223 | unsigned int j; |
171 | 224 | ||
172 | if (util == ULONG_MAX) | 225 | if (flags & SCHED_CPUFREQ_RT_DL) |
173 | return max_f; | 226 | return max_f; |
174 | 227 | ||
228 | sugov_iowait_boost(sg_cpu, &util, &max); | ||
229 | |||
175 | for_each_cpu(j, policy->cpus) { | 230 | for_each_cpu(j, policy->cpus) { |
176 | struct sugov_cpu *j_sg_cpu; | 231 | struct sugov_cpu *j_sg_cpu; |
177 | unsigned long j_util, j_max; | 232 | unsigned long j_util, j_max; |
@@ -186,41 +241,50 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, | |||
186 | * frequency update and the time elapsed between the last update | 241 | * frequency update and the time elapsed between the last update |
187 | * of the CPU utilization and the last frequency update is long | 242 | * of the CPU utilization and the last frequency update is long |
188 | * enough, don't take the CPU into account as it probably is | 243 | * enough, don't take the CPU into account as it probably is |
189 | * idle now. | 244 | * idle now (and clear iowait_boost for it). |
190 | */ | 245 | */ |
191 | delta_ns = last_freq_update_time - j_sg_cpu->last_update; | 246 | delta_ns = last_freq_update_time - j_sg_cpu->last_update; |
192 | if (delta_ns > TICK_NSEC) | 247 | if (delta_ns > TICK_NSEC) { |
248 | j_sg_cpu->iowait_boost = 0; | ||
193 | continue; | 249 | continue; |
194 | 250 | } | |
195 | j_util = j_sg_cpu->util; | 251 | if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) |
196 | if (j_util == ULONG_MAX) | ||
197 | return max_f; | 252 | return max_f; |
198 | 253 | ||
254 | j_util = j_sg_cpu->util; | ||
199 | j_max = j_sg_cpu->max; | 255 | j_max = j_sg_cpu->max; |
200 | if (j_util * max > j_max * util) { | 256 | if (j_util * max > j_max * util) { |
201 | util = j_util; | 257 | util = j_util; |
202 | max = j_max; | 258 | max = j_max; |
203 | } | 259 | } |
260 | |||
261 | sugov_iowait_boost(j_sg_cpu, &util, &max); | ||
204 | } | 262 | } |
205 | 263 | ||
206 | return get_next_freq(sg_cpu, util, max); | 264 | return get_next_freq(sg_cpu, util, max); |
207 | } | 265 | } |
208 | 266 | ||
209 | static void sugov_update_shared(struct update_util_data *hook, u64 time, | 267 | static void sugov_update_shared(struct update_util_data *hook, u64 time, |
210 | unsigned long util, unsigned long max) | 268 | unsigned int flags) |
211 | { | 269 | { |
212 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); | 270 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); |
213 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; | 271 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; |
272 | unsigned long util, max; | ||
214 | unsigned int next_f; | 273 | unsigned int next_f; |
215 | 274 | ||
275 | sugov_get_util(&util, &max); | ||
276 | |||
216 | raw_spin_lock(&sg_policy->update_lock); | 277 | raw_spin_lock(&sg_policy->update_lock); |
217 | 278 | ||
218 | sg_cpu->util = util; | 279 | sg_cpu->util = util; |
219 | sg_cpu->max = max; | 280 | sg_cpu->max = max; |
281 | sg_cpu->flags = flags; | ||
282 | |||
283 | sugov_set_iowait_boost(sg_cpu, time, flags); | ||
220 | sg_cpu->last_update = time; | 284 | sg_cpu->last_update = time; |
221 | 285 | ||
222 | if (sugov_should_update_freq(sg_policy, time)) { | 286 | if (sugov_should_update_freq(sg_policy, time)) { |
223 | next_f = sugov_next_freq_shared(sg_cpu, util, max); | 287 | next_f = sugov_next_freq_shared(sg_cpu, util, max, flags); |
224 | sugov_update_commit(sg_policy, time, next_f); | 288 | sugov_update_commit(sg_policy, time, next_f); |
225 | } | 289 | } |
226 | 290 | ||
@@ -444,10 +508,13 @@ static int sugov_start(struct cpufreq_policy *policy) | |||
444 | 508 | ||
445 | sg_cpu->sg_policy = sg_policy; | 509 | sg_cpu->sg_policy = sg_policy; |
446 | if (policy_is_shared(policy)) { | 510 | if (policy_is_shared(policy)) { |
447 | sg_cpu->util = ULONG_MAX; | 511 | sg_cpu->util = 0; |
448 | sg_cpu->max = 0; | 512 | sg_cpu->max = 0; |
513 | sg_cpu->flags = SCHED_CPUFREQ_RT; | ||
449 | sg_cpu->last_update = 0; | 514 | sg_cpu->last_update = 0; |
450 | sg_cpu->cached_raw_freq = 0; | 515 | sg_cpu->cached_raw_freq = 0; |
516 | sg_cpu->iowait_boost = 0; | ||
517 | sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; | ||
451 | cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, | 518 | cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, |
452 | sugov_update_shared); | 519 | sugov_update_shared); |
453 | } else { | 520 | } else { |
@@ -495,28 +562,15 @@ static struct cpufreq_governor schedutil_gov = { | |||
495 | .limits = sugov_limits, | 562 | .limits = sugov_limits, |
496 | }; | 563 | }; |
497 | 564 | ||
498 | static int __init sugov_module_init(void) | ||
499 | { | ||
500 | return cpufreq_register_governor(&schedutil_gov); | ||
501 | } | ||
502 | |||
503 | static void __exit sugov_module_exit(void) | ||
504 | { | ||
505 | cpufreq_unregister_governor(&schedutil_gov); | ||
506 | } | ||
507 | |||
508 | MODULE_AUTHOR("Rafael J. Wysocki <rafael.j.wysocki@intel.com>"); | ||
509 | MODULE_DESCRIPTION("Utilization-based CPU frequency selection"); | ||
510 | MODULE_LICENSE("GPL"); | ||
511 | |||
512 | #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL | 565 | #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL |
513 | struct cpufreq_governor *cpufreq_default_governor(void) | 566 | struct cpufreq_governor *cpufreq_default_governor(void) |
514 | { | 567 | { |
515 | return &schedutil_gov; | 568 | return &schedutil_gov; |
516 | } | 569 | } |
517 | |||
518 | fs_initcall(sugov_module_init); | ||
519 | #else | ||
520 | module_init(sugov_module_init); | ||
521 | #endif | 570 | #endif |
522 | module_exit(sugov_module_exit); | 571 | |
572 | static int __init sugov_register(void) | ||
573 | { | ||
574 | return cpufreq_register_governor(&schedutil_gov); | ||
575 | } | ||
576 | fs_initcall(sugov_register); | ||
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a846cf89eb96..5ebee3164e64 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
@@ -23,10 +23,8 @@ | |||
23 | * task when irq is in progress while we read rq->clock. That is a worthy | 23 | * task when irq is in progress while we read rq->clock. That is a worthy |
24 | * compromise in place of having locks on each irq in account_system_time. | 24 | * compromise in place of having locks on each irq in account_system_time. |
25 | */ | 25 | */ |
26 | DEFINE_PER_CPU(u64, cpu_hardirq_time); | 26 | DEFINE_PER_CPU(struct irqtime, cpu_irqtime); |
27 | DEFINE_PER_CPU(u64, cpu_softirq_time); | ||
28 | 27 | ||
29 | static DEFINE_PER_CPU(u64, irq_start_time); | ||
30 | static int sched_clock_irqtime; | 28 | static int sched_clock_irqtime; |
31 | 29 | ||
32 | void enable_sched_clock_irqtime(void) | 30 | void enable_sched_clock_irqtime(void) |
@@ -39,16 +37,13 @@ void disable_sched_clock_irqtime(void) | |||
39 | sched_clock_irqtime = 0; | 37 | sched_clock_irqtime = 0; |
40 | } | 38 | } |
41 | 39 | ||
42 | #ifndef CONFIG_64BIT | ||
43 | DEFINE_PER_CPU(seqcount_t, irq_time_seq); | ||
44 | #endif /* CONFIG_64BIT */ | ||
45 | |||
46 | /* | 40 | /* |
47 | * Called before incrementing preempt_count on {soft,}irq_enter | 41 | * Called before incrementing preempt_count on {soft,}irq_enter |
48 | * and before decrementing preempt_count on {soft,}irq_exit. | 42 | * and before decrementing preempt_count on {soft,}irq_exit. |
49 | */ | 43 | */ |
50 | void irqtime_account_irq(struct task_struct *curr) | 44 | void irqtime_account_irq(struct task_struct *curr) |
51 | { | 45 | { |
46 | struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); | ||
52 | s64 delta; | 47 | s64 delta; |
53 | int cpu; | 48 | int cpu; |
54 | 49 | ||
@@ -56,10 +51,10 @@ void irqtime_account_irq(struct task_struct *curr) | |||
56 | return; | 51 | return; |
57 | 52 | ||
58 | cpu = smp_processor_id(); | 53 | cpu = smp_processor_id(); |
59 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); | 54 | delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; |
60 | __this_cpu_add(irq_start_time, delta); | 55 | irqtime->irq_start_time += delta; |
61 | 56 | ||
62 | irq_time_write_begin(); | 57 | u64_stats_update_begin(&irqtime->sync); |
63 | /* | 58 | /* |
64 | * We do not account for softirq time from ksoftirqd here. | 59 | * We do not account for softirq time from ksoftirqd here. |
65 | * We want to continue accounting softirq time to ksoftirqd thread | 60 | * We want to continue accounting softirq time to ksoftirqd thread |
@@ -67,42 +62,36 @@ void irqtime_account_irq(struct task_struct *curr) | |||
67 | * that do not consume any time, but still wants to run. | 62 | * that do not consume any time, but still wants to run. |
68 | */ | 63 | */ |
69 | if (hardirq_count()) | 64 | if (hardirq_count()) |
70 | __this_cpu_add(cpu_hardirq_time, delta); | 65 | irqtime->hardirq_time += delta; |
71 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) | 66 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) |
72 | __this_cpu_add(cpu_softirq_time, delta); | 67 | irqtime->softirq_time += delta; |
73 | 68 | ||
74 | irq_time_write_end(); | 69 | u64_stats_update_end(&irqtime->sync); |
75 | } | 70 | } |
76 | EXPORT_SYMBOL_GPL(irqtime_account_irq); | 71 | EXPORT_SYMBOL_GPL(irqtime_account_irq); |
77 | 72 | ||
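The u64_stats_update_begin()/u64_stats_update_end() pair above replaces the hand-rolled irq_time_seq seqcount; on 32-bit, readers of the new per-CPU structure wrap their loads in the matching fetch loop. A hedged sketch of such a reader (field names follow this hunk; the helper itself is not part of it):

/* Sketch of a consistent reader for the per-CPU irqtime totals. */
static u64 irqtime_read_total(int cpu)
{
        struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
        unsigned int seq;
        u64 total;

        do {
                seq = u64_stats_fetch_begin(&irqtime->sync);
                total = irqtime->hardirq_time + irqtime->softirq_time;
        } while (u64_stats_fetch_retry(&irqtime->sync, seq));

        return total;
}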
78 | static cputime_t irqtime_account_hi_update(cputime_t maxtime) | 73 | static cputime_t irqtime_account_update(u64 irqtime, int idx, cputime_t maxtime) |
79 | { | 74 | { |
80 | u64 *cpustat = kcpustat_this_cpu->cpustat; | 75 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
81 | unsigned long flags; | ||
82 | cputime_t irq_cputime; | 76 | cputime_t irq_cputime; |
83 | 77 | ||
84 | local_irq_save(flags); | 78 | irq_cputime = nsecs_to_cputime64(irqtime) - cpustat[idx]; |
85 | irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) - | ||
86 | cpustat[CPUTIME_IRQ]; | ||
87 | irq_cputime = min(irq_cputime, maxtime); | 79 | irq_cputime = min(irq_cputime, maxtime); |
88 | cpustat[CPUTIME_IRQ] += irq_cputime; | 80 | cpustat[idx] += irq_cputime; |
89 | local_irq_restore(flags); | 81 | |
90 | return irq_cputime; | 82 | return irq_cputime; |
91 | } | 83 | } |
92 | 84 | ||
93 | static cputime_t irqtime_account_si_update(cputime_t maxtime) | 85 | static cputime_t irqtime_account_hi_update(cputime_t maxtime) |
94 | { | 86 | { |
95 | u64 *cpustat = kcpustat_this_cpu->cpustat; | 87 | return irqtime_account_update(__this_cpu_read(cpu_irqtime.hardirq_time), |
96 | unsigned long flags; | 88 | CPUTIME_IRQ, maxtime); |
97 | cputime_t softirq_cputime; | 89 | } |
98 | 90 | ||
99 | local_irq_save(flags); | 91 | static cputime_t irqtime_account_si_update(cputime_t maxtime) |
100 | softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) - | 92 | { |
101 | cpustat[CPUTIME_SOFTIRQ]; | 93 | return irqtime_account_update(__this_cpu_read(cpu_irqtime.softirq_time), |
102 | softirq_cputime = min(softirq_cputime, maxtime); | 94 | CPUTIME_SOFTIRQ, maxtime); |
103 | cpustat[CPUTIME_SOFTIRQ] += softirq_cputime; | ||
104 | local_irq_restore(flags); | ||
105 | return softirq_cputime; | ||
106 | } | 95 | } |
107 | 96 | ||
108 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | 97 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
@@ -295,6 +284,9 @@ static inline cputime_t account_other_time(cputime_t max) | |||
295 | { | 284 | { |
296 | cputime_t accounted; | 285 | cputime_t accounted; |
297 | 286 | ||
287 | /* Shall be converted to a lockdep-enabled lightweight check */ | ||
288 | WARN_ON_ONCE(!irqs_disabled()); | ||
289 | |||
298 | accounted = steal_account_process_time(max); | 290 | accounted = steal_account_process_time(max); |
299 | 291 | ||
300 | if (accounted < max) | 292 | if (accounted < max) |
@@ -306,6 +298,26 @@ static inline cputime_t account_other_time(cputime_t max) | |||
306 | return accounted; | 298 | return accounted; |
307 | } | 299 | } |
308 | 300 | ||
301 | #ifdef CONFIG_64BIT | ||
302 | static inline u64 read_sum_exec_runtime(struct task_struct *t) | ||
303 | { | ||
304 | return t->se.sum_exec_runtime; | ||
305 | } | ||
306 | #else | ||
307 | static u64 read_sum_exec_runtime(struct task_struct *t) | ||
308 | { | ||
309 | u64 ns; | ||
310 | struct rq_flags rf; | ||
311 | struct rq *rq; | ||
312 | |||
313 | rq = task_rq_lock(t, &rf); | ||
314 | ns = t->se.sum_exec_runtime; | ||
315 | task_rq_unlock(rq, t, &rf); | ||
316 | |||
317 | return ns; | ||
318 | } | ||
319 | #endif | ||
320 | |||
309 | /* | 321 | /* |
310 | * Accumulate raw cputime values of dead tasks (sig->[us]time) and live | 322 | * Accumulate raw cputime values of dead tasks (sig->[us]time) and live |
311 | * tasks (sum on group iteration) belonging to @tsk's group. | 323 | * tasks (sum on group iteration) belonging to @tsk's group. |
@@ -318,6 +330,17 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
318 | unsigned int seq, nextseq; | 330 | unsigned int seq, nextseq; |
319 | unsigned long flags; | 331 | unsigned long flags; |
320 | 332 | ||
333 | /* | ||
334 | * Update current task runtime to account pending time since last | ||
335 | * scheduler action or thread_group_cputime() call. This thread group | ||
336 | * might have other running tasks on different CPUs, but updating | ||
337 | * their runtime can affect syscall performance, so we skip accounting | ||
338 | * those pending times and rely only on values updated on tick or | ||
339 | * other scheduler action. | ||
340 | */ | ||
341 | if (same_thread_group(current, tsk)) | ||
342 | (void) task_sched_runtime(current); | ||
343 | |||
321 | rcu_read_lock(); | 344 | rcu_read_lock(); |
322 | /* Attempt a lockless read on the first round. */ | 345 | /* Attempt a lockless read on the first round. */ |
323 | nextseq = 0; | 346 | nextseq = 0; |
@@ -332,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
332 | task_cputime(t, &utime, &stime); | 355 | task_cputime(t, &utime, &stime); |
333 | times->utime += utime; | 356 | times->utime += utime; |
334 | times->stime += stime; | 357 | times->stime += stime; |
335 | times->sum_exec_runtime += task_sched_runtime(t); | 358 | times->sum_exec_runtime += read_sum_exec_runtime(t); |
336 | } | 359 | } |
337 | /* If lockless access failed, take the lock. */ | 360 | /* If lockless access failed, take the lock. */ |
338 | nextseq = 1; | 361 | nextseq = 1; |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 1ce8867283dc..37e2449186c4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
@@ -243,10 +243,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); | |||
243 | static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p) | 243 | static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p) |
244 | { | 244 | { |
245 | struct rq *later_rq = NULL; | 245 | struct rq *later_rq = NULL; |
246 | bool fallback = false; | ||
247 | 246 | ||
248 | later_rq = find_lock_later_rq(p, rq); | 247 | later_rq = find_lock_later_rq(p, rq); |
249 | |||
250 | if (!later_rq) { | 248 | if (!later_rq) { |
251 | int cpu; | 249 | int cpu; |
252 | 250 | ||
@@ -254,7 +252,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p | |||
254 | * If we cannot preempt any rq, fall back to pick any | 252 | * If we cannot preempt any rq, fall back to pick any |
255 | * online cpu. | 253 | * online cpu. |
256 | */ | 254 | */ |
257 | fallback = true; | ||
258 | cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); | 255 | cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); |
259 | if (cpu >= nr_cpu_ids) { | 256 | if (cpu >= nr_cpu_ids) { |
260 | /* | 257 | /* |
@@ -274,16 +271,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p | |||
274 | double_lock_balance(rq, later_rq); | 271 | double_lock_balance(rq, later_rq); |
275 | } | 272 | } |
276 | 273 | ||
277 | /* | ||
278 | * By now the task is replenished and enqueued; migrate it. | ||
279 | */ | ||
280 | deactivate_task(rq, p, 0); | ||
281 | set_task_cpu(p, later_rq->cpu); | 274 | set_task_cpu(p, later_rq->cpu); |
282 | activate_task(later_rq, p, 0); | ||
283 | |||
284 | if (!fallback) | ||
285 | resched_curr(later_rq); | ||
286 | |||
287 | double_unlock_balance(later_rq, rq); | 275 | double_unlock_balance(later_rq, rq); |
288 | 276 | ||
289 | return later_rq; | 277 | return later_rq; |
@@ -346,12 +334,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, | |||
346 | * one, and to (try to!) reconcile itself with its own scheduling | 334 | * one, and to (try to!) reconcile itself with its own scheduling |
347 | * parameters. | 335 | * parameters. |
348 | */ | 336 | */ |
349 | static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, | 337 | static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) |
350 | struct sched_dl_entity *pi_se) | ||
351 | { | 338 | { |
352 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | 339 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); |
353 | struct rq *rq = rq_of_dl_rq(dl_rq); | 340 | struct rq *rq = rq_of_dl_rq(dl_rq); |
354 | 341 | ||
342 | WARN_ON(dl_se->dl_boosted); | ||
355 | WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); | 343 | WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); |
356 | 344 | ||
357 | /* | 345 | /* |
@@ -367,8 +355,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, | |||
367 | * future; in fact, we must consider execution overheads (time | 355 | * future; in fact, we must consider execution overheads (time |
368 | * spent on hardirq context, etc.). | 356 | * spent on hardirq context, etc.). |
369 | */ | 357 | */ |
370 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | 358 | dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline; |
371 | dl_se->runtime = pi_se->dl_runtime; | 359 | dl_se->runtime = dl_se->dl_runtime; |
372 | } | 360 | } |
373 | 361 | ||
374 | /* | 362 | /* |
@@ -641,29 +629,31 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
641 | goto unlock; | 629 | goto unlock; |
642 | } | 630 | } |
643 | 631 | ||
644 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); | ||
645 | if (dl_task(rq->curr)) | ||
646 | check_preempt_curr_dl(rq, p, 0); | ||
647 | else | ||
648 | resched_curr(rq); | ||
649 | |||
650 | #ifdef CONFIG_SMP | 632 | #ifdef CONFIG_SMP |
651 | /* | ||
652 | * Perform balancing operations here; after the replenishments. We | ||
653 | * cannot drop rq->lock before this, otherwise the assertion in | ||
654 | * start_dl_timer() about not missing updates is not true. | ||
655 | * | ||
656 | * If we find that the rq the task was on is no longer available, we | ||
657 | * need to select a new rq. | ||
658 | * | ||
659 | * XXX figure out if select_task_rq_dl() deals with offline cpus. | ||
660 | */ | ||
661 | if (unlikely(!rq->online)) { | 633 | if (unlikely(!rq->online)) { |
634 | /* | ||
635 | * If the runqueue is no longer available, migrate the | ||
636 | * task elsewhere. This necessarily changes rq. | ||
637 | */ | ||
662 | lockdep_unpin_lock(&rq->lock, rf.cookie); | 638 | lockdep_unpin_lock(&rq->lock, rf.cookie); |
663 | rq = dl_task_offline_migration(rq, p); | 639 | rq = dl_task_offline_migration(rq, p); |
664 | rf.cookie = lockdep_pin_lock(&rq->lock); | 640 | rf.cookie = lockdep_pin_lock(&rq->lock); |
641 | |||
642 | /* | ||
643 | * Now that the task has been migrated to the new RQ and we | ||
644 | * have that locked, proceed as normal and enqueue the task | ||
645 | * there. | ||
646 | */ | ||
665 | } | 647 | } |
648 | #endif | ||
649 | |||
650 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); | ||
651 | if (dl_task(rq->curr)) | ||
652 | check_preempt_curr_dl(rq, p, 0); | ||
653 | else | ||
654 | resched_curr(rq); | ||
666 | 655 | ||
656 | #ifdef CONFIG_SMP | ||
667 | /* | 657 | /* |
668 | * Queueing this task back might have overloaded rq, check if we need | 658 | * Queueing this task back might have overloaded rq, check if we need |
669 | * to kick someone away. | 659 | * to kick someone away. |
@@ -735,9 +725,8 @@ static void update_curr_dl(struct rq *rq) | |||
735 | return; | 725 | return; |
736 | } | 726 | } |
737 | 727 | ||
738 | /* kick cpufreq (see the comment in linux/cpufreq.h). */ | 728 | /* kick cpufreq (see the comment in kernel/sched/sched.h). */ |
739 | if (cpu_of(rq) == smp_processor_id()) | 729 | cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL); |
740 | cpufreq_trigger_update(rq_clock(rq)); | ||
741 | 730 | ||
742 | schedstat_set(curr->se.statistics.exec_max, | 731 | schedstat_set(curr->se.statistics.exec_max, |
743 | max(curr->se.statistics.exec_max, delta_exec)); | 732 | max(curr->se.statistics.exec_max, delta_exec)); |
@@ -798,7 +787,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) | |||
798 | if (dl_rq->earliest_dl.curr == 0 || | 787 | if (dl_rq->earliest_dl.curr == 0 || |
799 | dl_time_before(deadline, dl_rq->earliest_dl.curr)) { | 788 | dl_time_before(deadline, dl_rq->earliest_dl.curr)) { |
800 | dl_rq->earliest_dl.curr = deadline; | 789 | dl_rq->earliest_dl.curr = deadline; |
801 | cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); | 790 | cpudl_set(&rq->rd->cpudl, rq->cpu, deadline); |
802 | } | 791 | } |
803 | } | 792 | } |
804 | 793 | ||
@@ -813,14 +802,14 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) | |||
813 | if (!dl_rq->dl_nr_running) { | 802 | if (!dl_rq->dl_nr_running) { |
814 | dl_rq->earliest_dl.curr = 0; | 803 | dl_rq->earliest_dl.curr = 0; |
815 | dl_rq->earliest_dl.next = 0; | 804 | dl_rq->earliest_dl.next = 0; |
816 | cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); | 805 | cpudl_clear(&rq->rd->cpudl, rq->cpu); |
817 | } else { | 806 | } else { |
818 | struct rb_node *leftmost = dl_rq->rb_leftmost; | 807 | struct rb_node *leftmost = dl_rq->rb_leftmost; |
819 | struct sched_dl_entity *entry; | 808 | struct sched_dl_entity *entry; |
820 | 809 | ||
821 | entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); | 810 | entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); |
822 | dl_rq->earliest_dl.curr = entry->deadline; | 811 | dl_rq->earliest_dl.curr = entry->deadline; |
823 | cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); | 812 | cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline); |
824 | } | 813 | } |
825 | } | 814 | } |
826 | 815 | ||
@@ -1671,7 +1660,7 @@ static void rq_online_dl(struct rq *rq) | |||
1671 | 1660 | ||
1672 | cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); | 1661 | cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); |
1673 | if (rq->dl.dl_nr_running > 0) | 1662 | if (rq->dl.dl_nr_running > 0) |
1674 | cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); | 1663 | cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr); |
1675 | } | 1664 | } |
1676 | 1665 | ||
1677 | /* Assumes rq->lock is held */ | 1666 | /* Assumes rq->lock is held */ |
@@ -1680,7 +1669,7 @@ static void rq_offline_dl(struct rq *rq) | |||
1680 | if (rq->dl.overloaded) | 1669 | if (rq->dl.overloaded) |
1681 | dl_clear_overload(rq); | 1670 | dl_clear_overload(rq); |
1682 | 1671 | ||
1683 | cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); | 1672 | cpudl_clear(&rq->rd->cpudl, rq->cpu); |
1684 | cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); | 1673 | cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); |
1685 | } | 1674 | } |
1686 | 1675 | ||
@@ -1723,10 +1712,20 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) | |||
1723 | */ | 1712 | */ |
1724 | static void switched_to_dl(struct rq *rq, struct task_struct *p) | 1713 | static void switched_to_dl(struct rq *rq, struct task_struct *p) |
1725 | { | 1714 | { |
1715 | |||
1716 | /* If p is not queued we will update its parameters at next wakeup. */ | ||
1717 | if (!task_on_rq_queued(p)) | ||
1718 | return; | ||
1719 | |||
1720 | /* | ||
1721 | * If p is boosted we already updated its params in | ||
1722 | * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), | ||
1723 | * so p's deadline is now already after rq_clock(rq). | ||
1724 | */ | ||
1726 | if (dl_time_before(p->dl.deadline, rq_clock(rq))) | 1725 | if (dl_time_before(p->dl.deadline, rq_clock(rq))) |
1727 | setup_new_dl_entity(&p->dl, &p->dl); | 1726 | setup_new_dl_entity(&p->dl); |
1728 | 1727 | ||
1729 | if (task_on_rq_queued(p) && rq->curr != p) { | 1728 | if (rq->curr != p) { |
1730 | #ifdef CONFIG_SMP | 1729 | #ifdef CONFIG_SMP |
1731 | if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) | 1730 | if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) |
1732 | queue_push_tasks(rq); | 1731 | queue_push_tasks(rq); |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2a0a9995256d..fa178b62ea79 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -369,8 +369,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
369 | 369 | ||
370 | #define P(F) \ | 370 | #define P(F) \ |
371 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) | 371 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) |
372 | #define P_SCHEDSTAT(F) \ | ||
373 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) | ||
372 | #define PN(F) \ | 374 | #define PN(F) \ |
373 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) | 375 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) |
376 | #define PN_SCHEDSTAT(F) \ | ||
377 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) | ||
374 | 378 | ||
375 | if (!se) | 379 | if (!se) |
376 | return; | 380 | return; |
@@ -378,26 +382,27 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
378 | PN(se->exec_start); | 382 | PN(se->exec_start); |
379 | PN(se->vruntime); | 383 | PN(se->vruntime); |
380 | PN(se->sum_exec_runtime); | 384 | PN(se->sum_exec_runtime); |
381 | #ifdef CONFIG_SCHEDSTATS | ||
382 | if (schedstat_enabled()) { | 385 | if (schedstat_enabled()) { |
383 | PN(se->statistics.wait_start); | 386 | PN_SCHEDSTAT(se->statistics.wait_start); |
384 | PN(se->statistics.sleep_start); | 387 | PN_SCHEDSTAT(se->statistics.sleep_start); |
385 | PN(se->statistics.block_start); | 388 | PN_SCHEDSTAT(se->statistics.block_start); |
386 | PN(se->statistics.sleep_max); | 389 | PN_SCHEDSTAT(se->statistics.sleep_max); |
387 | PN(se->statistics.block_max); | 390 | PN_SCHEDSTAT(se->statistics.block_max); |
388 | PN(se->statistics.exec_max); | 391 | PN_SCHEDSTAT(se->statistics.exec_max); |
389 | PN(se->statistics.slice_max); | 392 | PN_SCHEDSTAT(se->statistics.slice_max); |
390 | PN(se->statistics.wait_max); | 393 | PN_SCHEDSTAT(se->statistics.wait_max); |
391 | PN(se->statistics.wait_sum); | 394 | PN_SCHEDSTAT(se->statistics.wait_sum); |
392 | P(se->statistics.wait_count); | 395 | P_SCHEDSTAT(se->statistics.wait_count); |
393 | } | 396 | } |
394 | #endif | ||
395 | P(se->load.weight); | 397 | P(se->load.weight); |
396 | #ifdef CONFIG_SMP | 398 | #ifdef CONFIG_SMP |
397 | P(se->avg.load_avg); | 399 | P(se->avg.load_avg); |
398 | P(se->avg.util_avg); | 400 | P(se->avg.util_avg); |
399 | #endif | 401 | #endif |
402 | |||
403 | #undef PN_SCHEDSTAT | ||
400 | #undef PN | 404 | #undef PN |
405 | #undef P_SCHEDSTAT | ||
401 | #undef P | 406 | #undef P |
402 | } | 407 | } |
403 | #endif | 408 | #endif |
@@ -410,7 +415,8 @@ static char *task_group_path(struct task_group *tg) | |||
410 | if (autogroup_path(tg, group_path, PATH_MAX)) | 415 | if (autogroup_path(tg, group_path, PATH_MAX)) |
411 | return group_path; | 416 | return group_path; |
412 | 417 | ||
413 | return cgroup_path(tg->css.cgroup, group_path, PATH_MAX); | 418 | cgroup_path(tg->css.cgroup, group_path, PATH_MAX); |
419 | return group_path; | ||
414 | } | 420 | } |
415 | #endif | 421 | #endif |
416 | 422 | ||
@@ -429,9 +435,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
429 | p->prio); | 435 | p->prio); |
430 | 436 | ||
431 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", | 437 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", |
432 | SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)), | 438 | SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)), |
433 | SPLIT_NS(p->se.sum_exec_runtime), | 439 | SPLIT_NS(p->se.sum_exec_runtime), |
434 | SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime))); | 440 | SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime))); |
435 | 441 | ||
436 | #ifdef CONFIG_NUMA_BALANCING | 442 | #ifdef CONFIG_NUMA_BALANCING |
437 | SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); | 443 | SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); |
@@ -626,9 +632,7 @@ do { \ | |||
626 | #undef P64 | 632 | #undef P64 |
627 | #endif | 633 | #endif |
628 | 634 | ||
629 | #ifdef CONFIG_SCHEDSTATS | 635 | #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n)); |
630 | #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); | ||
631 | |||
632 | if (schedstat_enabled()) { | 636 | if (schedstat_enabled()) { |
633 | P(yld_count); | 637 | P(yld_count); |
634 | P(sched_count); | 638 | P(sched_count); |
@@ -636,9 +640,8 @@ do { \ | |||
636 | P(ttwu_count); | 640 | P(ttwu_count); |
637 | P(ttwu_local); | 641 | P(ttwu_local); |
638 | } | 642 | } |
639 | |||
640 | #undef P | 643 | #undef P |
641 | #endif | 644 | |
642 | spin_lock_irqsave(&sched_debug_lock, flags); | 645 | spin_lock_irqsave(&sched_debug_lock, flags); |
643 | print_cfs_stats(m, cpu); | 646 | print_cfs_stats(m, cpu); |
644 | print_rt_stats(m, cpu); | 647 | print_rt_stats(m, cpu); |
@@ -868,10 +871,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
868 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) | 871 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) |
869 | #define P(F) \ | 872 | #define P(F) \ |
870 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) | 873 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) |
874 | #define P_SCHEDSTAT(F) \ | ||
875 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F)) | ||
871 | #define __PN(F) \ | 876 | #define __PN(F) \ |
872 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) | 877 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) |
873 | #define PN(F) \ | 878 | #define PN(F) \ |
874 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) | 879 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) |
880 | #define PN_SCHEDSTAT(F) \ | ||
881 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F))) | ||
875 | 882 | ||
876 | PN(se.exec_start); | 883 | PN(se.exec_start); |
877 | PN(se.vruntime); | 884 | PN(se.vruntime); |
@@ -881,37 +888,36 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
881 | 888 | ||
882 | P(se.nr_migrations); | 889 | P(se.nr_migrations); |
883 | 890 | ||
884 | #ifdef CONFIG_SCHEDSTATS | ||
885 | if (schedstat_enabled()) { | 891 | if (schedstat_enabled()) { |
886 | u64 avg_atom, avg_per_cpu; | 892 | u64 avg_atom, avg_per_cpu; |
887 | 893 | ||
888 | PN(se.statistics.sum_sleep_runtime); | 894 | PN_SCHEDSTAT(se.statistics.sum_sleep_runtime); |
889 | PN(se.statistics.wait_start); | 895 | PN_SCHEDSTAT(se.statistics.wait_start); |
890 | PN(se.statistics.sleep_start); | 896 | PN_SCHEDSTAT(se.statistics.sleep_start); |
891 | PN(se.statistics.block_start); | 897 | PN_SCHEDSTAT(se.statistics.block_start); |
892 | PN(se.statistics.sleep_max); | 898 | PN_SCHEDSTAT(se.statistics.sleep_max); |
893 | PN(se.statistics.block_max); | 899 | PN_SCHEDSTAT(se.statistics.block_max); |
894 | PN(se.statistics.exec_max); | 900 | PN_SCHEDSTAT(se.statistics.exec_max); |
895 | PN(se.statistics.slice_max); | 901 | PN_SCHEDSTAT(se.statistics.slice_max); |
896 | PN(se.statistics.wait_max); | 902 | PN_SCHEDSTAT(se.statistics.wait_max); |
897 | PN(se.statistics.wait_sum); | 903 | PN_SCHEDSTAT(se.statistics.wait_sum); |
898 | P(se.statistics.wait_count); | 904 | P_SCHEDSTAT(se.statistics.wait_count); |
899 | PN(se.statistics.iowait_sum); | 905 | PN_SCHEDSTAT(se.statistics.iowait_sum); |
900 | P(se.statistics.iowait_count); | 906 | P_SCHEDSTAT(se.statistics.iowait_count); |
901 | P(se.statistics.nr_migrations_cold); | 907 | P_SCHEDSTAT(se.statistics.nr_migrations_cold); |
902 | P(se.statistics.nr_failed_migrations_affine); | 908 | P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine); |
903 | P(se.statistics.nr_failed_migrations_running); | 909 | P_SCHEDSTAT(se.statistics.nr_failed_migrations_running); |
904 | P(se.statistics.nr_failed_migrations_hot); | 910 | P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot); |
905 | P(se.statistics.nr_forced_migrations); | 911 | P_SCHEDSTAT(se.statistics.nr_forced_migrations); |
906 | P(se.statistics.nr_wakeups); | 912 | P_SCHEDSTAT(se.statistics.nr_wakeups); |
907 | P(se.statistics.nr_wakeups_sync); | 913 | P_SCHEDSTAT(se.statistics.nr_wakeups_sync); |
908 | P(se.statistics.nr_wakeups_migrate); | 914 | P_SCHEDSTAT(se.statistics.nr_wakeups_migrate); |
909 | P(se.statistics.nr_wakeups_local); | 915 | P_SCHEDSTAT(se.statistics.nr_wakeups_local); |
910 | P(se.statistics.nr_wakeups_remote); | 916 | P_SCHEDSTAT(se.statistics.nr_wakeups_remote); |
911 | P(se.statistics.nr_wakeups_affine); | 917 | P_SCHEDSTAT(se.statistics.nr_wakeups_affine); |
912 | P(se.statistics.nr_wakeups_affine_attempts); | 918 | P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts); |
913 | P(se.statistics.nr_wakeups_passive); | 919 | P_SCHEDSTAT(se.statistics.nr_wakeups_passive); |
914 | P(se.statistics.nr_wakeups_idle); | 920 | P_SCHEDSTAT(se.statistics.nr_wakeups_idle); |
915 | 921 | ||
916 | avg_atom = p->se.sum_exec_runtime; | 922 | avg_atom = p->se.sum_exec_runtime; |
917 | if (nr_switches) | 923 | if (nr_switches) |
@@ -930,7 +936,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
930 | __PN(avg_atom); | 936 | __PN(avg_atom); |
931 | __PN(avg_per_cpu); | 937 | __PN(avg_per_cpu); |
932 | } | 938 | } |
933 | #endif | 939 | |
934 | __P(nr_switches); | 940 | __P(nr_switches); |
935 | SEQ_printf(m, "%-45s:%21Ld\n", | 941 | SEQ_printf(m, "%-45s:%21Ld\n", |
936 | "nr_voluntary_switches", (long long)p->nvcsw); | 942 | "nr_voluntary_switches", (long long)p->nvcsw); |
@@ -947,8 +953,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
947 | #endif | 953 | #endif |
948 | P(policy); | 954 | P(policy); |
949 | P(prio); | 955 | P(prio); |
956 | #undef PN_SCHEDSTAT | ||
950 | #undef PN | 957 | #undef PN |
951 | #undef __PN | 958 | #undef __PN |
959 | #undef P_SCHEDSTAT | ||
952 | #undef P | 960 | #undef P |
953 | #undef __P | 961 | #undef __P |
954 | 962 | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 039de34f1521..c242944f5cbd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -114,6 +114,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; | |||
114 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; | 114 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; |
115 | #endif | 115 | #endif |
116 | 116 | ||
117 | /* | ||
118 | * The margin used when comparing utilization with CPU capacity: | ||
119 | * util * 1024 < capacity * margin | ||
120 | */ | ||
121 | unsigned int capacity_margin = 1280; /* ~20% */ | ||
122 | |||
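The new capacity_margin corresponds to roughly 20% headroom: one side of the utilization/capacity comparison is inflated by 1280/1024 (about 1.25x) before the two are compared, so a CPU is treated as fitting a utilization only while that utilization stays below roughly 80% of its capacity. The call sites are not part of this hunk; a hypothetical helper expressing one common form of such a check could be:

/* Hypothetical helper, not from this patch: does util fit with ~20% headroom? */
static inline bool util_fits_capacity(unsigned long util, unsigned long capacity)
{
        /* util inflated by capacity_margin/1024 (~1.25x) must stay under capacity */
        return util * capacity_margin < capacity * SCHED_CAPACITY_SCALE;
}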
117 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | 123 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) |
118 | { | 124 | { |
119 | lw->weight += inc; | 125 | lw->weight += inc; |
@@ -256,9 +262,7 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | |||
256 | 262 | ||
257 | static inline struct task_struct *task_of(struct sched_entity *se) | 263 | static inline struct task_struct *task_of(struct sched_entity *se) |
258 | { | 264 | { |
259 | #ifdef CONFIG_SCHED_DEBUG | 265 | SCHED_WARN_ON(!entity_is_task(se)); |
260 | WARN_ON_ONCE(!entity_is_task(se)); | ||
261 | #endif | ||
262 | return container_of(se, struct task_struct, se); | 266 | return container_of(se, struct task_struct, se); |
263 | } | 267 | } |
264 | 268 | ||
@@ -456,17 +460,23 @@ static inline int entity_before(struct sched_entity *a, | |||
456 | 460 | ||
457 | static void update_min_vruntime(struct cfs_rq *cfs_rq) | 461 | static void update_min_vruntime(struct cfs_rq *cfs_rq) |
458 | { | 462 | { |
463 | struct sched_entity *curr = cfs_rq->curr; | ||
464 | |||
459 | u64 vruntime = cfs_rq->min_vruntime; | 465 | u64 vruntime = cfs_rq->min_vruntime; |
460 | 466 | ||
461 | if (cfs_rq->curr) | 467 | if (curr) { |
462 | vruntime = cfs_rq->curr->vruntime; | 468 | if (curr->on_rq) |
469 | vruntime = curr->vruntime; | ||
470 | else | ||
471 | curr = NULL; | ||
472 | } | ||
463 | 473 | ||
464 | if (cfs_rq->rb_leftmost) { | 474 | if (cfs_rq->rb_leftmost) { |
465 | struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, | 475 | struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, |
466 | struct sched_entity, | 476 | struct sched_entity, |
467 | run_node); | 477 | run_node); |
468 | 478 | ||
469 | if (!cfs_rq->curr) | 479 | if (!curr) |
470 | vruntime = se->vruntime; | 480 | vruntime = se->vruntime; |
471 | else | 481 | else |
472 | vruntime = min_vruntime(vruntime, se->vruntime); | 482 | vruntime = min_vruntime(vruntime, se->vruntime); |
@@ -656,7 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
656 | } | 666 | } |
657 | 667 | ||
658 | #ifdef CONFIG_SMP | 668 | #ifdef CONFIG_SMP |
659 | static int select_idle_sibling(struct task_struct *p, int cpu); | 669 | static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); |
660 | static unsigned long task_h_load(struct task_struct *p); | 670 | static unsigned long task_h_load(struct task_struct *p); |
661 | 671 | ||
662 | /* | 672 | /* |
@@ -680,7 +690,14 @@ void init_entity_runnable_average(struct sched_entity *se) | |||
680 | * will definitely be update (after enqueue). | 690 | * will definitely be update (after enqueue). |
681 | */ | 691 | */ |
682 | sa->period_contrib = 1023; | 692 | sa->period_contrib = 1023; |
683 | sa->load_avg = scale_load_down(se->load.weight); | 693 | /* |
694 | * Tasks are initialized with full load to be seen as heavy tasks until | ||
695 | * they get a chance to stabilize to their real load level. | ||
696 | * Group entities are initialized with zero load to reflect the fact that | ||
697 | * nothing has been attached to the task group yet. | ||
698 | */ | ||
699 | if (entity_is_task(se)) | ||
700 | sa->load_avg = scale_load_down(se->load.weight); | ||
684 | sa->load_sum = sa->load_avg * LOAD_AVG_MAX; | 701 | sa->load_sum = sa->load_avg * LOAD_AVG_MAX; |
685 | /* | 702 | /* |
686 | * At this point, util_avg won't be used in select_task_rq_fair anyway | 703 | * At this point, util_avg won't be used in select_task_rq_fair anyway |
@@ -726,7 +743,6 @@ void post_init_entity_util_avg(struct sched_entity *se) | |||
726 | struct sched_avg *sa = &se->avg; | 743 | struct sched_avg *sa = &se->avg; |
727 | long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; | 744 | long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; |
728 | u64 now = cfs_rq_clock_task(cfs_rq); | 745 | u64 now = cfs_rq_clock_task(cfs_rq); |
729 | int tg_update; | ||
730 | 746 | ||
731 | if (cap > 0) { | 747 | if (cap > 0) { |
732 | if (cfs_rq->avg.util_avg != 0) { | 748 | if (cfs_rq->avg.util_avg != 0) { |
@@ -759,10 +775,9 @@ void post_init_entity_util_avg(struct sched_entity *se) | |||
759 | } | 775 | } |
760 | } | 776 | } |
761 | 777 | ||
762 | tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); | 778 | update_cfs_rq_load_avg(now, cfs_rq, false); |
763 | attach_entity_load_avg(cfs_rq, se); | 779 | attach_entity_load_avg(cfs_rq, se); |
764 | if (tg_update) | 780 | update_tg_load_avg(cfs_rq, false); |
765 | update_tg_load_avg(cfs_rq, false); | ||
766 | } | 781 | } |
767 | 782 | ||
768 | #else /* !CONFIG_SMP */ | 783 | #else /* !CONFIG_SMP */ |
@@ -799,7 +814,7 @@ static void update_curr(struct cfs_rq *cfs_rq) | |||
799 | max(delta_exec, curr->statistics.exec_max)); | 814 | max(delta_exec, curr->statistics.exec_max)); |
800 | 815 | ||
801 | curr->sum_exec_runtime += delta_exec; | 816 | curr->sum_exec_runtime += delta_exec; |
802 | schedstat_add(cfs_rq, exec_clock, delta_exec); | 817 | schedstat_add(cfs_rq->exec_clock, delta_exec); |
803 | 818 | ||
804 | curr->vruntime += calc_delta_fair(delta_exec, curr); | 819 | curr->vruntime += calc_delta_fair(delta_exec, curr); |
805 | update_min_vruntime(cfs_rq); | 820 | update_min_vruntime(cfs_rq); |
@@ -820,26 +835,34 @@ static void update_curr_fair(struct rq *rq) | |||
820 | update_curr(cfs_rq_of(&rq->curr->se)); | 835 | update_curr(cfs_rq_of(&rq->curr->se)); |
821 | } | 836 | } |
822 | 837 | ||
823 | #ifdef CONFIG_SCHEDSTATS | ||
824 | static inline void | 838 | static inline void |
825 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | 839 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) |
826 | { | 840 | { |
827 | u64 wait_start = rq_clock(rq_of(cfs_rq)); | 841 | u64 wait_start, prev_wait_start; |
842 | |||
843 | if (!schedstat_enabled()) | ||
844 | return; | ||
845 | |||
846 | wait_start = rq_clock(rq_of(cfs_rq)); | ||
847 | prev_wait_start = schedstat_val(se->statistics.wait_start); | ||
828 | 848 | ||
829 | if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && | 849 | if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && |
830 | likely(wait_start > se->statistics.wait_start)) | 850 | likely(wait_start > prev_wait_start)) |
831 | wait_start -= se->statistics.wait_start; | 851 | wait_start -= prev_wait_start; |
832 | 852 | ||
833 | se->statistics.wait_start = wait_start; | 853 | schedstat_set(se->statistics.wait_start, wait_start); |
834 | } | 854 | } |
835 | 855 | ||
836 | static void | 856 | static inline void |
837 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | 857 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) |
838 | { | 858 | { |
839 | struct task_struct *p; | 859 | struct task_struct *p; |
840 | u64 delta; | 860 | u64 delta; |
841 | 861 | ||
842 | delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; | 862 | if (!schedstat_enabled()) |
863 | return; | ||
864 | |||
865 | delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); | ||
843 | 866 | ||
844 | if (entity_is_task(se)) { | 867 | if (entity_is_task(se)) { |
845 | p = task_of(se); | 868 | p = task_of(se); |
@@ -849,35 +872,114 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
849 | * time stamp can be adjusted to accumulate wait time | 872 | * time stamp can be adjusted to accumulate wait time |
850 | * prior to migration. | 873 | * prior to migration. |
851 | */ | 874 | */ |
852 | se->statistics.wait_start = delta; | 875 | schedstat_set(se->statistics.wait_start, delta); |
853 | return; | 876 | return; |
854 | } | 877 | } |
855 | trace_sched_stat_wait(p, delta); | 878 | trace_sched_stat_wait(p, delta); |
856 | } | 879 | } |
857 | 880 | ||
858 | se->statistics.wait_max = max(se->statistics.wait_max, delta); | 881 | schedstat_set(se->statistics.wait_max, |
859 | se->statistics.wait_count++; | 882 | max(schedstat_val(se->statistics.wait_max), delta)); |
860 | se->statistics.wait_sum += delta; | 883 | schedstat_inc(se->statistics.wait_count); |
861 | se->statistics.wait_start = 0; | 884 | schedstat_add(se->statistics.wait_sum, delta); |
885 | schedstat_set(se->statistics.wait_start, 0); | ||
886 | } | ||
887 | |||
888 | static inline void | ||
889 | update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
890 | { | ||
891 | struct task_struct *tsk = NULL; | ||
892 | u64 sleep_start, block_start; | ||
893 | |||
894 | if (!schedstat_enabled()) | ||
895 | return; | ||
896 | |||
897 | sleep_start = schedstat_val(se->statistics.sleep_start); | ||
898 | block_start = schedstat_val(se->statistics.block_start); | ||
899 | |||
900 | if (entity_is_task(se)) | ||
901 | tsk = task_of(se); | ||
902 | |||
903 | if (sleep_start) { | ||
904 | u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; | ||
905 | |||
906 | if ((s64)delta < 0) | ||
907 | delta = 0; | ||
908 | |||
909 | if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) | ||
910 | schedstat_set(se->statistics.sleep_max, delta); | ||
911 | |||
912 | schedstat_set(se->statistics.sleep_start, 0); | ||
913 | schedstat_add(se->statistics.sum_sleep_runtime, delta); | ||
914 | |||
915 | if (tsk) { | ||
916 | account_scheduler_latency(tsk, delta >> 10, 1); | ||
917 | trace_sched_stat_sleep(tsk, delta); | ||
918 | } | ||
919 | } | ||
920 | if (block_start) { | ||
921 | u64 delta = rq_clock(rq_of(cfs_rq)) - block_start; | ||
922 | |||
923 | if ((s64)delta < 0) | ||
924 | delta = 0; | ||
925 | |||
926 | if (unlikely(delta > schedstat_val(se->statistics.block_max))) | ||
927 | schedstat_set(se->statistics.block_max, delta); | ||
928 | |||
929 | schedstat_set(se->statistics.block_start, 0); | ||
930 | schedstat_add(se->statistics.sum_sleep_runtime, delta); | ||
931 | |||
932 | if (tsk) { | ||
933 | if (tsk->in_iowait) { | ||
934 | schedstat_add(se->statistics.iowait_sum, delta); | ||
935 | schedstat_inc(se->statistics.iowait_count); | ||
936 | trace_sched_stat_iowait(tsk, delta); | ||
937 | } | ||
938 | |||
939 | trace_sched_stat_blocked(tsk, delta); | ||
940 | |||
941 | /* | ||
942 | * Blocking time is in units of nanosecs, so shift by | ||
943 | * 20 to get a milliseconds-range estimation of the | ||
944 | * amount of time that the task spent sleeping: | ||
945 | */ | ||
946 | if (unlikely(prof_on == SLEEP_PROFILING)) { | ||
947 | profile_hits(SLEEP_PROFILING, | ||
948 | (void *)get_wchan(tsk), | ||
949 | delta >> 20); | ||
950 | } | ||
951 | account_scheduler_latency(tsk, delta >> 10, 0); | ||
952 | } | ||
953 | } | ||
862 | } | 954 | } |
863 | 955 | ||
864 | /* | 956 | /* |
865 | * Task is being enqueued - update stats: | 957 | * Task is being enqueued - update stats: |
866 | */ | 958 | */ |
867 | static inline void | 959 | static inline void |
868 | update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 960 | update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
869 | { | 961 | { |
962 | if (!schedstat_enabled()) | ||
963 | return; | ||
964 | |||
870 | /* | 965 | /* |
871 | * Are we enqueueing a waiting task? (for current tasks | 966 | * Are we enqueueing a waiting task? (for current tasks |
872 | * a dequeue/enqueue event is a NOP) | 967 | * a dequeue/enqueue event is a NOP) |
873 | */ | 968 | */ |
874 | if (se != cfs_rq->curr) | 969 | if (se != cfs_rq->curr) |
875 | update_stats_wait_start(cfs_rq, se); | 970 | update_stats_wait_start(cfs_rq, se); |
971 | |||
972 | if (flags & ENQUEUE_WAKEUP) | ||
973 | update_stats_enqueue_sleeper(cfs_rq, se); | ||
876 | } | 974 | } |
877 | 975 | ||
878 | static inline void | 976 | static inline void |
879 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | 977 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
880 | { | 978 | { |
979 | |||
980 | if (!schedstat_enabled()) | ||
981 | return; | ||
982 | |||
881 | /* | 983 | /* |
882 | * Mark the end of the wait period if dequeueing a | 984 | * Mark the end of the wait period if dequeueing a |
883 | * waiting task: | 985 | * waiting task: |
@@ -885,40 +987,18 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
885 | if (se != cfs_rq->curr) | 987 | if (se != cfs_rq->curr) |
886 | update_stats_wait_end(cfs_rq, se); | 988 | update_stats_wait_end(cfs_rq, se); |
887 | 989 | ||
888 | if (flags & DEQUEUE_SLEEP) { | 990 | if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { |
889 | if (entity_is_task(se)) { | 991 | struct task_struct *tsk = task_of(se); |
890 | struct task_struct *tsk = task_of(se); | ||
891 | 992 | ||
892 | if (tsk->state & TASK_INTERRUPTIBLE) | 993 | if (tsk->state & TASK_INTERRUPTIBLE) |
893 | se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); | 994 | schedstat_set(se->statistics.sleep_start, |
894 | if (tsk->state & TASK_UNINTERRUPTIBLE) | 995 | rq_clock(rq_of(cfs_rq))); |
895 | se->statistics.block_start = rq_clock(rq_of(cfs_rq)); | 996 | if (tsk->state & TASK_UNINTERRUPTIBLE) |
896 | } | 997 | schedstat_set(se->statistics.block_start, |
998 | rq_clock(rq_of(cfs_rq))); | ||
897 | } | 999 | } |
898 | |||
899 | } | ||
900 | #else | ||
901 | static inline void | ||
902 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
903 | { | ||
904 | } | 1000 | } |
905 | 1001 | ||
906 | static inline void | ||
907 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
908 | { | ||
909 | } | ||
910 | |||
911 | static inline void | ||
912 | update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
913 | { | ||
914 | } | ||
915 | |||
916 | static inline void | ||
917 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | ||
918 | { | ||
919 | } | ||
920 | #endif | ||
921 | |||
922 | /* | 1002 | /* |
923 | * We are picking a new current task - update its stats: | 1003 | * We are picking a new current task - update its stats: |
924 | */ | 1004 | */ |
@@ -1513,8 +1593,16 @@ balance: | |||
1513 | * One idle CPU per node is evaluated for a task numa move. | 1593 | * One idle CPU per node is evaluated for a task numa move. |
1514 | * Call select_idle_sibling to maybe find a better one. | 1594 | * Call select_idle_sibling to maybe find a better one. |
1515 | */ | 1595 | */ |
1516 | if (!cur) | 1596 | if (!cur) { |
1517 | env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); | 1597 | /* |
1598 | * select_idle_sibling() uses a per-cpu cpumask that | ||
1599 | * can be used from IRQ context. | ||
1600 | */ | ||
1601 | local_irq_disable(); | ||
1602 | env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, | ||
1603 | env->dst_cpu); | ||
1604 | local_irq_enable(); | ||
1605 | } | ||
1518 | 1606 | ||
1519 | assign: | 1607 | assign: |
1520 | task_numa_assign(env, cur, imp); | 1608 | task_numa_assign(env, cur, imp); |
@@ -2292,7 +2380,7 @@ void task_numa_work(struct callback_head *work) | |||
2292 | unsigned long nr_pte_updates = 0; | 2380 | unsigned long nr_pte_updates = 0; |
2293 | long pages, virtpages; | 2381 | long pages, virtpages; |
2294 | 2382 | ||
2295 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); | 2383 | SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); |
2296 | 2384 | ||
2297 | work->next = work; /* protect against double add */ | 2385 | work->next = work; /* protect against double add */ |
2298 | /* | 2386 | /* |
@@ -2803,9 +2891,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, | |||
2803 | } | 2891 | } |
2804 | 2892 | ||
2805 | #ifdef CONFIG_FAIR_GROUP_SCHED | 2893 | #ifdef CONFIG_FAIR_GROUP_SCHED |
2806 | /* | 2894 | /** |
2807 | * Updating tg's load_avg is necessary before update_cfs_share (which is done) | 2895 | * update_tg_load_avg - update the tg's load avg |
2808 | * and effective_load (which is not done because it is too costly). | 2896 | * @cfs_rq: the cfs_rq whose avg changed |
2897 | * @force: update regardless of how small the difference | ||
2898 | * | ||
2899 | * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. | ||
2900 | * However, because tg->load_avg is a global value there are performance | ||
2901 | * considerations. | ||
2902 | * | ||
2903 | * In order to avoid having to look at the other cfs_rq's, we use a | ||
2904 | * differential update where we store the last value we propagated. This in | ||
2905 | * turn allows skipping updates if the differential is 'small'. | ||
2906 | * | ||
2907 | * Updating tg's load_avg is necessary before update_cfs_shares() (which is | ||
2908 | * done) and effective_load() (which is not done because it is too costly). | ||
2809 | */ | 2909 | */ |
2810 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) | 2910 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) |
2811 | { | 2911 | { |
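The differential update described in the comment above can be modelled outside the kernel as a contributor that remembers the last value it published to a shared sum and only publishes again when the change is large enough (or when forced). The 1/64 skip threshold below is an assumption for the sketch, not something this hunk shows:

    #include <stdatomic.h>
    #include <stdio.h>

    struct shared_sum { _Atomic long total; };               /* stands in for tg->load_avg */
    struct contributor { long value; long last_published; }; /* one per cfs_rq */

    static void maybe_publish(struct shared_sum *s, struct contributor *c, int force)
    {
        long delta = c->value - c->last_published;

        /* Skip small changes unless forced; 1/64 threshold assumed for illustration. */
        if (force || delta > c->last_published / 64 || -delta > c->last_published / 64) {
            atomic_fetch_add(&s->total, delta);
            c->last_published = c->value;
        }
    }

    int main(void)
    {
        struct shared_sum sum = { 0 };
        struct contributor c = { .value = 1000, .last_published = 0 };

        maybe_publish(&sum, &c, 0);     /* big change: published, total = 1000 */
        c.value = 1005;
        maybe_publish(&sum, &c, 0);     /* +5 is below the threshold: skipped */
        maybe_publish(&sum, &c, 1);     /* forced: total = 1005 */
        printf("total = %ld\n", atomic_load(&sum.total));
        return 0;
    }
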
@@ -2875,12 +2975,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} | |||
2875 | 2975 | ||
2876 | static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | 2976 | static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) |
2877 | { | 2977 | { |
2878 | struct rq *rq = rq_of(cfs_rq); | 2978 | if (&this_rq()->cfs == cfs_rq) { |
2879 | int cpu = cpu_of(rq); | ||
2880 | |||
2881 | if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { | ||
2882 | unsigned long max = rq->cpu_capacity_orig; | ||
2883 | |||
2884 | /* | 2979 | /* |
2885 | * There are a few boundary cases this might miss but it should | 2980 | * There are a few boundary cases this might miss but it should |
2886 | * get called often enough that that should (hopefully) not be | 2981 | * get called often enough that that should (hopefully) not be |
@@ -2897,8 +2992,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | |||
2897 | * | 2992 | * |
2898 | * See cpu_util(). | 2993 | * See cpu_util(). |
2899 | */ | 2994 | */ |
2900 | cpufreq_update_util(rq_clock(rq), | 2995 | cpufreq_update_util(rq_of(cfs_rq), 0); |
2901 | min(cfs_rq->avg.util_avg, max), max); | ||
2902 | } | 2996 | } |
2903 | } | 2997 | } |
2904 | 2998 | ||
@@ -2931,10 +3025,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | |||
2931 | * | 3025 | * |
2932 | * cfs_rq->avg is used for task_h_load() and update_cfs_shares() for example. | 3026
2933 | * | 3027 | * |
2934 | * Returns true if the load decayed or we removed utilization. It is expected | 3028 | * Returns true if the load decayed or we removed load. |
2935 | * that one calls update_tg_load_avg() on this condition, but after you've | 3029 | * |
2936 | * modified the cfs_rq avg (attach/detach), such that we propagate the new | 3030 | * Since both these conditions indicate a changed cfs_rq->avg.load we should |
2937 | * avg up. | 3031 | * call update_tg_load_avg() when this function returns true. |
2938 | */ | 3032 | */ |
2939 | static inline int | 3033 | static inline int |
2940 | update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) | 3034 | update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) |
@@ -3159,10 +3253,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) | |||
3159 | 3253 | ||
3160 | static inline void update_load_avg(struct sched_entity *se, int not_used) | 3254 | static inline void update_load_avg(struct sched_entity *se, int not_used) |
3161 | { | 3255 | { |
3162 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 3256 | cpufreq_update_util(rq_of(cfs_rq_of(se)), 0); |
3163 | struct rq *rq = rq_of(cfs_rq); | ||
3164 | |||
3165 | cpufreq_trigger_update(rq_clock(rq)); | ||
3166 | } | 3257 | } |
3167 | 3258 | ||
3168 | static inline void | 3259 | static inline void |
@@ -3183,68 +3274,6 @@ static inline int idle_balance(struct rq *rq) | |||
3183 | 3274 | ||
3184 | #endif /* CONFIG_SMP */ | 3275 | #endif /* CONFIG_SMP */ |
3185 | 3276 | ||
3186 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
3187 | { | ||
3188 | #ifdef CONFIG_SCHEDSTATS | ||
3189 | struct task_struct *tsk = NULL; | ||
3190 | |||
3191 | if (entity_is_task(se)) | ||
3192 | tsk = task_of(se); | ||
3193 | |||
3194 | if (se->statistics.sleep_start) { | ||
3195 | u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start; | ||
3196 | |||
3197 | if ((s64)delta < 0) | ||
3198 | delta = 0; | ||
3199 | |||
3200 | if (unlikely(delta > se->statistics.sleep_max)) | ||
3201 | se->statistics.sleep_max = delta; | ||
3202 | |||
3203 | se->statistics.sleep_start = 0; | ||
3204 | se->statistics.sum_sleep_runtime += delta; | ||
3205 | |||
3206 | if (tsk) { | ||
3207 | account_scheduler_latency(tsk, delta >> 10, 1); | ||
3208 | trace_sched_stat_sleep(tsk, delta); | ||
3209 | } | ||
3210 | } | ||
3211 | if (se->statistics.block_start) { | ||
3212 | u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start; | ||
3213 | |||
3214 | if ((s64)delta < 0) | ||
3215 | delta = 0; | ||
3216 | |||
3217 | if (unlikely(delta > se->statistics.block_max)) | ||
3218 | se->statistics.block_max = delta; | ||
3219 | |||
3220 | se->statistics.block_start = 0; | ||
3221 | se->statistics.sum_sleep_runtime += delta; | ||
3222 | |||
3223 | if (tsk) { | ||
3224 | if (tsk->in_iowait) { | ||
3225 | se->statistics.iowait_sum += delta; | ||
3226 | se->statistics.iowait_count++; | ||
3227 | trace_sched_stat_iowait(tsk, delta); | ||
3228 | } | ||
3229 | |||
3230 | trace_sched_stat_blocked(tsk, delta); | ||
3231 | |||
3232 | /* | ||
3233 | * Blocking time is in units of nanosecs, so shift by | ||
3234 | * 20 to get a milliseconds-range estimation of the | ||
3235 | * amount of time that the task spent sleeping: | ||
3236 | */ | ||
3237 | if (unlikely(prof_on == SLEEP_PROFILING)) { | ||
3238 | profile_hits(SLEEP_PROFILING, | ||
3239 | (void *)get_wchan(tsk), | ||
3240 | delta >> 20); | ||
3241 | } | ||
3242 | account_scheduler_latency(tsk, delta >> 10, 0); | ||
3243 | } | ||
3244 | } | ||
3245 | #endif | ||
3246 | } | ||
3247 | |||
3248 | static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) | 3277 | static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) |
3249 | { | 3278 | { |
3250 | #ifdef CONFIG_SCHED_DEBUG | 3279 | #ifdef CONFIG_SCHED_DEBUG |
@@ -3254,7 +3283,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
3254 | d = -d; | 3283 | d = -d; |
3255 | 3284 | ||
3256 | if (d > 3*sysctl_sched_latency) | 3285 | if (d > 3*sysctl_sched_latency) |
3257 | schedstat_inc(cfs_rq, nr_spread_over); | 3286 | schedstat_inc(cfs_rq->nr_spread_over); |
3258 | #endif | 3287 | #endif |
3259 | } | 3288 | } |
3260 | 3289 | ||
@@ -3371,17 +3400,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3371 | account_entity_enqueue(cfs_rq, se); | 3400 | account_entity_enqueue(cfs_rq, se); |
3372 | update_cfs_shares(cfs_rq); | 3401 | update_cfs_shares(cfs_rq); |
3373 | 3402 | ||
3374 | if (flags & ENQUEUE_WAKEUP) { | 3403 | if (flags & ENQUEUE_WAKEUP) |
3375 | place_entity(cfs_rq, se, 0); | 3404 | place_entity(cfs_rq, se, 0); |
3376 | if (schedstat_enabled()) | ||
3377 | enqueue_sleeper(cfs_rq, se); | ||
3378 | } | ||
3379 | 3405 | ||
3380 | check_schedstat_required(); | 3406 | check_schedstat_required(); |
3381 | if (schedstat_enabled()) { | 3407 | update_stats_enqueue(cfs_rq, se, flags); |
3382 | update_stats_enqueue(cfs_rq, se); | 3408 | check_spread(cfs_rq, se); |
3383 | check_spread(cfs_rq, se); | ||
3384 | } | ||
3385 | if (!curr) | 3409 | if (!curr) |
3386 | __enqueue_entity(cfs_rq, se); | 3410 | __enqueue_entity(cfs_rq, se); |
3387 | se->on_rq = 1; | 3411 | se->on_rq = 1; |
@@ -3448,8 +3472,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3448 | update_curr(cfs_rq); | 3472 | update_curr(cfs_rq); |
3449 | dequeue_entity_load_avg(cfs_rq, se); | 3473 | dequeue_entity_load_avg(cfs_rq, se); |
3450 | 3474 | ||
3451 | if (schedstat_enabled()) | 3475 | update_stats_dequeue(cfs_rq, se, flags); |
3452 | update_stats_dequeue(cfs_rq, se, flags); | ||
3453 | 3476 | ||
3454 | clear_buddies(cfs_rq, se); | 3477 | clear_buddies(cfs_rq, se); |
3455 | 3478 | ||
@@ -3459,9 +3482,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3459 | account_entity_dequeue(cfs_rq, se); | 3482 | account_entity_dequeue(cfs_rq, se); |
3460 | 3483 | ||
3461 | /* | 3484 | /* |
3462 | * Normalize the entity after updating the min_vruntime because the | 3485 | * Normalize after update_curr(); which will also have moved |
3463 | * update can refer to the ->curr item and we need to reflect this | 3486 | * min_vruntime if @se is the one holding it back. But before doing |
3464 | * movement in our normalized position. | 3487 | * update_min_vruntime() again, which will discount @se's position and |
3488 | * can move min_vruntime forward still more. | ||
3465 | */ | 3489 | */ |
3466 | if (!(flags & DEQUEUE_SLEEP)) | 3490 | if (!(flags & DEQUEUE_SLEEP)) |
3467 | se->vruntime -= cfs_rq->min_vruntime; | 3491 | se->vruntime -= cfs_rq->min_vruntime; |
@@ -3469,8 +3493,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3469 | /* return excess runtime on last dequeue */ | 3493 | /* return excess runtime on last dequeue */ |
3470 | return_cfs_rq_runtime(cfs_rq); | 3494 | return_cfs_rq_runtime(cfs_rq); |
3471 | 3495 | ||
3472 | update_min_vruntime(cfs_rq); | ||
3473 | update_cfs_shares(cfs_rq); | 3496 | update_cfs_shares(cfs_rq); |
3497 | |||
3498 | /* | ||
3499 | * Now advance min_vruntime if @se was the entity holding it back, | ||
3500 | * except when DEQUEUE_SAVE && !DEQUEUE_MOVE: in that case we'll be | ||
3501 | * put back on, and if we advance min_vruntime, we'll be placed back | ||
3502 | * further than we started -- i.e. we'll be penalized. | ||
3503 | */ | ||
3504 | if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) | ||
3505 | update_min_vruntime(cfs_rq); | ||
3474 | } | 3506 | } |
3475 | 3507 | ||
3476 | /* | 3508 | /* |
@@ -3523,25 +3555,25 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
3523 | * a CPU. So account for the time it spent waiting on the | 3555 | * a CPU. So account for the time it spent waiting on the |
3524 | * runqueue. | 3556 | * runqueue. |
3525 | */ | 3557 | */ |
3526 | if (schedstat_enabled()) | 3558 | update_stats_wait_end(cfs_rq, se); |
3527 | update_stats_wait_end(cfs_rq, se); | ||
3528 | __dequeue_entity(cfs_rq, se); | 3559 | __dequeue_entity(cfs_rq, se); |
3529 | update_load_avg(se, 1); | 3560 | update_load_avg(se, 1); |
3530 | } | 3561 | } |
3531 | 3562 | ||
3532 | update_stats_curr_start(cfs_rq, se); | 3563 | update_stats_curr_start(cfs_rq, se); |
3533 | cfs_rq->curr = se; | 3564 | cfs_rq->curr = se; |
3534 | #ifdef CONFIG_SCHEDSTATS | 3565 | |
3535 | /* | 3566 | /* |
3536 | * Track our maximum slice length, if the CPU's load is at | 3567 | * Track our maximum slice length, if the CPU's load is at |
3538 | * least twice that of our own weight (i.e. don't track it | 3569
3538 | * when there are only lesser-weight tasks around): | 3569 | * when there are only lesser-weight tasks around): |
3539 | */ | 3570 | */ |
3540 | if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { | 3571 | if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { |
3541 | se->statistics.slice_max = max(se->statistics.slice_max, | 3572 | schedstat_set(se->statistics.slice_max, |
3542 | se->sum_exec_runtime - se->prev_sum_exec_runtime); | 3573 | max((u64)schedstat_val(se->statistics.slice_max), |
3574 | se->sum_exec_runtime - se->prev_sum_exec_runtime)); | ||
3543 | } | 3575 | } |
3544 | #endif | 3576 | |
3545 | se->prev_sum_exec_runtime = se->sum_exec_runtime; | 3577 | se->prev_sum_exec_runtime = se->sum_exec_runtime; |
3546 | } | 3578 | } |
3547 | 3579 | ||
@@ -3620,13 +3652,10 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
3620 | /* throttle cfs_rqs exceeding runtime */ | 3652 | /* throttle cfs_rqs exceeding runtime */ |
3621 | check_cfs_rq_runtime(cfs_rq); | 3653 | check_cfs_rq_runtime(cfs_rq); |
3622 | 3654 | ||
3623 | if (schedstat_enabled()) { | 3655 | check_spread(cfs_rq, prev); |
3624 | check_spread(cfs_rq, prev); | ||
3625 | if (prev->on_rq) | ||
3626 | update_stats_wait_start(cfs_rq, prev); | ||
3627 | } | ||
3628 | 3656 | ||
3629 | if (prev->on_rq) { | 3657 | if (prev->on_rq) { |
3658 | update_stats_wait_start(cfs_rq, prev); | ||
3630 | /* Put 'current' back into the tree. */ | 3659 | /* Put 'current' back into the tree. */ |
3631 | __enqueue_entity(cfs_rq, prev); | 3660 | __enqueue_entity(cfs_rq, prev); |
3632 | /* in !on_rq case, update occurred at dequeue */ | 3661 | /* in !on_rq case, update occurred at dequeue */ |
@@ -4456,9 +4485,9 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | |||
4456 | struct sched_entity *se = &p->se; | 4485 | struct sched_entity *se = &p->se; |
4457 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 4486 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
4458 | 4487 | ||
4459 | WARN_ON(task_rq(p) != rq); | 4488 | SCHED_WARN_ON(task_rq(p) != rq); |
4460 | 4489 | ||
4461 | if (cfs_rq->nr_running > 1) { | 4490 | if (rq->cfs.h_nr_running > 1) { |
4462 | u64 slice = sched_slice(cfs_rq, se); | 4491 | u64 slice = sched_slice(cfs_rq, se); |
4463 | u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; | 4492 | u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; |
4464 | s64 delta = slice - ran; | 4493 | s64 delta = slice - ran; |
@@ -4509,6 +4538,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
4509 | struct cfs_rq *cfs_rq; | 4538 | struct cfs_rq *cfs_rq; |
4510 | struct sched_entity *se = &p->se; | 4539 | struct sched_entity *se = &p->se; |
4511 | 4540 | ||
4541 | /* | ||
4542 | * If in_iowait is set, the code below may not trigger any cpufreq | ||
4543 | * utilization updates, so do it here explicitly with the IOWAIT flag | ||
4544 | * passed. | ||
4545 | */ | ||
4546 | if (p->in_iowait) | ||
4547 | cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); | ||
4548 | |||
4512 | for_each_sched_entity(se) { | 4549 | for_each_sched_entity(se) { |
4513 | if (se->on_rq) | 4550 | if (se->on_rq) |
4514 | break; | 4551 | break; |
@@ -4605,6 +4642,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
4605 | } | 4642 | } |
4606 | 4643 | ||
4607 | #ifdef CONFIG_SMP | 4644 | #ifdef CONFIG_SMP |
4645 | |||
4646 | /* Working cpumask for: load_balance, load_balance_newidle. */ | ||
4647 | DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); | ||
4648 | DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); | ||
4649 | |||
4608 | #ifdef CONFIG_NO_HZ_COMMON | 4650 | #ifdef CONFIG_NO_HZ_COMMON |
4609 | /* | 4651 | /* |
4610 | * per rq 'load' array crap; XXX kill this. | 4652
@@ -5006,9 +5048,9 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
5006 | * wl = S * s'_i; see (2) | 5048 | * wl = S * s'_i; see (2) |
5007 | */ | 5049 | */ |
5008 | if (W > 0 && w < W) | 5050 | if (W > 0 && w < W) |
5009 | wl = (w * (long)tg->shares) / W; | 5051 | wl = (w * (long)scale_load_down(tg->shares)) / W; |
5010 | else | 5052 | else |
5011 | wl = tg->shares; | 5053 | wl = scale_load_down(tg->shares); |
5012 | 5054 | ||
5013 | /* | 5055 | /* |
5014 | * Per the above, wl is the new se->load.weight value; since | 5056 | * Per the above, wl is the new se->load.weight value; since |
@@ -5091,18 +5133,18 @@ static int wake_wide(struct task_struct *p) | |||
5091 | return 1; | 5133 | return 1; |
5092 | } | 5134 | } |
5093 | 5135 | ||
5094 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | 5136 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, |
5137 | int prev_cpu, int sync) | ||
5095 | { | 5138 | { |
5096 | s64 this_load, load; | 5139 | s64 this_load, load; |
5097 | s64 this_eff_load, prev_eff_load; | 5140 | s64 this_eff_load, prev_eff_load; |
5098 | int idx, this_cpu, prev_cpu; | 5141 | int idx, this_cpu; |
5099 | struct task_group *tg; | 5142 | struct task_group *tg; |
5100 | unsigned long weight; | 5143 | unsigned long weight; |
5101 | int balanced; | 5144 | int balanced; |
5102 | 5145 | ||
5103 | idx = sd->wake_idx; | 5146 | idx = sd->wake_idx; |
5104 | this_cpu = smp_processor_id(); | 5147 | this_cpu = smp_processor_id(); |
5105 | prev_cpu = task_cpu(p); | ||
5106 | load = source_load(prev_cpu, idx); | 5148 | load = source_load(prev_cpu, idx); |
5107 | this_load = target_load(this_cpu, idx); | 5149 | this_load = target_load(this_cpu, idx); |
5108 | 5150 | ||
@@ -5146,13 +5188,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
5146 | 5188 | ||
5147 | balanced = this_eff_load <= prev_eff_load; | 5189 | balanced = this_eff_load <= prev_eff_load; |
5148 | 5190 | ||
5149 | schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); | 5191 | schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); |
5150 | 5192 | ||
5151 | if (!balanced) | 5193 | if (!balanced) |
5152 | return 0; | 5194 | return 0; |
5153 | 5195 | ||
5154 | schedstat_inc(sd, ttwu_move_affine); | 5196 | schedstat_inc(sd->ttwu_move_affine); |
5155 | schedstat_inc(p, se.statistics.nr_wakeups_affine); | 5197 | schedstat_inc(p->se.statistics.nr_wakeups_affine); |
5156 | 5198 | ||
5157 | return 1; | 5199 | return 1; |
5158 | } | 5200 | } |
@@ -5228,6 +5270,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
5228 | int shallowest_idle_cpu = -1; | 5270 | int shallowest_idle_cpu = -1; |
5229 | int i; | 5271 | int i; |
5230 | 5272 | ||
5273 | /* Check if we have any choice: */ | ||
5274 | if (group->group_weight == 1) | ||
5275 | return cpumask_first(sched_group_cpus(group)); | ||
5276 | |||
5231 | /* Traverse only the allowed CPUs */ | 5277 | /* Traverse only the allowed CPUs */ |
5232 | for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { | 5278 | for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { |
5233 | if (idle_cpu(i)) { | 5279 | if (idle_cpu(i)) { |
@@ -5265,64 +5311,242 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
5265 | } | 5311 | } |
5266 | 5312 | ||
5267 | /* | 5313 | /* |
5268 | * Try and locate an idle CPU in the sched_domain. | 5314 | * Implement a for_each_cpu() variant that starts the scan at a given cpu |
5315 | * (@start), and wraps around. | ||
5316 | * | ||
5317 | * This is used to scan for idle CPUs; it ensures that not all CPUs looking | ||
5318 | * for an idle CPU find the same one. The downside is that tasks tend to | ||
5319 | * cycle through the LLC domain. | ||
5320 | * | ||
5321 | * tbench in particular was found to be sensitive to this. | ||
5322 | */ | ||
5323 | |||
5324 | static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped) | ||
5325 | { | ||
5326 | int next; | ||
5327 | |||
5328 | again: | ||
5329 | next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1); | ||
5330 | |||
5331 | if (*wrapped) { | ||
5332 | if (next >= start) | ||
5333 | return nr_cpumask_bits; | ||
5334 | } else { | ||
5335 | if (next >= nr_cpumask_bits) { | ||
5336 | *wrapped = 1; | ||
5337 | n = -1; | ||
5338 | goto again; | ||
5339 | } | ||
5340 | } | ||
5341 | |||
5342 | return next; | ||
5343 | } | ||
5344 | |||
5345 | #define for_each_cpu_wrap(cpu, mask, start, wrap) \ | ||
5346 | for ((wrap) = 0, (cpu) = (start)-1; \ | ||
5347 | (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \ | ||
5348 | (cpu) < nr_cpumask_bits; ) | ||
5349 | |||
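A minimal user-space model of the wrap-around iteration defined above, with an 8-bit mask standing in for the cpumask and made-up helper names; it visits every set bit exactly once, starting at @start:

    #include <stdio.h>

    #define NBITS 8

    static int find_next_set(unsigned mask, int n)
    {
        for (n = n + 1; n < NBITS; n++)
            if (mask & (1u << n))
                return n;
        return NBITS;
    }

    /* Same structure as cpumask_next_wrap(): scan forward, wrap once, stop at start. */
    static int next_set_wrap(unsigned mask, int n, int start, int *wrapped)
    {
        int next;
    again:
        next = find_next_set(mask, n);

        if (*wrapped) {
            if (next >= start)
                return NBITS;
        } else {
            if (next >= NBITS) {
                *wrapped = 1;
                n = -1;
                goto again;
            }
        }
        return next;
    }

    int main(void)
    {
        unsigned mask = 0xb6;           /* bits 1, 2, 4, 5, 7 set */
        int start = 4, wrapped = 0, bit;

        /* Mirrors for_each_cpu_wrap(): prints 4 5 7 1 2 */
        for (bit = start - 1; (bit = next_set_wrap(mask, bit, start, &wrapped)) < NBITS; )
            printf("%d ", bit);
        printf("\n");
        return 0;
    }
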
5350 | #ifdef CONFIG_SCHED_SMT | ||
5351 | |||
5352 | static inline void set_idle_cores(int cpu, int val) | ||
5353 | { | ||
5354 | struct sched_domain_shared *sds; | ||
5355 | |||
5356 | sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); | ||
5357 | if (sds) | ||
5358 | WRITE_ONCE(sds->has_idle_cores, val); | ||
5359 | } | ||
5360 | |||
5361 | static inline bool test_idle_cores(int cpu, bool def) | ||
5362 | { | ||
5363 | struct sched_domain_shared *sds; | ||
5364 | |||
5365 | sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); | ||
5366 | if (sds) | ||
5367 | return READ_ONCE(sds->has_idle_cores); | ||
5368 | |||
5369 | return def; | ||
5370 | } | ||
5371 | |||
5372 | /* | ||
5373 | * Scans the local SMT mask to see if the entire core is idle, and records this | ||
5374 | * information in sd_llc_shared->has_idle_cores. | ||
5375 | * | ||
5376 | * Since SMT siblings share all cache levels, inspecting this limited remote | ||
5377 | * state should be fairly cheap. | ||
5378 | */ | ||
5379 | void __update_idle_core(struct rq *rq) | ||
5380 | { | ||
5381 | int core = cpu_of(rq); | ||
5382 | int cpu; | ||
5383 | |||
5384 | rcu_read_lock(); | ||
5385 | if (test_idle_cores(core, true)) | ||
5386 | goto unlock; | ||
5387 | |||
5388 | for_each_cpu(cpu, cpu_smt_mask(core)) { | ||
5389 | if (cpu == core) | ||
5390 | continue; | ||
5391 | |||
5392 | if (!idle_cpu(cpu)) | ||
5393 | goto unlock; | ||
5394 | } | ||
5395 | |||
5396 | set_idle_cores(core, 1); | ||
5397 | unlock: | ||
5398 | rcu_read_unlock(); | ||
5399 | } | ||
5400 | |||
5401 | /* | ||
5402 | * Scan the entire LLC domain for idle cores; this dynamically switches itself | ||
5403 | * off when there are no idle cores left in the system, as tracked through | ||
5404 | * sd_llc->shared->has_idle_cores and re-enabled by update_idle_core() above. | ||
5405 | */ | ||
5406 | static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) | ||
5407 | { | ||
5408 | struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); | ||
5409 | int core, cpu, wrap; | ||
5410 | |||
5411 | if (!static_branch_likely(&sched_smt_present)) | ||
5412 | return -1; | ||
5413 | |||
5414 | if (!test_idle_cores(target, false)) | ||
5415 | return -1; | ||
5416 | |||
5417 | cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p)); | ||
5418 | |||
5419 | for_each_cpu_wrap(core, cpus, target, wrap) { | ||
5420 | bool idle = true; | ||
5421 | |||
5422 | for_each_cpu(cpu, cpu_smt_mask(core)) { | ||
5423 | cpumask_clear_cpu(cpu, cpus); | ||
5424 | if (!idle_cpu(cpu)) | ||
5425 | idle = false; | ||
5426 | } | ||
5427 | |||
5428 | if (idle) | ||
5429 | return core; | ||
5430 | } | ||
5431 | |||
5432 | /* | ||
5433 | * Failed to find an idle core; stop looking for one. | ||
5434 | */ | ||
5435 | set_idle_cores(target, 0); | ||
5436 | |||
5437 | return -1; | ||
5438 | } | ||
5439 | |||
5440 | /* | ||
5441 | * Scan the local SMT mask for idle CPUs. | ||
5442 | */ | ||
5443 | static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) | ||
5444 | { | ||
5445 | int cpu; | ||
5446 | |||
5447 | if (!static_branch_likely(&sched_smt_present)) | ||
5448 | return -1; | ||
5449 | |||
5450 | for_each_cpu(cpu, cpu_smt_mask(target)) { | ||
5451 | if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) | ||
5452 | continue; | ||
5453 | if (idle_cpu(cpu)) | ||
5454 | return cpu; | ||
5455 | } | ||
5456 | |||
5457 | return -1; | ||
5458 | } | ||
5459 | |||
5460 | #else /* CONFIG_SCHED_SMT */ | ||
5461 | |||
5462 | static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) | ||
5463 | { | ||
5464 | return -1; | ||
5465 | } | ||
5466 | |||
5467 | static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) | ||
5468 | { | ||
5469 | return -1; | ||
5470 | } | ||
5471 | |||
5472 | #endif /* CONFIG_SCHED_SMT */ | ||
5473 | |||
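The has_idle_cores hint above turns the full-LLC core scan into a self-disabling search: a failed scan clears the flag, and a core observed to be fully idle sets it again. A standalone model with plain variables standing in for the shared per-LLC state (topology sizes and helper names are made up for the sketch):

    #include <stdbool.h>
    #include <stdio.h>

    #define NR_CORES   4
    #define NR_SMT     2

    static bool cpu_idle[NR_CORES][NR_SMT];
    static bool has_idle_cores;

    static int scan_for_idle_core(void)
    {
        int core, smt;

        if (!has_idle_cores)
            return -1;                  /* last scan failed: don't bother */

        for (core = 0; core < NR_CORES; core++) {
            bool idle = true;

            for (smt = 0; smt < NR_SMT; smt++)
                if (!cpu_idle[core][smt])
                    idle = false;
            if (idle)
                return core;
        }

        has_idle_cores = false;         /* nothing found: switch the scan off */
        return -1;
    }

    static void core_went_idle(int core)
    {
        (void)core;
        has_idle_cores = true;          /* re-enable scanning */
    }

    int main(void)
    {
        cpu_idle[2][0] = cpu_idle[2][1] = true;
        core_went_idle(2);
        printf("idle core: %d\n", scan_for_idle_core());   /* 2 */
        cpu_idle[2][0] = false;
        printf("idle core: %d\n", scan_for_idle_core());   /* -1, hint cleared */
        printf("idle core: %d\n", scan_for_idle_core());   /* -1, scan skipped */
        return 0;
    }
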
5474 | /* | ||
5475 | * Scan the LLC domain for idle CPUs; this is dynamically regulated by | ||
5476 | * comparing the average scan cost (tracked in sd->avg_scan_cost) against the | ||
5477 | * average idle time for this rq (as found in rq->avg_idle). | ||
5478 | */ | ||
5479 | static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) | ||
5480 | { | ||
5481 | struct sched_domain *this_sd; | ||
5482 | u64 avg_cost, avg_idle = this_rq()->avg_idle; | ||
5483 | u64 time, cost; | ||
5484 | s64 delta; | ||
5485 | int cpu, wrap; | ||
5486 | |||
5487 | this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); | ||
5488 | if (!this_sd) | ||
5489 | return -1; | ||
5490 | |||
5491 | avg_cost = this_sd->avg_scan_cost; | ||
5492 | |||
5493 | /* | ||
5494 | * Due to large variance we need a large fuzz factor; hackbench in | ||
5495 | * particular is sensitive here. | ||
5496 | */ | ||
5497 | if ((avg_idle / 512) < avg_cost) | ||
5498 | return -1; | ||
5499 | |||
5500 | time = local_clock(); | ||
5501 | |||
5502 | for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) { | ||
5503 | if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) | ||
5504 | continue; | ||
5505 | if (idle_cpu(cpu)) | ||
5506 | break; | ||
5507 | } | ||
5508 | |||
5509 | time = local_clock() - time; | ||
5510 | cost = this_sd->avg_scan_cost; | ||
5511 | delta = (s64)(time - cost) / 8; | ||
5512 | this_sd->avg_scan_cost += delta; | ||
5513 | |||
5514 | return cpu; | ||
5515 | } | ||
5516 | |||
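The cost tracking in select_idle_cpu() above is an exponential moving average with a 1/8 weight: avg_scan_cost += (sample - avg_scan_cost) / 8, and the scan is skipped whenever the rq's average idle time is less than 512 times that cost. A small standalone run of the averaging step (illustrative values only):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        int64_t avg_scan_cost = 0;
        int64_t samples[] = { 8000, 8000, 8000, 1000, 1000, 1000 };   /* ns */
        unsigned i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
            /* Same update rule as above: move 1/8 of the way toward the sample. */
            avg_scan_cost += (samples[i] - avg_scan_cost) / 8;
            printf("after sample %u: avg_scan_cost = %lld ns\n",
                   i + 1, (long long)avg_scan_cost);
        }
        return 0;
    }
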
5517 | /* | ||
5518 | * Try and locate an idle core/thread in the LLC cache domain. | ||
5269 | */ | 5519 | */ |
5270 | static int select_idle_sibling(struct task_struct *p, int target) | 5520 | static int select_idle_sibling(struct task_struct *p, int prev, int target) |
5271 | { | 5521 | { |
5272 | struct sched_domain *sd; | 5522 | struct sched_domain *sd; |
5273 | struct sched_group *sg; | 5523 | int i; |
5274 | int i = task_cpu(p); | ||
5275 | 5524 | ||
5276 | if (idle_cpu(target)) | 5525 | if (idle_cpu(target)) |
5277 | return target; | 5526 | return target; |
5278 | 5527 | ||
5279 | /* | 5528 | /* |
5280 | * If the prevous cpu is cache affine and idle, don't be stupid. | 5529 | * If the previous cpu is cache affine and idle, don't be stupid. |
5281 | */ | 5530 | */ |
5282 | if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) | 5531 | if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) |
5283 | return i; | 5532 | return prev; |
5284 | 5533 | ||
5285 | /* | ||
5286 | * Otherwise, iterate the domains and find an eligible idle cpu. | ||
5287 | * | ||
5288 | * A completely idle sched group at higher domains is more | ||
5289 | * desirable than an idle group at a lower level, because lower | ||
5290 | * domains have smaller groups and usually share hardware | ||
5291 | * resources which causes tasks to contend on them, e.g. x86 | ||
5292 | * hyperthread siblings in the lowest domain (SMT) can contend | ||
5293 | * on the shared cpu pipeline. | ||
5294 | * | ||
5295 | * However, while we prefer idle groups at higher domains | ||
5296 | * finding an idle cpu at the lowest domain is still better than | ||
5297 | * returning 'target', which we've already established, isn't | ||
5298 | * idle. | ||
5299 | */ | ||
5300 | sd = rcu_dereference(per_cpu(sd_llc, target)); | 5534 | sd = rcu_dereference(per_cpu(sd_llc, target)); |
5301 | for_each_lower_domain(sd) { | 5535 | if (!sd) |
5302 | sg = sd->groups; | 5536 | return target; |
5303 | do { | 5537 | |
5304 | if (!cpumask_intersects(sched_group_cpus(sg), | 5538 | i = select_idle_core(p, sd, target); |
5305 | tsk_cpus_allowed(p))) | 5539 | if ((unsigned)i < nr_cpumask_bits) |
5306 | goto next; | 5540 | return i; |
5307 | 5541 | ||
5308 | /* Ensure the entire group is idle */ | 5542 | i = select_idle_cpu(p, sd, target); |
5309 | for_each_cpu(i, sched_group_cpus(sg)) { | 5543 | if ((unsigned)i < nr_cpumask_bits) |
5310 | if (i == target || !idle_cpu(i)) | 5544 | return i; |
5311 | goto next; | 5545 | |
5312 | } | 5546 | i = select_idle_smt(p, sd, target); |
5547 | if ((unsigned)i < nr_cpumask_bits) | ||
5548 | return i; | ||
5313 | 5549 | ||
5314 | /* | ||
5315 | * It doesn't matter which cpu we pick, the | ||
5316 | * whole group is idle. | ||
5317 | */ | ||
5318 | target = cpumask_first_and(sched_group_cpus(sg), | ||
5319 | tsk_cpus_allowed(p)); | ||
5320 | goto done; | ||
5321 | next: | ||
5322 | sg = sg->next; | ||
5323 | } while (sg != sd->groups); | ||
5324 | } | ||
5325 | done: | ||
5326 | return target; | 5550 | return target; |
5327 | } | 5551 | } |
5328 | 5552 | ||
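The repeated (unsigned)i < nr_cpumask_bits checks above fold both failure encodings into one comparison: a -1 from the core/SMT helpers becomes a huge unsigned value, and a scan that ran off the end returns nr_cpumask_bits, so a single unsigned compare accepts only real CPU numbers. A tiny standalone demonstration:

    #include <stdio.h>

    #define NR_BITS 64

    static const char *classify(int i)
    {
        /* -1 and NR_BITS both fail this check; only 0..NR_BITS-1 pass. */
        return ((unsigned)i < NR_BITS) ? "valid cpu" : "no cpu found";
    }

    int main(void)
    {
        printf("%d -> %s\n", 3, classify(3));
        printf("%d -> %s\n", -1, classify(-1));
        printf("%d -> %s\n", NR_BITS, classify(NR_BITS));
        return 0;
    }
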
@@ -5360,6 +5584,32 @@ static int cpu_util(int cpu) | |||
5360 | return (util >= capacity) ? capacity : util; | 5584 | return (util >= capacity) ? capacity : util; |
5361 | } | 5585 | } |
5362 | 5586 | ||
5587 | static inline int task_util(struct task_struct *p) | ||
5588 | { | ||
5589 | return p->se.avg.util_avg; | ||
5590 | } | ||
5591 | |||
5592 | /* | ||
5593 | * Disable WAKE_AFFINE in the case where task @p doesn't fit in the | ||
5594 | * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. | ||
5595 | * | ||
5596 | * In that case WAKE_AFFINE doesn't make sense and we'll let | ||
5597 | * BALANCE_WAKE sort things out. | ||
5598 | */ | ||
5599 | static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) | ||
5600 | { | ||
5601 | long min_cap, max_cap; | ||
5602 | |||
5603 | min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); | ||
5604 | max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; | ||
5605 | |||
5606 | /* Minimum capacity is close to max, no need to abort wake_affine */ | ||
5607 | if (max_cap - min_cap < max_cap >> 3) | ||
5608 | return 0; | ||
5609 | |||
5610 | return min_cap * 1024 < task_util(p) * capacity_margin; | ||
5611 | } | ||
5612 | |||
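wake_cap() above only vetoes the affine wakeup when CPU capacities differ by more than roughly 12.5% (max_cap >> 3) and the task's utilization, scaled by capacity_margin, exceeds the smaller capacity. The sketch below models that fixed-point fit test; the value 1280 for capacity_margin (about 25% headroom) is assumed for illustration and is not taken from this hunk:

    #include <stdio.h>

    #define CAPACITY_MARGIN 1280    /* assumed: 1024 would mean no headroom */

    static int task_fits(long task_util, long cpu_cap)
    {
        /* Fixed-point compare: does the CPU capacity cover util plus margin? */
        return cpu_cap * 1024 >= task_util * CAPACITY_MARGIN;
    }

    int main(void)
    {
        /* A task with ~400 units of utilization fits a 1024-capacity big CPU
         * but not a 430-capacity little CPU once the margin is applied. */
        printf("fits big:    %d\n", task_fits(400, 1024));
        printf("fits little: %d\n", task_fits(400, 430));
        return 0;
    }
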
5363 | /* | 5613 | /* |
5364 | * select_task_rq_fair: Select target runqueue for the waking task in domains | 5614 | * select_task_rq_fair: Select target runqueue for the waking task in domains |
5365 | * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, | 5615 | * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, |
@@ -5383,7 +5633,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
5383 | 5633 | ||
5384 | if (sd_flag & SD_BALANCE_WAKE) { | 5634 | if (sd_flag & SD_BALANCE_WAKE) { |
5385 | record_wakee(p); | 5635 | record_wakee(p); |
5386 | want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); | 5636 | want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) |
5637 | && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); | ||
5387 | } | 5638 | } |
5388 | 5639 | ||
5389 | rcu_read_lock(); | 5640 | rcu_read_lock(); |
@@ -5409,13 +5660,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
5409 | 5660 | ||
5410 | if (affine_sd) { | 5661 | if (affine_sd) { |
5411 | sd = NULL; /* Prefer wake_affine over balance flags */ | 5662 | sd = NULL; /* Prefer wake_affine over balance flags */ |
5412 | if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) | 5663 | if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync)) |
5413 | new_cpu = cpu; | 5664 | new_cpu = cpu; |
5414 | } | 5665 | } |
5415 | 5666 | ||
5416 | if (!sd) { | 5667 | if (!sd) { |
5417 | if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ | 5668 | if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ |
5418 | new_cpu = select_idle_sibling(p, new_cpu); | 5669 | new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); |
5419 | 5670 | ||
5420 | } else while (sd) { | 5671 | } else while (sd) { |
5421 | struct sched_group *group; | 5672 | struct sched_group *group; |
@@ -5939,7 +6190,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
5939 | * | 6190 | * |
5940 | * The adjacency matrix of the resulting graph is given by: | 6191 | * The adjacency matrix of the resulting graph is given by: |
5941 | * | 6192 | * |
5942 | * log_2 n | 6193 | * log_2 n |
5943 | * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) | 6194 | * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) |
5944 | * k = 0 | 6195 | * k = 0 |
5945 | * | 6196 | * |
@@ -5985,7 +6236,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
5985 | * | 6236 | * |
5986 | * [XXX write more on how we solve this.. _after_ merging pjt's patches that | 6237 | * [XXX write more on how we solve this.. _after_ merging pjt's patches that |
5987 | * rewrite all of this once again.] | 6238 | * rewrite all of this once again.] |
5988 | */ | 6239 | */ |
5989 | 6240 | ||
5990 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; | 6241 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; |
5991 | 6242 | ||
@@ -6133,7 +6384,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
6133 | if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { | 6384 | if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { |
6134 | int cpu; | 6385 | int cpu; |
6135 | 6386 | ||
6136 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); | 6387 | schedstat_inc(p->se.statistics.nr_failed_migrations_affine); |
6137 | 6388 | ||
6138 | env->flags |= LBF_SOME_PINNED; | 6389 | env->flags |= LBF_SOME_PINNED; |
6139 | 6390 | ||
@@ -6164,7 +6415,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
6164 | env->flags &= ~LBF_ALL_PINNED; | 6415 | env->flags &= ~LBF_ALL_PINNED; |
6165 | 6416 | ||
6166 | if (task_running(env->src_rq, p)) { | 6417 | if (task_running(env->src_rq, p)) { |
6167 | schedstat_inc(p, se.statistics.nr_failed_migrations_running); | 6418 | schedstat_inc(p->se.statistics.nr_failed_migrations_running); |
6168 | return 0; | 6419 | return 0; |
6169 | } | 6420 | } |
6170 | 6421 | ||
@@ -6181,13 +6432,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
6181 | if (tsk_cache_hot <= 0 || | 6432 | if (tsk_cache_hot <= 0 || |
6182 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { | 6433 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { |
6183 | if (tsk_cache_hot == 1) { | 6434 | if (tsk_cache_hot == 1) { |
6184 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); | 6435 | schedstat_inc(env->sd->lb_hot_gained[env->idle]); |
6185 | schedstat_inc(p, se.statistics.nr_forced_migrations); | 6436 | schedstat_inc(p->se.statistics.nr_forced_migrations); |
6186 | } | 6437 | } |
6187 | return 1; | 6438 | return 1; |
6188 | } | 6439 | } |
6189 | 6440 | ||
6190 | schedstat_inc(p, se.statistics.nr_failed_migrations_hot); | 6441 | schedstat_inc(p->se.statistics.nr_failed_migrations_hot); |
6191 | return 0; | 6442 | return 0; |
6192 | } | 6443 | } |
6193 | 6444 | ||
@@ -6227,7 +6478,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) | |||
6227 | * so we can safely collect stats here rather than | 6478 | * so we can safely collect stats here rather than |
6228 | * inside detach_tasks(). | 6479 | * inside detach_tasks(). |
6229 | */ | 6480 | */ |
6230 | schedstat_inc(env->sd, lb_gained[env->idle]); | 6481 | schedstat_inc(env->sd->lb_gained[env->idle]); |
6231 | return p; | 6482 | return p; |
6232 | } | 6483 | } |
6233 | return NULL; | 6484 | return NULL; |
@@ -6319,7 +6570,7 @@ next: | |||
6319 | * so we can safely collect detach_one_task() stats here rather | 6570 | * so we can safely collect detach_one_task() stats here rather |
6320 | * than inside detach_one_task(). | 6571 | * than inside detach_one_task(). |
6321 | */ | 6572 | */ |
6322 | schedstat_add(env->sd, lb_gained[env->idle], detached); | 6573 | schedstat_add(env->sd->lb_gained[env->idle], detached); |
6323 | 6574 | ||
6324 | return detached; | 6575 | return detached; |
6325 | } | 6576 | } |
@@ -6647,7 +6898,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
6647 | /* | 6898 | /* |
6648 | * !SD_OVERLAP domains can assume that child groups | 6899 | * !SD_OVERLAP domains can assume that child groups |
6649 | * span the current group. | 6900 | * span the current group. |
6650 | */ | 6901 | */ |
6651 | 6902 | ||
6652 | group = child->groups; | 6903 | group = child->groups; |
6653 | do { | 6904 | do { |
@@ -7147,7 +7398,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
7147 | load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; | 7398 | load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; |
7148 | if (load_above_capacity > busiest->group_capacity) { | 7399 | if (load_above_capacity > busiest->group_capacity) { |
7149 | load_above_capacity -= busiest->group_capacity; | 7400 | load_above_capacity -= busiest->group_capacity; |
7150 | load_above_capacity *= NICE_0_LOAD; | 7401 | load_above_capacity *= scale_load_down(NICE_0_LOAD); |
7151 | load_above_capacity /= busiest->group_capacity; | 7402 | load_above_capacity /= busiest->group_capacity; |
7152 | } else | 7403 | } else |
7153 | load_above_capacity = ~0UL; | 7404 | load_above_capacity = ~0UL; |
@@ -7354,9 +7605,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
7354 | */ | 7605 | */ |
7355 | #define MAX_PINNED_INTERVAL 512 | 7606 | #define MAX_PINNED_INTERVAL 512 |
7356 | 7607 | ||
7357 | /* Working cpumask for load_balance and load_balance_newidle. */ | ||
7358 | DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); | ||
7359 | |||
7360 | static int need_active_balance(struct lb_env *env) | 7608 | static int need_active_balance(struct lb_env *env) |
7361 | { | 7609 | { |
7362 | struct sched_domain *sd = env->sd; | 7610 | struct sched_domain *sd = env->sd; |
@@ -7460,7 +7708,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
7460 | 7708 | ||
7461 | cpumask_copy(cpus, cpu_active_mask); | 7709 | cpumask_copy(cpus, cpu_active_mask); |
7462 | 7710 | ||
7463 | schedstat_inc(sd, lb_count[idle]); | 7711 | schedstat_inc(sd->lb_count[idle]); |
7464 | 7712 | ||
7465 | redo: | 7713 | redo: |
7466 | if (!should_we_balance(&env)) { | 7714 | if (!should_we_balance(&env)) { |
@@ -7470,19 +7718,19 @@ redo: | |||
7470 | 7718 | ||
7471 | group = find_busiest_group(&env); | 7719 | group = find_busiest_group(&env); |
7472 | if (!group) { | 7720 | if (!group) { |
7473 | schedstat_inc(sd, lb_nobusyg[idle]); | 7721 | schedstat_inc(sd->lb_nobusyg[idle]); |
7474 | goto out_balanced; | 7722 | goto out_balanced; |
7475 | } | 7723 | } |
7476 | 7724 | ||
7477 | busiest = find_busiest_queue(&env, group); | 7725 | busiest = find_busiest_queue(&env, group); |
7478 | if (!busiest) { | 7726 | if (!busiest) { |
7479 | schedstat_inc(sd, lb_nobusyq[idle]); | 7727 | schedstat_inc(sd->lb_nobusyq[idle]); |
7480 | goto out_balanced; | 7728 | goto out_balanced; |
7481 | } | 7729 | } |
7482 | 7730 | ||
7483 | BUG_ON(busiest == env.dst_rq); | 7731 | BUG_ON(busiest == env.dst_rq); |
7484 | 7732 | ||
7485 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); | 7733 | schedstat_add(sd->lb_imbalance[idle], env.imbalance); |
7486 | 7734 | ||
7487 | env.src_cpu = busiest->cpu; | 7735 | env.src_cpu = busiest->cpu; |
7488 | env.src_rq = busiest; | 7736 | env.src_rq = busiest; |
@@ -7589,7 +7837,7 @@ more_balance: | |||
7589 | } | 7837 | } |
7590 | 7838 | ||
7591 | if (!ld_moved) { | 7839 | if (!ld_moved) { |
7592 | schedstat_inc(sd, lb_failed[idle]); | 7840 | schedstat_inc(sd->lb_failed[idle]); |
7593 | /* | 7841 | /* |
7594 | * Increment the failure counter only on periodic balance. | 7842 | * Increment the failure counter only on periodic balance. |
7595 | * We do not want newidle balance, which can be very | 7843 | * We do not want newidle balance, which can be very |
@@ -7672,7 +7920,7 @@ out_all_pinned: | |||
7672 | * we can't migrate them. Let the imbalance flag set so parent level | 7920 | * we can't migrate them. Let the imbalance flag set so parent level |
7673 | * can try to migrate them. | 7921 | * can try to migrate them. |
7674 | */ | 7922 | */ |
7675 | schedstat_inc(sd, lb_balanced[idle]); | 7923 | schedstat_inc(sd->lb_balanced[idle]); |
7676 | 7924 | ||
7677 | sd->nr_balance_failed = 0; | 7925 | sd->nr_balance_failed = 0; |
7678 | 7926 | ||
@@ -7704,11 +7952,12 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) | |||
7704 | } | 7952 | } |
7705 | 7953 | ||
7706 | static inline void | 7954 | static inline void |
7707 | update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) | 7955 | update_next_balance(struct sched_domain *sd, unsigned long *next_balance) |
7708 | { | 7956 | { |
7709 | unsigned long interval, next; | 7957 | unsigned long interval, next; |
7710 | 7958 | ||
7711 | interval = get_sd_balance_interval(sd, cpu_busy); | 7959 | /* used by idle balance, so cpu_busy = 0 */ |
7960 | interval = get_sd_balance_interval(sd, 0); | ||
7712 | next = sd->last_balance + interval; | 7961 | next = sd->last_balance + interval; |
7713 | 7962 | ||
7714 | if (time_after(*next_balance, next)) | 7963 | if (time_after(*next_balance, next)) |
@@ -7738,7 +7987,7 @@ static int idle_balance(struct rq *this_rq) | |||
7738 | rcu_read_lock(); | 7987 | rcu_read_lock(); |
7739 | sd = rcu_dereference_check_sched_domain(this_rq->sd); | 7988 | sd = rcu_dereference_check_sched_domain(this_rq->sd); |
7740 | if (sd) | 7989 | if (sd) |
7741 | update_next_balance(sd, 0, &next_balance); | 7990 | update_next_balance(sd, &next_balance); |
7742 | rcu_read_unlock(); | 7991 | rcu_read_unlock(); |
7743 | 7992 | ||
7744 | goto out; | 7993 | goto out; |
@@ -7756,7 +8005,7 @@ static int idle_balance(struct rq *this_rq) | |||
7756 | continue; | 8005 | continue; |
7757 | 8006 | ||
7758 | if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { | 8007 | if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { |
7759 | update_next_balance(sd, 0, &next_balance); | 8008 | update_next_balance(sd, &next_balance); |
7760 | break; | 8009 | break; |
7761 | } | 8010 | } |
7762 | 8011 | ||
@@ -7774,7 +8023,7 @@ static int idle_balance(struct rq *this_rq) | |||
7774 | curr_cost += domain_cost; | 8023 | curr_cost += domain_cost; |
7775 | } | 8024 | } |
7776 | 8025 | ||
7777 | update_next_balance(sd, 0, &next_balance); | 8026 | update_next_balance(sd, &next_balance); |
7778 | 8027 | ||
7779 | /* | 8028 | /* |
7780 | * Stop searching for tasks to pull if there are | 8029 | * Stop searching for tasks to pull if there are |
@@ -7864,15 +8113,15 @@ static int active_load_balance_cpu_stop(void *data) | |||
7864 | .idle = CPU_IDLE, | 8113 | .idle = CPU_IDLE, |
7865 | }; | 8114 | }; |
7866 | 8115 | ||
7867 | schedstat_inc(sd, alb_count); | 8116 | schedstat_inc(sd->alb_count); |
7868 | 8117 | ||
7869 | p = detach_one_task(&env); | 8118 | p = detach_one_task(&env); |
7870 | if (p) { | 8119 | if (p) { |
7871 | schedstat_inc(sd, alb_pushed); | 8120 | schedstat_inc(sd->alb_pushed); |
7872 | /* Active balancing done, reset the failure counter. */ | 8121 | /* Active balancing done, reset the failure counter. */ |
7873 | sd->nr_balance_failed = 0; | 8122 | sd->nr_balance_failed = 0; |
7874 | } else { | 8123 | } else { |
7875 | schedstat_inc(sd, alb_failed); | 8124 | schedstat_inc(sd->alb_failed); |
7876 | } | 8125 | } |
7877 | } | 8126 | } |
7878 | rcu_read_unlock(); | 8127 | rcu_read_unlock(); |
@@ -7964,13 +8213,13 @@ static inline void set_cpu_sd_state_busy(void) | |||
7964 | int cpu = smp_processor_id(); | 8213 | int cpu = smp_processor_id(); |
7965 | 8214 | ||
7966 | rcu_read_lock(); | 8215 | rcu_read_lock(); |
7967 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 8216 | sd = rcu_dereference(per_cpu(sd_llc, cpu)); |
7968 | 8217 | ||
7969 | if (!sd || !sd->nohz_idle) | 8218 | if (!sd || !sd->nohz_idle) |
7970 | goto unlock; | 8219 | goto unlock; |
7971 | sd->nohz_idle = 0; | 8220 | sd->nohz_idle = 0; |
7972 | 8221 | ||
7973 | atomic_inc(&sd->groups->sgc->nr_busy_cpus); | 8222 | atomic_inc(&sd->shared->nr_busy_cpus); |
7974 | unlock: | 8223 | unlock: |
7975 | rcu_read_unlock(); | 8224 | rcu_read_unlock(); |
7976 | } | 8225 | } |
@@ -7981,13 +8230,13 @@ void set_cpu_sd_state_idle(void) | |||
7981 | int cpu = smp_processor_id(); | 8230 | int cpu = smp_processor_id(); |
7982 | 8231 | ||
7983 | rcu_read_lock(); | 8232 | rcu_read_lock(); |
7984 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 8233 | sd = rcu_dereference(per_cpu(sd_llc, cpu)); |
7985 | 8234 | ||
7986 | if (!sd || sd->nohz_idle) | 8235 | if (!sd || sd->nohz_idle) |
7987 | goto unlock; | 8236 | goto unlock; |
7988 | sd->nohz_idle = 1; | 8237 | sd->nohz_idle = 1; |
7989 | 8238 | ||
7990 | atomic_dec(&sd->groups->sgc->nr_busy_cpus); | 8239 | atomic_dec(&sd->shared->nr_busy_cpus); |
7991 | unlock: | 8240 | unlock: |
7992 | rcu_read_unlock(); | 8241 | rcu_read_unlock(); |
7993 | } | 8242 | } |
@@ -8214,8 +8463,8 @@ end: | |||
8214 | static inline bool nohz_kick_needed(struct rq *rq) | 8463 | static inline bool nohz_kick_needed(struct rq *rq) |
8215 | { | 8464 | { |
8216 | unsigned long now = jiffies; | 8465 | unsigned long now = jiffies; |
8466 | struct sched_domain_shared *sds; | ||
8217 | struct sched_domain *sd; | 8467 | struct sched_domain *sd; |
8218 | struct sched_group_capacity *sgc; | ||
8219 | int nr_busy, cpu = rq->cpu; | 8468 | int nr_busy, cpu = rq->cpu; |
8220 | bool kick = false; | 8469 | bool kick = false; |
8221 | 8470 | ||
@@ -8243,11 +8492,13 @@ static inline bool nohz_kick_needed(struct rq *rq) | |||
8243 | return true; | 8492 | return true; |
8244 | 8493 | ||
8245 | rcu_read_lock(); | 8494 | rcu_read_lock(); |
8246 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 8495 | sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); |
8247 | if (sd) { | 8496 | if (sds) { |
8248 | sgc = sd->groups->sgc; | 8497 | /* |
8249 | nr_busy = atomic_read(&sgc->nr_busy_cpus); | 8498 | * XXX: write a coherent comment on why we do this. |
8250 | 8499 | * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com | |
8500 | */ | ||
8501 | nr_busy = atomic_read(&sds->nr_busy_cpus); | ||
8251 | if (nr_busy > 1) { | 8502 | if (nr_busy > 1) { |
8252 | kick = true; | 8503 | kick = true; |
8253 | goto unlock; | 8504 | goto unlock; |
@@ -8283,7 +8534,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } | |||
8283 | * run_rebalance_domains is triggered when needed from the scheduler tick. | 8534 | * run_rebalance_domains is triggered when needed from the scheduler tick. |
8284 | * Also triggered for nohz idle balancing (with nohz_balancing_kick set). | 8535 | * Also triggered for nohz idle balancing (with nohz_balancing_kick set). |
8285 | */ | 8536 | */ |
8286 | static void run_rebalance_domains(struct softirq_action *h) | 8537 | static __latent_entropy void run_rebalance_domains(struct softirq_action *h) |
8287 | { | 8538 | { |
8288 | struct rq *this_rq = this_rq(); | 8539 | struct rq *this_rq = this_rq(); |
8289 | enum cpu_idle_type idle = this_rq->idle_balance ? | 8540 | enum cpu_idle_type idle = this_rq->idle_balance ? |
@@ -8441,7 +8692,6 @@ static void detach_task_cfs_rq(struct task_struct *p) | |||
8441 | struct sched_entity *se = &p->se; | 8692 | struct sched_entity *se = &p->se; |
8442 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 8693 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
8443 | u64 now = cfs_rq_clock_task(cfs_rq); | 8694 | u64 now = cfs_rq_clock_task(cfs_rq); |
8444 | int tg_update; | ||
8445 | 8695 | ||
8446 | if (!vruntime_normalized(p)) { | 8696 | if (!vruntime_normalized(p)) { |
8447 | /* | 8697 | /* |
@@ -8453,10 +8703,9 @@ static void detach_task_cfs_rq(struct task_struct *p) | |||
8453 | } | 8703 | } |
8454 | 8704 | ||
8455 | /* Catch up with the cfs_rq and remove our load when we leave */ | 8705 | /* Catch up with the cfs_rq and remove our load when we leave */ |
8456 | tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); | 8706 | update_cfs_rq_load_avg(now, cfs_rq, false); |
8457 | detach_entity_load_avg(cfs_rq, se); | 8707 | detach_entity_load_avg(cfs_rq, se); |
8458 | if (tg_update) | 8708 | update_tg_load_avg(cfs_rq, false); |
8459 | update_tg_load_avg(cfs_rq, false); | ||
8460 | } | 8709 | } |
8461 | 8710 | ||
8462 | static void attach_task_cfs_rq(struct task_struct *p) | 8711 | static void attach_task_cfs_rq(struct task_struct *p) |
@@ -8464,7 +8713,6 @@ static void attach_task_cfs_rq(struct task_struct *p) | |||
8464 | struct sched_entity *se = &p->se; | 8713 | struct sched_entity *se = &p->se; |
8465 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 8714 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
8466 | u64 now = cfs_rq_clock_task(cfs_rq); | 8715 | u64 now = cfs_rq_clock_task(cfs_rq); |
8467 | int tg_update; | ||
8468 | 8716 | ||
8469 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8717 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8470 | /* | 8718 | /* |
@@ -8475,10 +8723,9 @@ static void attach_task_cfs_rq(struct task_struct *p) | |||
8475 | #endif | 8723 | #endif |
8476 | 8724 | ||
8477 | /* Synchronize task with its cfs_rq */ | 8725 | /* Synchronize task with its cfs_rq */ |
8478 | tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); | 8726 | update_cfs_rq_load_avg(now, cfs_rq, false); |
8479 | attach_entity_load_avg(cfs_rq, se); | 8727 | attach_entity_load_avg(cfs_rq, se); |
8480 | if (tg_update) | 8728 | update_tg_load_avg(cfs_rq, false); |
8481 | update_tg_load_avg(cfs_rq, false); | ||
8482 | 8729 | ||
8483 | if (!vruntime_normalized(p)) | 8730 | if (!vruntime_normalized(p)) |
8484 | se->vruntime += cfs_rq->min_vruntime; | 8731 | se->vruntime += cfs_rq->min_vruntime; |
@@ -8592,7 +8839,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8592 | { | 8839 | { |
8593 | struct sched_entity *se; | 8840 | struct sched_entity *se; |
8594 | struct cfs_rq *cfs_rq; | 8841 | struct cfs_rq *cfs_rq; |
8595 | struct rq *rq; | ||
8596 | int i; | 8842 | int i; |
8597 | 8843 | ||
8598 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); | 8844 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); |
@@ -8607,8 +8853,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8607 | init_cfs_bandwidth(tg_cfs_bandwidth(tg)); | 8853 | init_cfs_bandwidth(tg_cfs_bandwidth(tg)); |
8608 | 8854 | ||
8609 | for_each_possible_cpu(i) { | 8855 | for_each_possible_cpu(i) { |
8610 | rq = cpu_rq(i); | ||
8611 | |||
8612 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | 8856 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), |
8613 | GFP_KERNEL, cpu_to_node(i)); | 8857 | GFP_KERNEL, cpu_to_node(i)); |
8614 | if (!cfs_rq) | 8858 | if (!cfs_rq) |
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 9fb873cfc75c..1d8718d5300d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
@@ -16,6 +16,9 @@ | |||
16 | 16 | ||
17 | #include "sched.h" | 17 | #include "sched.h" |
18 | 18 | ||
19 | /* Linker adds these: start and end of __cpuidle functions */ | ||
20 | extern char __cpuidle_text_start[], __cpuidle_text_end[]; | ||
21 | |||
19 | /** | 22 | /** |
20 | * sched_idle_set_state - Record idle state for the current CPU. | 23 | * sched_idle_set_state - Record idle state for the current CPU. |
21 | * @idle_state: State to record. | 24 | * @idle_state: State to record. |
@@ -53,7 +56,7 @@ static int __init cpu_idle_nopoll_setup(char *__unused) | |||
53 | __setup("hlt", cpu_idle_nopoll_setup); | 56 | __setup("hlt", cpu_idle_nopoll_setup); |
54 | #endif | 57 | #endif |
55 | 58 | ||
56 | static inline int cpu_idle_poll(void) | 59 | static noinline int __cpuidle cpu_idle_poll(void) |
57 | { | 60 | { |
58 | rcu_idle_enter(); | 61 | rcu_idle_enter(); |
59 | trace_cpu_idle_rcuidle(0, smp_processor_id()); | 62 | trace_cpu_idle_rcuidle(0, smp_processor_id()); |
@@ -84,7 +87,7 @@ void __weak arch_cpu_idle(void) | |||
84 | * | 87 | * |
85 | * To use when the cpuidle framework cannot be used. | 88 | * To use when the cpuidle framework cannot be used. |
86 | */ | 89 | */ |
87 | void default_idle_call(void) | 90 | void __cpuidle default_idle_call(void) |
88 | { | 91 | { |
89 | if (current_clr_polling_and_test()) { | 92 | if (current_clr_polling_and_test()) { |
90 | local_irq_enable(); | 93 | local_irq_enable(); |
@@ -271,6 +274,12 @@ static void cpu_idle_loop(void) | |||
271 | } | 274 | } |
272 | } | 275 | } |
273 | 276 | ||
277 | bool cpu_in_idle(unsigned long pc) | ||
278 | { | ||
279 | return pc >= (unsigned long)__cpuidle_text_start && | ||
280 | pc < (unsigned long)__cpuidle_text_end; | ||
281 | } | ||
282 | |||
274 | void cpu_startup_entry(enum cpuhp_state state) | 283 | void cpu_startup_entry(enum cpuhp_state state) |
275 | { | 284 | { |
276 | /* | 285 | /* |
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 2ce5458bbe1d..5405d3feb112 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
@@ -27,8 +27,8 @@ static struct task_struct * | |||
27 | pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) | 27 | pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) |
28 | { | 28 | { |
29 | put_prev_task(rq, prev); | 29 | put_prev_task(rq, prev); |
30 | 30 | update_idle_core(rq); | |
31 | schedstat_inc(rq, sched_goidle); | 31 | schedstat_inc(rq->sched_goidle); |
32 | return rq->idle; | 32 | return rq->idle; |
33 | } | 33 | } |
34 | 34 | ||
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index d5690b722691..2516b8df6dbb 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -957,9 +957,8 @@ static void update_curr_rt(struct rq *rq) | |||
957 | if (unlikely((s64)delta_exec <= 0)) | 957 | if (unlikely((s64)delta_exec <= 0)) |
958 | return; | 958 | return; |
959 | 959 | ||
960 | /* Kick cpufreq (see the comment in linux/cpufreq.h). */ | 960 | /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ |
961 | if (cpu_of(rq) == smp_processor_id()) | 961 | cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT); |
962 | cpufreq_trigger_update(rq_clock(rq)); | ||
963 | 962 | ||
964 | schedstat_set(curr->se.statistics.exec_max, | 963 | schedstat_set(curr->se.statistics.exec_max, |
965 | max(curr->se.statistics.exec_max, delta_exec)); | 964 | max(curr->se.statistics.exec_max, delta_exec)); |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c64fc5114004..055f935d4421 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -2,6 +2,7 @@ | |||
2 | #include <linux/sched.h> | 2 | #include <linux/sched.h> |
3 | #include <linux/sched/sysctl.h> | 3 | #include <linux/sched/sysctl.h> |
4 | #include <linux/sched/rt.h> | 4 | #include <linux/sched/rt.h> |
5 | #include <linux/u64_stats_sync.h> | ||
5 | #include <linux/sched/deadline.h> | 6 | #include <linux/sched/deadline.h> |
6 | #include <linux/binfmts.h> | 7 | #include <linux/binfmts.h> |
7 | #include <linux/mutex.h> | 8 | #include <linux/mutex.h> |
@@ -15,6 +16,12 @@ | |||
15 | #include "cpudeadline.h" | 16 | #include "cpudeadline.h" |
16 | #include "cpuacct.h" | 17 | #include "cpuacct.h" |
17 | 18 | ||
19 | #ifdef CONFIG_SCHED_DEBUG | ||
20 | #define SCHED_WARN_ON(x) WARN_ONCE(x, #x) | ||
21 | #else | ||
22 | #define SCHED_WARN_ON(x) ((void)(x)) | ||
23 | #endif | ||
24 | |||
18 | struct rq; | 25 | struct rq; |
19 | struct cpuidle_state; | 26 | struct cpuidle_state; |
20 | 27 | ||
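
SCHED_WARN_ON() gives the scheduler a warning that compiles away on !CONFIG_SCHED_DEBUG builds while still evaluating its argument, so side effects and otherwise-unused variables keep working. A rough userspace approximation of that pattern (MY_DEBUG and MY_WARN_ON are made-up names; the kernel's WARN_ONCE() additionally returns the condition, which this sketch does not):

#include <stdio.h>

/*
 * With the debug define the check reports at most once per call site;
 * without it the expression is still evaluated but produces no output.
 */
#ifdef MY_DEBUG
#define MY_WARN_ON(x)                                           \
        do {                                                    \
                static int warned;                              \
                if ((x) && !warned) {                           \
                        warned = 1;                             \
                        fprintf(stderr, "warning: %s\n", #x);   \
                }                                               \
        } while (0)
#else
#define MY_WARN_ON(x) ((void)(x))
#endif

int main(void)
{
        int bad = 1;

        for (int i = 0; i < 3; i++)
                MY_WARN_ON(bad);        /* with -DMY_DEBUG: one warning, not three */
        return 0;
}
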
@@ -565,6 +572,8 @@ struct root_domain { | |||
565 | */ | 572 | */ |
566 | cpumask_var_t rto_mask; | 573 | cpumask_var_t rto_mask; |
567 | struct cpupri cpupri; | 574 | struct cpupri cpupri; |
575 | |||
576 | unsigned long max_cpu_capacity; | ||
568 | }; | 577 | }; |
569 | 578 | ||
570 | extern struct root_domain def_root_domain; | 579 | extern struct root_domain def_root_domain; |
@@ -597,7 +606,6 @@ struct rq { | |||
597 | #ifdef CONFIG_SMP | 606 | #ifdef CONFIG_SMP |
598 | unsigned long last_load_update_tick; | 607 | unsigned long last_load_update_tick; |
599 | #endif /* CONFIG_SMP */ | 608 | #endif /* CONFIG_SMP */ |
600 | u64 nohz_stamp; | ||
601 | unsigned long nohz_flags; | 609 | unsigned long nohz_flags; |
602 | #endif /* CONFIG_NO_HZ_COMMON */ | 610 | #endif /* CONFIG_NO_HZ_COMMON */ |
603 | #ifdef CONFIG_NO_HZ_FULL | 611 | #ifdef CONFIG_NO_HZ_FULL |
@@ -723,6 +731,23 @@ static inline int cpu_of(struct rq *rq) | |||
723 | #endif | 731 | #endif |
724 | } | 732 | } |
725 | 733 | ||
734 | |||
735 | #ifdef CONFIG_SCHED_SMT | ||
736 | |||
737 | extern struct static_key_false sched_smt_present; | ||
738 | |||
739 | extern void __update_idle_core(struct rq *rq); | ||
740 | |||
741 | static inline void update_idle_core(struct rq *rq) | ||
742 | { | ||
743 | if (static_branch_unlikely(&sched_smt_present)) | ||
744 | __update_idle_core(rq); | ||
745 | } | ||
746 | |||
747 | #else | ||
748 | static inline void update_idle_core(struct rq *rq) { } | ||
749 | #endif | ||
750 | |||
726 | DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 751 | DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
727 | 752 | ||
728 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | 753 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) |
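
update_idle_core() shows the usual static-key guard: the SMT-specific helper sits behind static_branch_unlikely(&sched_smt_present), so machines without SMT siblings pay only a patched-out branch on this hot path. A rough userspace model of the control flow (a plain flag plus __builtin_expect() stands in for the self-patching static key; all names below are invented):

#include <stdbool.h>
#include <stdio.h>

/*
 * Only mimics the control flow, not the runtime code patching a real
 * static key provides.
 */
static bool smt_present;

static void slow_update_idle_core(int cpu)
{
        printf("recording idle-core hint around CPU %d\n", cpu);
}

static inline void update_idle_core_sketch(int cpu)
{
        if (__builtin_expect(smt_present, 0))
                slow_update_idle_core(cpu);
}

int main(void)
{
        update_idle_core_sketch(0);     /* SMT not present: nothing happens */
        smt_present = true;             /* flipped once SMT siblings are found */
        update_idle_core_sketch(0);
        return 0;
}
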
@@ -857,8 +882,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | |||
857 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); | 882 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); |
858 | DECLARE_PER_CPU(int, sd_llc_size); | 883 | DECLARE_PER_CPU(int, sd_llc_size); |
859 | DECLARE_PER_CPU(int, sd_llc_id); | 884 | DECLARE_PER_CPU(int, sd_llc_id); |
885 | DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); | ||
860 | DECLARE_PER_CPU(struct sched_domain *, sd_numa); | 886 | DECLARE_PER_CPU(struct sched_domain *, sd_numa); |
861 | DECLARE_PER_CPU(struct sched_domain *, sd_busy); | ||
862 | DECLARE_PER_CPU(struct sched_domain *, sd_asym); | 887 | DECLARE_PER_CPU(struct sched_domain *, sd_asym); |
863 | 888 | ||
864 | struct sched_group_capacity { | 889 | struct sched_group_capacity { |
@@ -870,10 +895,6 @@ struct sched_group_capacity { | |||
870 | unsigned int capacity; | 895 | unsigned int capacity; |
871 | unsigned long next_update; | 896 | unsigned long next_update; |
872 | int imbalance; /* XXX unrelated to capacity but shared group state */ | 897 | int imbalance; /* XXX unrelated to capacity but shared group state */ |
873 | /* | ||
874 | * Number of busy cpus in this group. | ||
875 | */ | ||
876 | atomic_t nr_busy_cpus; | ||
877 | 898 | ||
878 | unsigned long cpumask[0]; /* iteration mask */ | 899 | unsigned long cpumask[0]; /* iteration mask */ |
879 | }; | 900 | }; |
@@ -1000,7 +1021,11 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
1000 | * per-task data have been completed by this moment. | 1021 | * per-task data have been completed by this moment. |
1001 | */ | 1022 | */ |
1002 | smp_wmb(); | 1023 | smp_wmb(); |
1024 | #ifdef CONFIG_THREAD_INFO_IN_TASK | ||
1025 | p->cpu = cpu; | ||
1026 | #else | ||
1003 | task_thread_info(p)->cpu = cpu; | 1027 | task_thread_info(p)->cpu = cpu; |
1028 | #endif | ||
1004 | p->wake_cpu = cpu; | 1029 | p->wake_cpu = cpu; |
1005 | #endif | 1030 | #endif |
1006 | } | 1031 | } |
@@ -1260,6 +1285,11 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev) | |||
1260 | prev->sched_class->put_prev_task(rq, prev); | 1285 | prev->sched_class->put_prev_task(rq, prev); |
1261 | } | 1286 | } |
1262 | 1287 | ||
1288 | static inline void set_curr_task(struct rq *rq, struct task_struct *curr) | ||
1289 | { | ||
1290 | curr->sched_class->set_curr_task(rq); | ||
1291 | } | ||
1292 | |||
1263 | #define sched_class_highest (&stop_sched_class) | 1293 | #define sched_class_highest (&stop_sched_class) |
1264 | #define for_each_class(class) \ | 1294 | #define for_each_class(class) \ |
1265 | for (class = sched_class_highest; class; class = class->next) | 1295 | for (class = sched_class_highest; class; class = class->next) |
@@ -1290,7 +1320,7 @@ static inline void idle_set_state(struct rq *rq, | |||
1290 | 1320 | ||
1291 | static inline struct cpuidle_state *idle_get_state(struct rq *rq) | 1321 | static inline struct cpuidle_state *idle_get_state(struct rq *rq) |
1292 | { | 1322 | { |
1293 | WARN_ON(!rcu_read_lock_held()); | 1323 | SCHED_WARN_ON(!rcu_read_lock_held()); |
1294 | return rq->idle_state; | 1324 | return rq->idle_state; |
1295 | } | 1325 | } |
1296 | #else | 1326 | #else |
@@ -1710,52 +1740,28 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { } | |||
1710 | #endif | 1740 | #endif |
1711 | 1741 | ||
1712 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 1742 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
1743 | struct irqtime { | ||
1744 | u64 hardirq_time; | ||
1745 | u64 softirq_time; | ||
1746 | u64 irq_start_time; | ||
1747 | struct u64_stats_sync sync; | ||
1748 | }; | ||
1713 | 1749 | ||
1714 | DECLARE_PER_CPU(u64, cpu_hardirq_time); | 1750 | DECLARE_PER_CPU(struct irqtime, cpu_irqtime); |
1715 | DECLARE_PER_CPU(u64, cpu_softirq_time); | ||
1716 | |||
1717 | #ifndef CONFIG_64BIT | ||
1718 | DECLARE_PER_CPU(seqcount_t, irq_time_seq); | ||
1719 | |||
1720 | static inline void irq_time_write_begin(void) | ||
1721 | { | ||
1722 | __this_cpu_inc(irq_time_seq.sequence); | ||
1723 | smp_wmb(); | ||
1724 | } | ||
1725 | |||
1726 | static inline void irq_time_write_end(void) | ||
1727 | { | ||
1728 | smp_wmb(); | ||
1729 | __this_cpu_inc(irq_time_seq.sequence); | ||
1730 | } | ||
1731 | 1751 | ||
1732 | static inline u64 irq_time_read(int cpu) | 1752 | static inline u64 irq_time_read(int cpu) |
1733 | { | 1753 | { |
1734 | u64 irq_time; | 1754 | struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); |
1735 | unsigned seq; | 1755 | unsigned int seq; |
1756 | u64 total; | ||
1736 | 1757 | ||
1737 | do { | 1758 | do { |
1738 | seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); | 1759 | seq = __u64_stats_fetch_begin(&irqtime->sync); |
1739 | irq_time = per_cpu(cpu_softirq_time, cpu) + | 1760 | total = irqtime->softirq_time + irqtime->hardirq_time; |
1740 | per_cpu(cpu_hardirq_time, cpu); | 1761 | } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); |
1741 | } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); | ||
1742 | |||
1743 | return irq_time; | ||
1744 | } | ||
1745 | #else /* CONFIG_64BIT */ | ||
1746 | static inline void irq_time_write_begin(void) | ||
1747 | { | ||
1748 | } | ||
1749 | |||
1750 | static inline void irq_time_write_end(void) | ||
1751 | { | ||
1752 | } | ||
1753 | 1762 | ||
1754 | static inline u64 irq_time_read(int cpu) | 1763 | return total; |
1755 | { | ||
1756 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | ||
1757 | } | 1764 | } |
1758 | #endif /* CONFIG_64BIT */ | ||
1759 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | 1765 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
1760 | 1766 | ||
1761 | #ifdef CONFIG_CPU_FREQ | 1767 | #ifdef CONFIG_CPU_FREQ |
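
The irqtime rework replaces the open-coded seqcount with u64_stats_sync: writers bump a sequence counter around their updates, and irq_time_read() rereads until it observed a stable snapshot, which on 64-bit collapses to plain loads. A single-file sketch of that retry shape (structure and function names are invented; the kernel's primitives also provide the memory barriers this single-threaded demo leaves out):

#include <stdint.h>
#include <stdio.h>

struct irqtime_sketch {
        uint64_t hardirq_time;
        uint64_t softirq_time;
        unsigned int seq;       /* even: stable, odd: update in progress */
};

static void account_hardirq(struct irqtime_sketch *it, uint64_t delta)
{
        it->seq++;                      /* u64_stats_update_begin() */
        it->hardirq_time += delta;
        it->seq++;                      /* u64_stats_update_end() */
}

static uint64_t irq_time_read_sketch(const struct irqtime_sketch *it)
{
        unsigned int seq;
        uint64_t total;

        do {
                seq = it->seq;          /* __u64_stats_fetch_begin() */
                total = it->softirq_time + it->hardirq_time;
        } while ((seq & 1) || seq != it->seq);  /* __u64_stats_fetch_retry() */

        return total;
}

int main(void)
{
        struct irqtime_sketch it = { 0, 0, 0 };

        account_hardirq(&it, 1000);
        printf("irq time: %llu\n", (unsigned long long)irq_time_read_sketch(&it));
        return 0;
}
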
@@ -1763,27 +1769,13 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); | |||
1763 | 1769 | ||
1764 | /** | 1770 | /** |
1765 | * cpufreq_update_util - Take a note about CPU utilization changes. | 1771 | * cpufreq_update_util - Take a note about CPU utilization changes. |
1766 | * @time: Current time. | 1772 | * @rq: Runqueue to carry out the update for. |
1767 | * @util: Current utilization. | 1773 | * @flags: Update reason flags. |
1768 | * @max: Utilization ceiling. | ||
1769 | * | 1774 | * |
1770 | * This function is called by the scheduler on every invocation of | 1775 | * This function is called by the scheduler on the CPU whose utilization is |
1771 | * update_load_avg() on the CPU whose utilization is being updated. | 1776 | * being updated. |
1772 | * | 1777 | * |
1773 | * It can only be called from RCU-sched read-side critical sections. | 1778 | * It can only be called from RCU-sched read-side critical sections. |
1774 | */ | ||
1775 | static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) | ||
1776 | { | ||
1777 | struct update_util_data *data; | ||
1778 | |||
1779 | data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); | ||
1780 | if (data) | ||
1781 | data->func(data, time, util, max); | ||
1782 | } | ||
1783 | |||
1784 | /** | ||
1785 | * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed. | ||
1786 | * @time: Current time. | ||
1787 | * | 1779 | * |
1788 | * The way cpufreq is currently arranged requires it to evaluate the CPU | 1780 | * The way cpufreq is currently arranged requires it to evaluate the CPU |
1789 | * performance state (frequency/voltage) on a regular basis to prevent it from | 1781 | * performance state (frequency/voltage) on a regular basis to prevent it from |
@@ -1797,13 +1789,23 @@ static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned lo | |||
1797 | * but that really is a band-aid. Going forward it should be replaced with | 1789 | * but that really is a band-aid. Going forward it should be replaced with |
1798 | * solutions targeted more specifically at RT and DL tasks. | 1790 | * solutions targeted more specifically at RT and DL tasks. |
1799 | */ | 1791 | */ |
1800 | static inline void cpufreq_trigger_update(u64 time) | 1792 | static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) |
1793 | { | ||
1794 | struct update_util_data *data; | ||
1795 | |||
1796 | data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); | ||
1797 | if (data) | ||
1798 | data->func(data, rq_clock(rq), flags); | ||
1799 | } | ||
1800 | |||
1801 | static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) | ||
1801 | { | 1802 | { |
1802 | cpufreq_update_util(time, ULONG_MAX, 0); | 1803 | if (cpu_of(rq) == smp_processor_id()) |
1804 | cpufreq_update_util(rq, flags); | ||
1803 | } | 1805 | } |
1804 | #else | 1806 | #else |
1805 | static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {} | 1807 | static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} |
1806 | static inline void cpufreq_trigger_update(u64 time) {} | 1808 | static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} |
1807 | #endif /* CONFIG_CPU_FREQ */ | 1809 | #endif /* CONFIG_CPU_FREQ */ |
1808 | 1810 | ||
1809 | #ifdef arch_scale_freq_capacity | 1811 | #ifdef arch_scale_freq_capacity |
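
With this change the scheduler no longer hands cpufreq a (time, util, max) triple; it just signals "utilization on this rq changed" with a reason flag, and cpufreq_update_this_cpu() filters out updates for remote CPUs, as the rt.c hunk above shows. A toy model of that hook shape (flag value and all names are invented; the kernel keeps the hook in an RCU-protected per-CPU pointer rather than the plain global used here):

#include <stdio.h>

#define MY_CPUFREQ_RT   (1U << 0)       /* illustrative reason flag */

struct update_util_hook {
        void (*func)(struct update_util_hook *hook,
                     unsigned long long time, unsigned int flags);
};

static struct update_util_hook *cpu_hook;       /* per-CPU in the kernel */

static void cpufreq_update_util_sketch(unsigned long long now, unsigned int flags)
{
        struct update_util_hook *hook = cpu_hook;  /* rcu_dereference_sched() in the kernel */

        if (hook)
                hook->func(hook, now, flags);
}

static void governor_cb(struct update_util_hook *hook,
                        unsigned long long time, unsigned int flags)
{
        printf("governor poked at %llu, flags=%#x\n", time, flags);
}

int main(void)
{
        static struct update_util_hook gov = { .func = governor_cb };

        cpufreq_update_util_sketch(100, MY_CPUFREQ_RT); /* no governor registered: ignored */
        cpu_hook = &gov;
        cpufreq_update_util_sketch(200, MY_CPUFREQ_RT); /* routed to the callback */
        return 0;
}
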
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 78955cbea31c..34659a853505 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h | |||
@@ -29,11 +29,12 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | |||
29 | if (rq) | 29 | if (rq) |
30 | rq->rq_sched_info.run_delay += delta; | 30 | rq->rq_sched_info.run_delay += delta; |
31 | } | 31 | } |
32 | # define schedstat_enabled() static_branch_unlikely(&sched_schedstats) | 32 | #define schedstat_enabled() static_branch_unlikely(&sched_schedstats) |
33 | # define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) | 33 | #define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) |
34 | # define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) | 34 | #define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) |
35 | # define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) | 35 | #define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) |
36 | # define schedstat_val(rq, field) ((schedstat_enabled()) ? (rq)->field : 0) | 36 | #define schedstat_val(var) (var) |
37 | #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) | ||
37 | 38 | ||
38 | #else /* !CONFIG_SCHEDSTATS */ | 39 | #else /* !CONFIG_SCHEDSTATS */ |
39 | static inline void | 40 | static inline void |
@@ -45,12 +46,13 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | |||
45 | static inline void | 46 | static inline void |
46 | rq_sched_info_depart(struct rq *rq, unsigned long long delta) | 47 | rq_sched_info_depart(struct rq *rq, unsigned long long delta) |
47 | {} | 48 | {} |
48 | # define schedstat_enabled() 0 | 49 | #define schedstat_enabled() 0 |
49 | # define schedstat_inc(rq, field) do { } while (0) | 50 | #define schedstat_inc(var) do { } while (0) |
50 | # define schedstat_add(rq, field, amt) do { } while (0) | 51 | #define schedstat_add(var, amt) do { } while (0) |
51 | # define schedstat_set(var, val) do { } while (0) | 52 | #define schedstat_set(var, val) do { } while (0) |
52 | # define schedstat_val(rq, field) 0 | 53 | #define schedstat_val(var) 0 |
53 | #endif | 54 | #define schedstat_val_or_zero(var) 0 |
55 | #endif /* CONFIG_SCHEDSTATS */ | ||
54 | 56 | ||
55 | #ifdef CONFIG_SCHED_INFO | 57 | #ifdef CONFIG_SCHED_INFO |
56 | static inline void sched_info_reset_dequeued(struct task_struct *t) | 58 | static inline void sched_info_reset_dequeued(struct task_struct *t) |
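
The schedstat_*() macros now take the full lvalue instead of an (rq, field) pair, which is why call sites such as the idle_task.c hunk change from schedstat_inc(rq, sched_goidle) to schedstat_inc(rq->sched_goidle). A userspace sketch of the new shape, with the definitions mirroring the hunk above and a plain flag standing in for the sched_schedstats static key:

#include <stdio.h>

static int schedstats_on = 1;

#define schedstat_enabled()     (schedstats_on)
#define schedstat_inc(var)      do { if (schedstat_enabled()) { (var)++; } } while (0)
#define schedstat_add(var, amt) do { if (schedstat_enabled()) { (var) += (amt); } } while (0)
#define schedstat_set(var, val) do { if (schedstat_enabled()) { (var) = (val); } } while (0)

struct rq_sketch { unsigned long sched_goidle; };

int main(void)
{
        struct rq_sketch rq = { 0 };
        struct rq_sketch *rqp = &rq;

        schedstat_inc(rqp->sched_goidle);       /* old form: schedstat_inc(rqp, sched_goidle) */
        schedstat_add(rqp->sched_goidle, 2);
        printf("sched_goidle = %lu\n", rqp->sched_goidle);
        return 0;
}
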
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index f15d6b6a538a..9453efe9b25a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c | |||
@@ -196,27 +196,48 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) | |||
196 | } | 196 | } |
197 | EXPORT_SYMBOL(prepare_to_wait_exclusive); | 197 | EXPORT_SYMBOL(prepare_to_wait_exclusive); |
198 | 198 | ||
199 | long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) | 199 | void init_wait_entry(wait_queue_t *wait, int flags) |
200 | { | 200 | { |
201 | unsigned long flags; | 201 | wait->flags = flags; |
202 | |||
203 | if (signal_pending_state(state, current)) | ||
204 | return -ERESTARTSYS; | ||
205 | |||
206 | wait->private = current; | 202 | wait->private = current; |
207 | wait->func = autoremove_wake_function; | 203 | wait->func = autoremove_wake_function; |
204 | INIT_LIST_HEAD(&wait->task_list); | ||
205 | } | ||
206 | EXPORT_SYMBOL(init_wait_entry); | ||
207 | |||
208 | long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) | ||
209 | { | ||
210 | unsigned long flags; | ||
211 | long ret = 0; | ||
208 | 212 | ||
209 | spin_lock_irqsave(&q->lock, flags); | 213 | spin_lock_irqsave(&q->lock, flags); |
210 | if (list_empty(&wait->task_list)) { | 214 | if (unlikely(signal_pending_state(state, current))) { |
211 | if (wait->flags & WQ_FLAG_EXCLUSIVE) | 215 | /* |
212 | __add_wait_queue_tail(q, wait); | 216 | * Exclusive waiter must not fail if it was selected by wakeup, |
213 | else | 217 | * it should "consume" the condition we were waiting for. |
214 | __add_wait_queue(q, wait); | 218 | * |
219 | * The caller will recheck the condition and return success if | ||
220 | 		 * we were already woken up; we cannot miss the event because | ||
221 | * wakeup locks/unlocks the same q->lock. | ||
222 | * | ||
223 | * But we need to ensure that set-condition + wakeup after that | ||
224 | 		 * can't see us; it should wake up another exclusive waiter if | ||
225 | * we fail. | ||
226 | */ | ||
227 | list_del_init(&wait->task_list); | ||
228 | ret = -ERESTARTSYS; | ||
229 | } else { | ||
230 | if (list_empty(&wait->task_list)) { | ||
231 | if (wait->flags & WQ_FLAG_EXCLUSIVE) | ||
232 | __add_wait_queue_tail(q, wait); | ||
233 | else | ||
234 | __add_wait_queue(q, wait); | ||
235 | } | ||
236 | set_current_state(state); | ||
215 | } | 237 | } |
216 | set_current_state(state); | ||
217 | spin_unlock_irqrestore(&q->lock, flags); | 238 | spin_unlock_irqrestore(&q->lock, flags); |
218 | 239 | ||
219 | return 0; | 240 | return ret; |
220 | } | 241 | } |
221 | EXPORT_SYMBOL(prepare_to_wait_event); | 242 | EXPORT_SYMBOL(prepare_to_wait_event); |
222 | 243 | ||
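
The comment in the new prepare_to_wait_event() only pays off because callers recheck their condition after every call and before honouring the signal. A simplified sketch of that caller pattern, in kernel context rather than as a standalone program, loosely following what a wait_event_interruptible()-style loop expands to (wait_for_flag() and flag are invented names; the real ___wait_event() macro handles more cases):

static int wait_for_flag(wait_queue_head_t *wq, bool *flag)
{
        DEFINE_WAIT(wait);
        int ret = 0;

        for (;;) {
                long err = prepare_to_wait_event(wq, &wait, TASK_INTERRUPTIBLE);

                if (*flag)              /* condition rechecked first: success even if signalled */
                        break;
                if (err) {              /* -ERESTARTSYS: already delisted under wq->lock */
                        ret = err;
                        break;
                }
                schedule();
        }
        finish_wait(wq, &wait);
        return ret;
}
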
@@ -255,39 +276,6 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) | |||
255 | } | 276 | } |
256 | EXPORT_SYMBOL(finish_wait); | 277 | EXPORT_SYMBOL(finish_wait); |
257 | 278 | ||
258 | /** | ||
259 | * abort_exclusive_wait - abort exclusive waiting in a queue | ||
260 | * @q: waitqueue waited on | ||
261 | * @wait: wait descriptor | ||
262 | * @mode: runstate of the waiter to be woken | ||
263 | * @key: key to identify a wait bit queue or %NULL | ||
264 | * | ||
265 | * Sets current thread back to running state and removes | ||
266 | * the wait descriptor from the given waitqueue if still | ||
267 | * queued. | ||
268 | * | ||
269 | * Wakes up the next waiter if the caller is concurrently | ||
270 | * woken up through the queue. | ||
271 | * | ||
272 | * This prevents waiter starvation where an exclusive waiter | ||
273 | * aborts and is woken up concurrently and no one wakes up | ||
274 | * the next waiter. | ||
275 | */ | ||
276 | void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, | ||
277 | unsigned int mode, void *key) | ||
278 | { | ||
279 | unsigned long flags; | ||
280 | |||
281 | __set_current_state(TASK_RUNNING); | ||
282 | spin_lock_irqsave(&q->lock, flags); | ||
283 | if (!list_empty(&wait->task_list)) | ||
284 | list_del_init(&wait->task_list); | ||
285 | else if (waitqueue_active(q)) | ||
286 | __wake_up_locked_key(q, mode, key); | ||
287 | spin_unlock_irqrestore(&q->lock, flags); | ||
288 | } | ||
289 | EXPORT_SYMBOL(abort_exclusive_wait); | ||
290 | |||
291 | int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) | 279 | int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) |
292 | { | 280 | { |
293 | int ret = default_wake_function(wait, mode, sync, key); | 281 | int ret = default_wake_function(wait, mode, sync, key); |
@@ -425,20 +413,29 @@ int __sched | |||
425 | __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, | 413 | __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, |
426 | wait_bit_action_f *action, unsigned mode) | 414 | wait_bit_action_f *action, unsigned mode) |
427 | { | 415 | { |
428 | do { | 416 | int ret = 0; |
429 | int ret; | ||
430 | 417 | ||
418 | for (;;) { | ||
431 | prepare_to_wait_exclusive(wq, &q->wait, mode); | 419 | prepare_to_wait_exclusive(wq, &q->wait, mode); |
432 | if (!test_bit(q->key.bit_nr, q->key.flags)) | 420 | if (test_bit(q->key.bit_nr, q->key.flags)) { |
433 | continue; | 421 | ret = action(&q->key, mode); |
434 | ret = action(&q->key, mode); | 422 | /* |
435 | if (!ret) | 423 | * See the comment in prepare_to_wait_event(). |
436 | 			continue; | 424 | 		 * finish_wait() does not necessarily take wq->lock, |
437 | abort_exclusive_wait(wq, &q->wait, mode, &q->key); | 425 | * but test_and_set_bit() implies mb() which pairs with |
438 | return ret; | 426 | * smp_mb__after_atomic() before wake_up_page(). |
439 | } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); | 427 | */ |
440 | finish_wait(wq, &q->wait); | 428 | if (ret) |
441 | return 0; | 429 | finish_wait(wq, &q->wait); |
430 | } | ||
431 | if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) { | ||
432 | if (!ret) | ||
433 | finish_wait(wq, &q->wait); | ||
434 | return 0; | ||
435 | } else if (ret) { | ||
436 | return ret; | ||
437 | } | ||
438 | } | ||
442 | } | 439 | } |
443 | EXPORT_SYMBOL(__wait_on_bit_lock); | 440 | EXPORT_SYMBOL(__wait_on_bit_lock); |
444 | 441 | ||
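
The reworked __wait_on_bit_lock() applies the same rule: a waiter that still manages to grab the bit returns success even when action() failed, because it has effectively consumed the lock; the error is only propagated while the bit stays contended. A single-threaded sketch of just that decision logic (wait-queue handling and barriers are stubbed out; names are invented):

#include <stdbool.h>
#include <stdio.h>

static bool bit;                        /* the lock bit being waited on */

static int wait_on_bit_lock_sketch(int (*action)(void))
{
        int ret = 0;

        for (;;) {
                if (bit)                /* contended: sleep via action() */
                        ret = action();
                if (!bit) {             /* test_and_set_bit() would succeed */
                        bit = true;
                        return 0;       /* got the bit, even if action() failed */
                } else if (ret) {
                        return ret;     /* still contended and interrupted: give up */
                }
        }
}

static int action_interrupted(void)
{
        return -4;                      /* stand-in for -EINTR */
}

int main(void)
{
        int ret;

        ret = wait_on_bit_lock_sketch(action_interrupted);     /* bit was clear */
        printf("uncontended: ret=%d, bit now %d\n", ret, bit);

        ret = wait_on_bit_lock_sketch(action_interrupted);     /* bit stays set */
        printf("contended + signal: ret=%d\n", ret);
        return 0;
}
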
@@ -483,16 +480,6 @@ void wake_up_bit(void *word, int bit) | |||
483 | } | 480 | } |
484 | EXPORT_SYMBOL(wake_up_bit); | 481 | EXPORT_SYMBOL(wake_up_bit); |
485 | 482 | ||
486 | wait_queue_head_t *bit_waitqueue(void *word, int bit) | ||
487 | { | ||
488 | const int shift = BITS_PER_LONG == 32 ? 5 : 6; | ||
489 | const struct zone *zone = page_zone(virt_to_page(word)); | ||
490 | unsigned long val = (unsigned long)word << shift | bit; | ||
491 | |||
492 | return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; | ||
493 | } | ||
494 | EXPORT_SYMBOL(bit_waitqueue); | ||
495 | |||
496 | /* | 483 | /* |
497 | * Manipulate the atomic_t address to produce a better bit waitqueue table hash | 484 | * Manipulate the atomic_t address to produce a better bit waitqueue table hash |
498 | * index (we're keying off bit -1, but that would produce a horrible hash | 485 | * index (we're keying off bit -1, but that would produce a horrible hash |