Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r-- | kernel/sched/core.c | 2333 |
1 file changed, 377 insertions, 1956 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c56fb57f2991..34e2291a9a6c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1,88 +1,28 @@ | |||
1 | /* | 1 | /* |
2 | * kernel/sched/core.c | 2 | * kernel/sched/core.c |
3 | * | 3 | * |
4 | * Kernel scheduler and related syscalls | 4 | * Core kernel scheduler code and related syscalls |
5 | * | 5 | * |
6 | * Copyright (C) 1991-2002 Linus Torvalds | 6 | * Copyright (C) 1991-2002 Linus Torvalds |
7 | * | ||
8 | * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and | ||
9 | * make semaphores SMP safe | ||
10 | * 1998-11-19 Implemented schedule_timeout() and related stuff | ||
11 | * by Andrea Arcangeli | ||
12 | * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: | ||
13 | * hybrid priority-list and round-robin design with | ||
14 | * an array-switch method of distributing timeslices | ||
15 | * and per-CPU runqueues. Cleanups and useful suggestions | ||
16 | * by Davide Libenzi, preemptible kernel bits by Robert Love. | ||
17 | * 2003-09-03 Interactivity tuning by Con Kolivas. | ||
18 | * 2004-04-02 Scheduler domains code by Nick Piggin | ||
19 | * 2007-04-15 Work begun on replacing all interactivity tuning with a | ||
20 | * fair scheduling design by Con Kolivas. | ||
21 | * 2007-05-05 Load balancing (smp-nice) and other improvements | ||
22 | * by Peter Williams | ||
23 | * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith | ||
24 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri | ||
25 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, | ||
26 | * Thomas Gleixner, Mike Kravetz | ||
27 | */ | 7 | */ |
28 | 8 | #include <linux/sched.h> | |
29 | #include <linux/kasan.h> | ||
30 | #include <linux/mm.h> | ||
31 | #include <linux/module.h> | ||
32 | #include <linux/nmi.h> | ||
33 | #include <linux/init.h> | ||
34 | #include <linux/uaccess.h> | ||
35 | #include <linux/highmem.h> | ||
36 | #include <linux/mmu_context.h> | ||
37 | #include <linux/interrupt.h> | ||
38 | #include <linux/capability.h> | ||
39 | #include <linux/completion.h> | ||
40 | #include <linux/kernel_stat.h> | ||
41 | #include <linux/debug_locks.h> | ||
42 | #include <linux/perf_event.h> | ||
43 | #include <linux/security.h> | ||
44 | #include <linux/notifier.h> | ||
45 | #include <linux/profile.h> | ||
46 | #include <linux/freezer.h> | ||
47 | #include <linux/vmalloc.h> | ||
48 | #include <linux/blkdev.h> | ||
49 | #include <linux/delay.h> | ||
50 | #include <linux/pid_namespace.h> | ||
51 | #include <linux/smp.h> | ||
52 | #include <linux/threads.h> | ||
53 | #include <linux/timer.h> | ||
54 | #include <linux/rcupdate.h> | ||
55 | #include <linux/cpu.h> | ||
56 | #include <linux/cpuset.h> | 9 | #include <linux/cpuset.h> |
57 | #include <linux/percpu.h> | ||
58 | #include <linux/proc_fs.h> | ||
59 | #include <linux/seq_file.h> | ||
60 | #include <linux/sysctl.h> | ||
61 | #include <linux/syscalls.h> | ||
62 | #include <linux/times.h> | ||
63 | #include <linux/tsacct_kern.h> | ||
64 | #include <linux/kprobes.h> | ||
65 | #include <linux/delayacct.h> | 10 | #include <linux/delayacct.h> |
66 | #include <linux/unistd.h> | ||
67 | #include <linux/pagemap.h> | ||
68 | #include <linux/hrtimer.h> | ||
69 | #include <linux/tick.h> | ||
70 | #include <linux/ctype.h> | ||
71 | #include <linux/ftrace.h> | ||
72 | #include <linux/slab.h> | ||
73 | #include <linux/init_task.h> | 11 | #include <linux/init_task.h> |
74 | #include <linux/context_tracking.h> | 12 | #include <linux/context_tracking.h> |
75 | #include <linux/compiler.h> | 13 | |
76 | #include <linux/frame.h> | 14 | #include <linux/blkdev.h> |
15 | #include <linux/kprobes.h> | ||
16 | #include <linux/mmu_context.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/nmi.h> | ||
77 | #include <linux/prefetch.h> | 19 | #include <linux/prefetch.h> |
78 | #include <linux/mutex.h> | 20 | #include <linux/profile.h> |
21 | #include <linux/security.h> | ||
22 | #include <linux/syscalls.h> | ||
79 | 23 | ||
80 | #include <asm/switch_to.h> | 24 | #include <asm/switch_to.h> |
81 | #include <asm/tlb.h> | 25 | #include <asm/tlb.h> |
82 | #include <asm/irq_regs.h> | ||
83 | #ifdef CONFIG_PARAVIRT | ||
84 | #include <asm/paravirt.h> | ||
85 | #endif | ||
86 | 26 | ||
87 | #include "sched.h" | 27 | #include "sched.h" |
88 | #include "../workqueue_internal.h" | 28 | #include "../workqueue_internal.h" |
@@ -91,27 +31,8 @@ | |||
91 | #define CREATE_TRACE_POINTS | 31 | #define CREATE_TRACE_POINTS |
92 | #include <trace/events/sched.h> | 32 | #include <trace/events/sched.h> |
93 | 33 | ||
94 | DEFINE_MUTEX(sched_domains_mutex); | ||
95 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 34 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
96 | 35 | ||
97 | static void update_rq_clock_task(struct rq *rq, s64 delta); | ||
98 | |||
99 | void update_rq_clock(struct rq *rq) | ||
100 | { | ||
101 | s64 delta; | ||
102 | |||
103 | lockdep_assert_held(&rq->lock); | ||
104 | |||
105 | if (rq->clock_skip_update & RQCF_ACT_SKIP) | ||
106 | return; | ||
107 | |||
108 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; | ||
109 | if (delta < 0) | ||
110 | return; | ||
111 | rq->clock += delta; | ||
112 | update_rq_clock_task(rq, delta); | ||
113 | } | ||
114 | |||
115 | /* | 36 | /* |
116 | * Debugging: various feature bits | 37 | * Debugging: various feature bits |
117 | */ | 38 | */ |
@@ -140,7 +61,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; | |||
140 | const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; | 61 | const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; |
141 | 62 | ||
142 | /* | 63 | /* |
143 | * period over which we measure -rt task cpu usage in us. | 64 | * period over which we measure -rt task CPU usage in us. |
144 | * default: 1s | 65 | * default: 1s |
145 | */ | 66 | */ |
146 | unsigned int sysctl_sched_rt_period = 1000000; | 67 | unsigned int sysctl_sched_rt_period = 1000000; |
@@ -153,7 +74,7 @@ __read_mostly int scheduler_running; | |||
153 | */ | 74 | */ |
154 | int sysctl_sched_rt_runtime = 950000; | 75 | int sysctl_sched_rt_runtime = 950000; |
155 | 76 | ||
156 | /* cpus with isolated domains */ | 77 | /* CPUs with isolated domains */ |
157 | cpumask_var_t cpu_isolated_map; | 78 | cpumask_var_t cpu_isolated_map; |
158 | 79 | ||
159 | /* | 80 | /* |
@@ -185,7 +106,7 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) | |||
185 | rq = task_rq(p); | 106 | rq = task_rq(p); |
186 | raw_spin_lock(&rq->lock); | 107 | raw_spin_lock(&rq->lock); |
187 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { | 108 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { |
188 | rf->cookie = lockdep_pin_lock(&rq->lock); | 109 | rq_pin_lock(rq, rf); |
189 | return rq; | 110 | return rq; |
190 | } | 111 | } |
191 | raw_spin_unlock(&rq->lock); | 112 | raw_spin_unlock(&rq->lock); |
@@ -221,11 +142,11 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) | |||
221 | * If we observe the old cpu in task_rq_lock, the acquire of | 142 | * If we observe the old cpu in task_rq_lock, the acquire of |
222 | * the old rq->lock will fully serialize against the stores. | 143 | * the old rq->lock will fully serialize against the stores. |
223 | * | 144 | * |
224 | * If we observe the new cpu in task_rq_lock, the acquire will | 145 | * If we observe the new CPU in task_rq_lock, the acquire will |
225 | * pair with the WMB to ensure we must then also see migrating. | 146 | * pair with the WMB to ensure we must then also see migrating. |
226 | */ | 147 | */ |
227 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { | 148 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { |
228 | rf->cookie = lockdep_pin_lock(&rq->lock); | 149 | rq_pin_lock(rq, rf); |
229 | return rq; | 150 | return rq; |
230 | } | 151 | } |
231 | raw_spin_unlock(&rq->lock); | 152 | raw_spin_unlock(&rq->lock); |
@@ -236,6 +157,84 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) | |||
236 | } | 157 | } |
237 | } | 158 | } |
238 | 159 | ||
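Both rq-lock helpers above now call rq_pin_lock() where they used to open-code lockdep_pin_lock(), and later hunks swap lockdep_unpin_lock()/lockdep_repin_lock() for rq_unpin_lock()/rq_repin_lock() in the same way. The wrappers themselves live in kernel/sched/sched.h and are not part of this file's diff; a minimal sketch, assuming the pin cookie simply moves into struct rq_flags:

struct rq_flags {
        unsigned long flags;            /* saved IRQ flags from task_rq_lock() */
        struct pin_cookie cookie;       /* lockdep pin cookie for rq->lock */
};

static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
{
        rf->cookie = lockdep_pin_lock(&rq->lock);
}

static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
{
        lockdep_unpin_lock(&rq->lock, rf->cookie);
}

static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
{
        lockdep_repin_lock(&rq->lock, rf->cookie);
}

The practical gain is that one struct rq_flags travels with the lock through ttwu_do_wakeup(), pick_next_task(), context_switch() and friends instead of a loose struct pin_cookie local, which also gives the SCHED_DEBUG clock checks (see below) a per-critical-section place to keep state.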
160 | /* | ||
161 | * RQ-clock updating methods: | ||
162 | */ | ||
163 | |||
164 | static void update_rq_clock_task(struct rq *rq, s64 delta) | ||
165 | { | ||
166 | /* | ||
167 | * In theory, the compile should just see 0 here, and optimize out the call | ||
168 | * to sched_rt_avg_update. But I don't trust it... | ||
169 | */ | ||
170 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) | ||
171 | s64 steal = 0, irq_delta = 0; | ||
172 | #endif | ||
173 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
174 | irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; | ||
175 | |||
176 | /* | ||
177 | * Since irq_time is only updated on {soft,}irq_exit, we might run into | ||
178 | * this case when a previous update_rq_clock() happened inside a | ||
179 | * {soft,}irq region. | ||
180 | * | ||
181 | * When this happens, we stop ->clock_task and only update the | ||
182 | * prev_irq_time stamp to account for the part that fit, so that a next | ||
183 | * update will consume the rest. This ensures ->clock_task is | ||
184 | * monotonic. | ||
185 | * | ||
186 | * It does however cause some slight miss-attribution of {soft,}irq | ||
187 | * time, a more accurate solution would be to update the irq_time using | ||
188 | * the current rq->clock timestamp, except that would require using | ||
189 | * atomic ops. | ||
190 | */ | ||
191 | if (irq_delta > delta) | ||
192 | irq_delta = delta; | ||
193 | |||
194 | rq->prev_irq_time += irq_delta; | ||
195 | delta -= irq_delta; | ||
196 | #endif | ||
197 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | ||
198 | if (static_key_false((¶virt_steal_rq_enabled))) { | ||
199 | steal = paravirt_steal_clock(cpu_of(rq)); | ||
200 | steal -= rq->prev_steal_time_rq; | ||
201 | |||
202 | if (unlikely(steal > delta)) | ||
203 | steal = delta; | ||
204 | |||
205 | rq->prev_steal_time_rq += steal; | ||
206 | delta -= steal; | ||
207 | } | ||
208 | #endif | ||
209 | |||
210 | rq->clock_task += delta; | ||
211 | |||
212 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) | ||
213 | if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) | ||
214 | sched_rt_avg_update(rq, irq_delta + steal); | ||
215 | #endif | ||
216 | } | ||
217 | |||
218 | void update_rq_clock(struct rq *rq) | ||
219 | { | ||
220 | s64 delta; | ||
221 | |||
222 | lockdep_assert_held(&rq->lock); | ||
223 | |||
224 | if (rq->clock_update_flags & RQCF_ACT_SKIP) | ||
225 | return; | ||
226 | |||
227 | #ifdef CONFIG_SCHED_DEBUG | ||
228 | rq->clock_update_flags |= RQCF_UPDATED; | ||
229 | #endif | ||
230 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; | ||
231 | if (delta < 0) | ||
232 | return; | ||
233 | rq->clock += delta; | ||
234 | update_rq_clock_task(rq, delta); | ||
235 | } | ||
236 | |||
237 | |||
239 | #ifdef CONFIG_SCHED_HRTICK | 238 | #ifdef CONFIG_SCHED_HRTICK |
240 | /* | 239 | /* |
241 | * Use HR-timers to deliver accurate preemption points. | 240 | * Use HR-timers to deliver accurate preemption points. |
@@ -458,7 +457,7 @@ void wake_up_q(struct wake_q_head *head) | |||
458 | 457 | ||
459 | task = container_of(node, struct task_struct, wake_q); | 458 | task = container_of(node, struct task_struct, wake_q); |
460 | BUG_ON(!task); | 459 | BUG_ON(!task); |
461 | /* task can safely be re-inserted now */ | 460 | /* Task can safely be re-inserted now: */ |
462 | node = node->next; | 461 | node = node->next; |
463 | task->wake_q.next = NULL; | 462 | task->wake_q.next = NULL; |
464 | 463 | ||
@@ -516,12 +515,12 @@ void resched_cpu(int cpu) | |||
516 | #ifdef CONFIG_SMP | 515 | #ifdef CONFIG_SMP |
517 | #ifdef CONFIG_NO_HZ_COMMON | 516 | #ifdef CONFIG_NO_HZ_COMMON |
518 | /* | 517 | /* |
519 | * In the semi idle case, use the nearest busy cpu for migrating timers | 518 | * In the semi idle case, use the nearest busy CPU for migrating timers |
520 | * from an idle cpu. This is good for power-savings. | 519 | * from an idle CPU. This is good for power-savings. |
521 | * | 520 | * |
522 | * We don't do similar optimization for completely idle system, as | 521 | * We don't do similar optimization for completely idle system, as |
523 | * selecting an idle cpu will add more delays to the timers than intended | 522 | * selecting an idle CPU will add more delays to the timers than intended |
524 | * (as that cpu's timer base may not be uptodate wrt jiffies etc). | 523 | * (as that CPU's timer base may not be uptodate wrt jiffies etc). |
525 | */ | 524 | */ |
526 | int get_nohz_timer_target(void) | 525 | int get_nohz_timer_target(void) |
527 | { | 526 | { |
@@ -550,6 +549,7 @@ unlock: | |||
550 | rcu_read_unlock(); | 549 | rcu_read_unlock(); |
551 | return cpu; | 550 | return cpu; |
552 | } | 551 | } |
552 | |||
553 | /* | 553 | /* |
554 | * When add_timer_on() enqueues a timer into the timer wheel of an | 554 | * When add_timer_on() enqueues a timer into the timer wheel of an |
555 | * idle CPU then this timer might expire before the next timer event | 555 | * idle CPU then this timer might expire before the next timer event |
@@ -784,60 +784,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
784 | dequeue_task(rq, p, flags); | 784 | dequeue_task(rq, p, flags); |
785 | } | 785 | } |
786 | 786 | ||
787 | static void update_rq_clock_task(struct rq *rq, s64 delta) | ||
788 | { | ||
789 | /* | ||
790 | * In theory, the compile should just see 0 here, and optimize out the call | ||
791 | * to sched_rt_avg_update. But I don't trust it... | ||
792 | */ | ||
793 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) | ||
794 | s64 steal = 0, irq_delta = 0; | ||
795 | #endif | ||
796 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
797 | irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; | ||
798 | |||
799 | /* | ||
800 | * Since irq_time is only updated on {soft,}irq_exit, we might run into | ||
801 | * this case when a previous update_rq_clock() happened inside a | ||
802 | * {soft,}irq region. | ||
803 | * | ||
804 | * When this happens, we stop ->clock_task and only update the | ||
805 | * prev_irq_time stamp to account for the part that fit, so that a next | ||
806 | * update will consume the rest. This ensures ->clock_task is | ||
807 | * monotonic. | ||
808 | * | ||
809 | * It does however cause some slight miss-attribution of {soft,}irq | ||
810 | * time, a more accurate solution would be to update the irq_time using | ||
811 | * the current rq->clock timestamp, except that would require using | ||
812 | * atomic ops. | ||
813 | */ | ||
814 | if (irq_delta > delta) | ||
815 | irq_delta = delta; | ||
816 | |||
817 | rq->prev_irq_time += irq_delta; | ||
818 | delta -= irq_delta; | ||
819 | #endif | ||
820 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | ||
821 | if (static_key_false((¶virt_steal_rq_enabled))) { | ||
822 | steal = paravirt_steal_clock(cpu_of(rq)); | ||
823 | steal -= rq->prev_steal_time_rq; | ||
824 | |||
825 | if (unlikely(steal > delta)) | ||
826 | steal = delta; | ||
827 | |||
828 | rq->prev_steal_time_rq += steal; | ||
829 | delta -= steal; | ||
830 | } | ||
831 | #endif | ||
832 | |||
833 | rq->clock_task += delta; | ||
834 | |||
835 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) | ||
836 | if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) | ||
837 | sched_rt_avg_update(rq, irq_delta + steal); | ||
838 | #endif | ||
839 | } | ||
840 | |||
841 | void sched_set_stop_task(int cpu, struct task_struct *stop) | 787 | void sched_set_stop_task(int cpu, struct task_struct *stop) |
842 | { | 788 | { |
843 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | 789 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; |
@@ -1018,7 +964,7 @@ struct migration_arg { | |||
1018 | }; | 964 | }; |
1019 | 965 | ||
1020 | /* | 966 | /* |
1021 | * Move (not current) task off this cpu, onto dest cpu. We're doing | 967 | * Move (not current) task off this CPU, onto the destination CPU. We're doing |
1022 | * this because either it can't run here any more (set_cpus_allowed() | 968 | * this because either it can't run here any more (set_cpus_allowed() |
1023 | * away from this CPU, or CPU going down), or because we're | 969 | * away from this CPU, or CPU going down), or because we're |
1024 | * attempting to rebalance this task on exec (sched_exec). | 970 | * attempting to rebalance this task on exec (sched_exec). |
@@ -1052,8 +998,8 @@ static int migration_cpu_stop(void *data) | |||
1052 | struct rq *rq = this_rq(); | 998 | struct rq *rq = this_rq(); |
1053 | 999 | ||
1054 | /* | 1000 | /* |
1055 | * The original target cpu might have gone down and we might | 1001 | * The original target CPU might have gone down and we might |
1056 | * be on another cpu but it doesn't matter. | 1002 | * be on another CPU but it doesn't matter. |
1057 | */ | 1003 | */ |
1058 | local_irq_disable(); | 1004 | local_irq_disable(); |
1059 | /* | 1005 | /* |
@@ -1171,7 +1117,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, | |||
1171 | if (p->flags & PF_KTHREAD) { | 1117 | if (p->flags & PF_KTHREAD) { |
1172 | /* | 1118 | /* |
1173 | * For kernel threads that do indeed end up on online && | 1119 | * For kernel threads that do indeed end up on online && |
1174 | * !active we want to ensure they are strict per-cpu threads. | 1120 | * !active we want to ensure they are strict per-CPU threads. |
1175 | */ | 1121 | */ |
1176 | WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && | 1122 | WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && |
1177 | !cpumask_intersects(new_mask, cpu_active_mask) && | 1123 | !cpumask_intersects(new_mask, cpu_active_mask) && |
@@ -1195,9 +1141,9 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, | |||
1195 | * OK, since we're going to drop the lock immediately | 1141 | * OK, since we're going to drop the lock immediately |
1196 | * afterwards anyway. | 1142 | * afterwards anyway. |
1197 | */ | 1143 | */ |
1198 | lockdep_unpin_lock(&rq->lock, rf.cookie); | 1144 | rq_unpin_lock(rq, &rf); |
1199 | rq = move_queued_task(rq, p, dest_cpu); | 1145 | rq = move_queued_task(rq, p, dest_cpu); |
1200 | lockdep_repin_lock(&rq->lock, rf.cookie); | 1146 | rq_repin_lock(rq, &rf); |
1201 | } | 1147 | } |
1202 | out: | 1148 | out: |
1203 | task_rq_unlock(rq, p, &rf); | 1149 | task_rq_unlock(rq, p, &rf); |
@@ -1276,7 +1222,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) | |||
1276 | /* | 1222 | /* |
1277 | * Task isn't running anymore; make it appear like we migrated | 1223 | * Task isn't running anymore; make it appear like we migrated |
1278 | * it before it went to sleep. This means on wakeup we make the | 1224 | * it before it went to sleep. This means on wakeup we make the |
1279 | * previous cpu our target instead of where it really is. | 1225 | * previous CPU our target instead of where it really is. |
1280 | */ | 1226 | */ |
1281 | p->wake_cpu = cpu; | 1227 | p->wake_cpu = cpu; |
1282 | } | 1228 | } |
@@ -1508,12 +1454,12 @@ EXPORT_SYMBOL_GPL(kick_process); | |||
1508 | * | 1454 | * |
1509 | * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, | 1455 | * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, |
1510 | * see __set_cpus_allowed_ptr(). At this point the newly online | 1456 | * see __set_cpus_allowed_ptr(). At this point the newly online |
1511 | * cpu isn't yet part of the sched domains, and balancing will not | 1457 | * CPU isn't yet part of the sched domains, and balancing will not |
1512 | * see it. | 1458 | * see it. |
1513 | * | 1459 | * |
1514 | * - on cpu-down we clear cpu_active() to mask the sched domains and | 1460 | * - on CPU-down we clear cpu_active() to mask the sched domains and |
1515 | * avoid the load balancer to place new tasks on the to be removed | 1461 | * avoid the load balancer to place new tasks on the to be removed |
1516 | * cpu. Existing tasks will remain running there and will be taken | 1462 | * CPU. Existing tasks will remain running there and will be taken |
1517 | * off. | 1463 | * off. |
1518 | * | 1464 | * |
1519 | * This means that fallback selection must not select !active CPUs. | 1465 | * This means that fallback selection must not select !active CPUs. |
@@ -1529,9 +1475,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
1529 | int dest_cpu; | 1475 | int dest_cpu; |
1530 | 1476 | ||
1531 | /* | 1477 | /* |
1532 | * If the node that the cpu is on has been offlined, cpu_to_node() | 1478 | * If the node that the CPU is on has been offlined, cpu_to_node() |
1533 | * will return -1. There is no cpu on the node, and we should | 1479 | * will return -1. There is no CPU on the node, and we should |
1534 | * select the cpu on the other node. | 1480 | * select the CPU on the other node. |
1535 | */ | 1481 | */ |
1536 | if (nid != -1) { | 1482 | if (nid != -1) { |
1537 | nodemask = cpumask_of_node(nid); | 1483 | nodemask = cpumask_of_node(nid); |
@@ -1563,7 +1509,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
1563 | state = possible; | 1509 | state = possible; |
1564 | break; | 1510 | break; |
1565 | } | 1511 | } |
1566 | /* fall-through */ | 1512 | /* Fall-through */ |
1567 | case possible: | 1513 | case possible: |
1568 | do_set_cpus_allowed(p, cpu_possible_mask); | 1514 | do_set_cpus_allowed(p, cpu_possible_mask); |
1569 | state = fail; | 1515 | state = fail; |
@@ -1607,7 +1553,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) | |||
1607 | /* | 1553 | /* |
1608 | * In order not to call set_task_cpu() on a blocking task we need | 1554 | * In order not to call set_task_cpu() on a blocking task we need |
1609 | * to rely on ttwu() to place the task on a valid ->cpus_allowed | 1555 | * to rely on ttwu() to place the task on a valid ->cpus_allowed |
1610 | * cpu. | 1556 | * CPU. |
1611 | * | 1557 | * |
1612 | * Since this is common to all placement strategies, this lives here. | 1558 | * Since this is common to all placement strategies, this lives here. |
1613 | * | 1559 | * |
@@ -1681,7 +1627,7 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl | |||
1681 | activate_task(rq, p, en_flags); | 1627 | activate_task(rq, p, en_flags); |
1682 | p->on_rq = TASK_ON_RQ_QUEUED; | 1628 | p->on_rq = TASK_ON_RQ_QUEUED; |
1683 | 1629 | ||
1684 | /* if a worker is waking up, notify workqueue */ | 1630 | /* If a worker is waking up, notify the workqueue: */ |
1685 | if (p->flags & PF_WQ_WORKER) | 1631 | if (p->flags & PF_WQ_WORKER) |
1686 | wq_worker_waking_up(p, cpu_of(rq)); | 1632 | wq_worker_waking_up(p, cpu_of(rq)); |
1687 | } | 1633 | } |
@@ -1690,7 +1636,7 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl | |||
1690 | * Mark the task runnable and perform wakeup-preemption. | 1636 | * Mark the task runnable and perform wakeup-preemption. |
1691 | */ | 1637 | */ |
1692 | static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, | 1638 | static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, |
1693 | struct pin_cookie cookie) | 1639 | struct rq_flags *rf) |
1694 | { | 1640 | { |
1695 | check_preempt_curr(rq, p, wake_flags); | 1641 | check_preempt_curr(rq, p, wake_flags); |
1696 | p->state = TASK_RUNNING; | 1642 | p->state = TASK_RUNNING; |
@@ -1702,9 +1648,9 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, | |||
1702 | * Our task @p is fully woken up and running; so its safe to | 1648 | * Our task @p is fully woken up and running; so its safe to |
1703 | * drop the rq->lock, hereafter rq is only used for statistics. | 1649 | * drop the rq->lock, hereafter rq is only used for statistics. |
1704 | */ | 1650 | */ |
1705 | lockdep_unpin_lock(&rq->lock, cookie); | 1651 | rq_unpin_lock(rq, rf); |
1706 | p->sched_class->task_woken(rq, p); | 1652 | p->sched_class->task_woken(rq, p); |
1707 | lockdep_repin_lock(&rq->lock, cookie); | 1653 | rq_repin_lock(rq, rf); |
1708 | } | 1654 | } |
1709 | 1655 | ||
1710 | if (rq->idle_stamp) { | 1656 | if (rq->idle_stamp) { |
@@ -1723,7 +1669,7 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, | |||
1723 | 1669 | ||
1724 | static void | 1670 | static void |
1725 | ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, | 1671 | ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, |
1726 | struct pin_cookie cookie) | 1672 | struct rq_flags *rf) |
1727 | { | 1673 | { |
1728 | int en_flags = ENQUEUE_WAKEUP; | 1674 | int en_flags = ENQUEUE_WAKEUP; |
1729 | 1675 | ||
@@ -1738,7 +1684,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, | |||
1738 | #endif | 1684 | #endif |
1739 | 1685 | ||
1740 | ttwu_activate(rq, p, en_flags); | 1686 | ttwu_activate(rq, p, en_flags); |
1741 | ttwu_do_wakeup(rq, p, wake_flags, cookie); | 1687 | ttwu_do_wakeup(rq, p, wake_flags, rf); |
1742 | } | 1688 | } |
1743 | 1689 | ||
1744 | /* | 1690 | /* |
@@ -1757,7 +1703,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) | |||
1757 | if (task_on_rq_queued(p)) { | 1703 | if (task_on_rq_queued(p)) { |
1758 | /* check_preempt_curr() may use rq clock */ | 1704 | /* check_preempt_curr() may use rq clock */ |
1759 | update_rq_clock(rq); | 1705 | update_rq_clock(rq); |
1760 | ttwu_do_wakeup(rq, p, wake_flags, rf.cookie); | 1706 | ttwu_do_wakeup(rq, p, wake_flags, &rf); |
1761 | ret = 1; | 1707 | ret = 1; |
1762 | } | 1708 | } |
1763 | __task_rq_unlock(rq, &rf); | 1709 | __task_rq_unlock(rq, &rf); |
@@ -1770,15 +1716,15 @@ void sched_ttwu_pending(void) | |||
1770 | { | 1716 | { |
1771 | struct rq *rq = this_rq(); | 1717 | struct rq *rq = this_rq(); |
1772 | struct llist_node *llist = llist_del_all(&rq->wake_list); | 1718 | struct llist_node *llist = llist_del_all(&rq->wake_list); |
1773 | struct pin_cookie cookie; | ||
1774 | struct task_struct *p; | 1719 | struct task_struct *p; |
1775 | unsigned long flags; | 1720 | unsigned long flags; |
1721 | struct rq_flags rf; | ||
1776 | 1722 | ||
1777 | if (!llist) | 1723 | if (!llist) |
1778 | return; | 1724 | return; |
1779 | 1725 | ||
1780 | raw_spin_lock_irqsave(&rq->lock, flags); | 1726 | raw_spin_lock_irqsave(&rq->lock, flags); |
1781 | cookie = lockdep_pin_lock(&rq->lock); | 1727 | rq_pin_lock(rq, &rf); |
1782 | 1728 | ||
1783 | while (llist) { | 1729 | while (llist) { |
1784 | int wake_flags = 0; | 1730 | int wake_flags = 0; |
@@ -1789,10 +1735,10 @@ void sched_ttwu_pending(void) | |||
1789 | if (p->sched_remote_wakeup) | 1735 | if (p->sched_remote_wakeup) |
1790 | wake_flags = WF_MIGRATED; | 1736 | wake_flags = WF_MIGRATED; |
1791 | 1737 | ||
1792 | ttwu_do_activate(rq, p, wake_flags, cookie); | 1738 | ttwu_do_activate(rq, p, wake_flags, &rf); |
1793 | } | 1739 | } |
1794 | 1740 | ||
1795 | lockdep_unpin_lock(&rq->lock, cookie); | 1741 | rq_unpin_lock(rq, &rf); |
1796 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 1742 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
1797 | } | 1743 | } |
1798 | 1744 | ||
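sched_ttwu_pending() above is the consumer half of the remote wakeup path; its producer, ttwu_queue_remote(), is untouched by this diff and therefore not shown. For context, a simplified sketch of its assumed shape (the real version also avoids the IPI for polling idle CPUs):

static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
{
        struct rq *rq = cpu_rq(cpu);

        /* Record whether the wakeup migrated the task ... */
        p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);

        /* ... push it onto the target CPU's wake_list and kick that CPU;
         * sched_ttwu_pending() then drains the list under rq->lock. */
        if (llist_add(&p->wake_entry, &rq->wake_list))
                smp_send_reschedule(cpu);
}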
@@ -1864,7 +1810,7 @@ void wake_up_if_idle(int cpu) | |||
1864 | raw_spin_lock_irqsave(&rq->lock, flags); | 1810 | raw_spin_lock_irqsave(&rq->lock, flags); |
1865 | if (is_idle_task(rq->curr)) | 1811 | if (is_idle_task(rq->curr)) |
1866 | smp_send_reschedule(cpu); | 1812 | smp_send_reschedule(cpu); |
1867 | /* Else cpu is not in idle, do nothing here */ | 1813 | /* Else CPU is not idle, do nothing here: */ |
1868 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 1814 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
1869 | } | 1815 | } |
1870 | 1816 | ||
@@ -1881,20 +1827,20 @@ bool cpus_share_cache(int this_cpu, int that_cpu) | |||
1881 | static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) | 1827 | static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) |
1882 | { | 1828 | { |
1883 | struct rq *rq = cpu_rq(cpu); | 1829 | struct rq *rq = cpu_rq(cpu); |
1884 | struct pin_cookie cookie; | 1830 | struct rq_flags rf; |
1885 | 1831 | ||
1886 | #if defined(CONFIG_SMP) | 1832 | #if defined(CONFIG_SMP) |
1887 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { | 1833 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { |
1888 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ | 1834 | sched_clock_cpu(cpu); /* Sync clocks across CPUs */ |
1889 | ttwu_queue_remote(p, cpu, wake_flags); | 1835 | ttwu_queue_remote(p, cpu, wake_flags); |
1890 | return; | 1836 | return; |
1891 | } | 1837 | } |
1892 | #endif | 1838 | #endif |
1893 | 1839 | ||
1894 | raw_spin_lock(&rq->lock); | 1840 | raw_spin_lock(&rq->lock); |
1895 | cookie = lockdep_pin_lock(&rq->lock); | 1841 | rq_pin_lock(rq, &rf); |
1896 | ttwu_do_activate(rq, p, wake_flags, cookie); | 1842 | ttwu_do_activate(rq, p, wake_flags, &rf); |
1897 | lockdep_unpin_lock(&rq->lock, cookie); | 1843 | rq_unpin_lock(rq, &rf); |
1898 | raw_spin_unlock(&rq->lock); | 1844 | raw_spin_unlock(&rq->lock); |
1899 | } | 1845 | } |
1900 | 1846 | ||
@@ -1904,8 +1850,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) | |||
1904 | * MIGRATION | 1850 | * MIGRATION |
1905 | * | 1851 | * |
1906 | * The basic program-order guarantee on SMP systems is that when a task [t] | 1852 | * The basic program-order guarantee on SMP systems is that when a task [t] |
1907 | * migrates, all its activity on its old cpu [c0] happens-before any subsequent | 1853 | * migrates, all its activity on its old CPU [c0] happens-before any subsequent |
1908 | * execution on its new cpu [c1]. | 1854 | * execution on its new CPU [c1]. |
1909 | * | 1855 | * |
1910 | * For migration (of runnable tasks) this is provided by the following means: | 1856 | * For migration (of runnable tasks) this is provided by the following means: |
1911 | * | 1857 | * |
@@ -1916,7 +1862,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) | |||
1916 | * | 1862 | * |
1917 | * Transitivity guarantees that B happens after A and C after B. | 1863 | * Transitivity guarantees that B happens after A and C after B. |
1918 | * Note: we only require RCpc transitivity. | 1864 | * Note: we only require RCpc transitivity. |
1919 | * Note: the cpu doing B need not be c0 or c1 | 1865 | * Note: the CPU doing B need not be c0 or c1 |
1920 | * | 1866 | * |
1921 | * Example: | 1867 | * Example: |
1922 | * | 1868 | * |
@@ -2024,7 +1970,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
2024 | 1970 | ||
2025 | trace_sched_waking(p); | 1971 | trace_sched_waking(p); |
2026 | 1972 | ||
2027 | success = 1; /* we're going to change ->state */ | 1973 | /* We're going to change ->state: */ |
1974 | success = 1; | ||
2028 | cpu = task_cpu(p); | 1975 | cpu = task_cpu(p); |
2029 | 1976 | ||
2030 | /* | 1977 | /* |
@@ -2073,7 +2020,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
2073 | smp_rmb(); | 2020 | smp_rmb(); |
2074 | 2021 | ||
2075 | /* | 2022 | /* |
2076 | * If the owning (remote) cpu is still in the middle of schedule() with | 2023 | * If the owning (remote) CPU is still in the middle of schedule() with |
2077 | * this task as prev, wait until its done referencing the task. | 2024 | * this task as prev, wait until its done referencing the task. |
2078 | * | 2025 | * |
2079 | * Pairs with the smp_store_release() in finish_lock_switch(). | 2026 | * Pairs with the smp_store_release() in finish_lock_switch(). |
@@ -2086,11 +2033,24 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
2086 | p->sched_contributes_to_load = !!task_contributes_to_load(p); | 2033 | p->sched_contributes_to_load = !!task_contributes_to_load(p); |
2087 | p->state = TASK_WAKING; | 2034 | p->state = TASK_WAKING; |
2088 | 2035 | ||
2036 | if (p->in_iowait) { | ||
2037 | delayacct_blkio_end(); | ||
2038 | atomic_dec(&task_rq(p)->nr_iowait); | ||
2039 | } | ||
2040 | |||
2089 | cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); | 2041 | cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); |
2090 | if (task_cpu(p) != cpu) { | 2042 | if (task_cpu(p) != cpu) { |
2091 | wake_flags |= WF_MIGRATED; | 2043 | wake_flags |= WF_MIGRATED; |
2092 | set_task_cpu(p, cpu); | 2044 | set_task_cpu(p, cpu); |
2093 | } | 2045 | } |
2046 | |||
2047 | #else /* CONFIG_SMP */ | ||
2048 | |||
2049 | if (p->in_iowait) { | ||
2050 | delayacct_blkio_end(); | ||
2051 | atomic_dec(&task_rq(p)->nr_iowait); | ||
2052 | } | ||
2053 | |||
2094 | #endif /* CONFIG_SMP */ | 2054 | #endif /* CONFIG_SMP */ |
2095 | 2055 | ||
2096 | ttwu_queue(p, cpu, wake_flags); | 2056 | ttwu_queue(p, cpu, wake_flags); |
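The new p->in_iowait handling here (and its twin under the !CONFIG_SMP branch) moves the nr_iowait and delayacct bookkeeping from the sleeping side into the wakeup path; the matching atomic_inc() is added to __schedule() further down in this diff. That leaves the blocking side with little more to do than flag the task before sleeping. A simplified sketch, assuming io_schedule_timeout() ends up shaped roughly like this:

long __sched io_schedule_timeout(long timeout)
{
        long ret;

        current->in_iowait = 1;                 /* __schedule() accounts the block */
        blk_schedule_flush_plug(current);       /* flush queued block I/O first */

        ret = schedule_timeout(timeout);

        current->in_iowait = 0;                 /* accounting was handled by ttwu()/__schedule() */
        return ret;
}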
@@ -2111,7 +2071,7 @@ out: | |||
2111 | * ensure that this_rq() is locked, @p is bound to this_rq() and not | 2071 | * ensure that this_rq() is locked, @p is bound to this_rq() and not |
2112 | * the current task. | 2072 | * the current task. |
2113 | */ | 2073 | */ |
2114 | static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie) | 2074 | static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) |
2115 | { | 2075 | { |
2116 | struct rq *rq = task_rq(p); | 2076 | struct rq *rq = task_rq(p); |
2117 | 2077 | ||
@@ -2128,11 +2088,11 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie | |||
2128 | * disabled avoiding further scheduler activity on it and we've | 2088 | * disabled avoiding further scheduler activity on it and we've |
2129 | * not yet picked a replacement task. | 2089 | * not yet picked a replacement task. |
2130 | */ | 2090 | */ |
2131 | lockdep_unpin_lock(&rq->lock, cookie); | 2091 | rq_unpin_lock(rq, rf); |
2132 | raw_spin_unlock(&rq->lock); | 2092 | raw_spin_unlock(&rq->lock); |
2133 | raw_spin_lock(&p->pi_lock); | 2093 | raw_spin_lock(&p->pi_lock); |
2134 | raw_spin_lock(&rq->lock); | 2094 | raw_spin_lock(&rq->lock); |
2135 | lockdep_repin_lock(&rq->lock, cookie); | 2095 | rq_repin_lock(rq, rf); |
2136 | } | 2096 | } |
2137 | 2097 | ||
2138 | if (!(p->state & TASK_NORMAL)) | 2098 | if (!(p->state & TASK_NORMAL)) |
@@ -2140,10 +2100,15 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie | |||
2140 | 2100 | ||
2141 | trace_sched_waking(p); | 2101 | trace_sched_waking(p); |
2142 | 2102 | ||
2143 | if (!task_on_rq_queued(p)) | 2103 | if (!task_on_rq_queued(p)) { |
2104 | if (p->in_iowait) { | ||
2105 | delayacct_blkio_end(); | ||
2106 | atomic_dec(&rq->nr_iowait); | ||
2107 | } | ||
2144 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | 2108 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); |
2109 | } | ||
2145 | 2110 | ||
2146 | ttwu_do_wakeup(rq, p, 0, cookie); | 2111 | ttwu_do_wakeup(rq, p, 0, rf); |
2147 | ttwu_stat(p, smp_processor_id(), 0); | 2112 | ttwu_stat(p, smp_processor_id(), 0); |
2148 | out: | 2113 | out: |
2149 | raw_spin_unlock(&p->pi_lock); | 2114 | raw_spin_unlock(&p->pi_lock); |
@@ -2427,7 +2392,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
2427 | */ | 2392 | */ |
2428 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 2393 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2429 | /* | 2394 | /* |
2430 | * We're setting the cpu for the first time, we don't migrate, | 2395 | * We're setting the CPU for the first time, we don't migrate, |
2431 | * so use __set_task_cpu(). | 2396 | * so use __set_task_cpu(). |
2432 | */ | 2397 | */ |
2433 | __set_task_cpu(p, cpu); | 2398 | __set_task_cpu(p, cpu); |
@@ -2570,7 +2535,7 @@ void wake_up_new_task(struct task_struct *p) | |||
2570 | /* | 2535 | /* |
2571 | * Fork balancing, do it here and not earlier because: | 2536 | * Fork balancing, do it here and not earlier because: |
2572 | * - cpus_allowed can change in the fork path | 2537 | * - cpus_allowed can change in the fork path |
2573 | * - any previously selected cpu might disappear through hotplug | 2538 | * - any previously selected CPU might disappear through hotplug |
2574 | * | 2539 | * |
2575 | * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, | 2540 | * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, |
2576 | * as we're not fully set-up yet. | 2541 | * as we're not fully set-up yet. |
@@ -2578,6 +2543,7 @@ void wake_up_new_task(struct task_struct *p) | |||
2578 | __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); | 2543 | __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); |
2579 | #endif | 2544 | #endif |
2580 | rq = __task_rq_lock(p, &rf); | 2545 | rq = __task_rq_lock(p, &rf); |
2546 | update_rq_clock(rq); | ||
2581 | post_init_entity_util_avg(&p->se); | 2547 | post_init_entity_util_avg(&p->se); |
2582 | 2548 | ||
2583 | activate_task(rq, p, 0); | 2549 | activate_task(rq, p, 0); |
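wake_up_new_task() now refreshes the rq clock immediately after taking the lock, before post_init_entity_util_avg() and activate_task() run; the same update_rq_clock() call is added to rt_mutex_setprio(), set_user_nice() and __sched_setscheduler() later in this diff. Together with RQCF_UPDATED this presumably enables a SCHED_DEBUG check that flags enqueue/dequeue paths running on a stale clock; a sketch of the assumed assertion, in the shape of a helper the rq_clock() accessors could call (SCHED_WARN_ON() being the scheduler's WARN_ONCE-style wrapper):

static inline void assert_clock_updated(struct rq *rq)
{
        /*
         * If clock_update_flags is below RQCF_ACT_SKIP, nobody has called
         * update_rq_clock() since the lock was pinned and we are not in a
         * deliberate skip section either - that is the bug being hunted.
         */
        SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP);
}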
@@ -2590,9 +2556,9 @@ void wake_up_new_task(struct task_struct *p) | |||
2590 | * Nothing relies on rq->lock after this, so its fine to | 2556 | * Nothing relies on rq->lock after this, so its fine to |
2591 | * drop it. | 2557 | * drop it. |
2592 | */ | 2558 | */ |
2593 | lockdep_unpin_lock(&rq->lock, rf.cookie); | 2559 | rq_unpin_lock(rq, &rf); |
2594 | p->sched_class->task_woken(rq, p); | 2560 | p->sched_class->task_woken(rq, p); |
2595 | lockdep_repin_lock(&rq->lock, rf.cookie); | 2561 | rq_repin_lock(rq, &rf); |
2596 | } | 2562 | } |
2597 | #endif | 2563 | #endif |
2598 | task_rq_unlock(rq, p, &rf); | 2564 | task_rq_unlock(rq, p, &rf); |
@@ -2861,7 +2827,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) | |||
2861 | */ | 2827 | */ |
2862 | static __always_inline struct rq * | 2828 | static __always_inline struct rq * |
2863 | context_switch(struct rq *rq, struct task_struct *prev, | 2829 | context_switch(struct rq *rq, struct task_struct *prev, |
2864 | struct task_struct *next, struct pin_cookie cookie) | 2830 | struct task_struct *next, struct rq_flags *rf) |
2865 | { | 2831 | { |
2866 | struct mm_struct *mm, *oldmm; | 2832 | struct mm_struct *mm, *oldmm; |
2867 | 2833 | ||
@@ -2887,13 +2853,16 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2887 | prev->active_mm = NULL; | 2853 | prev->active_mm = NULL; |
2888 | rq->prev_mm = oldmm; | 2854 | rq->prev_mm = oldmm; |
2889 | } | 2855 | } |
2856 | |||
2857 | rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); | ||
2858 | |||
2890 | /* | 2859 | /* |
2891 | * Since the runqueue lock will be released by the next | 2860 | * Since the runqueue lock will be released by the next |
2892 | * task (which is an invalid locking op but in the case | 2861 | * task (which is an invalid locking op but in the case |
2893 | * of the scheduler it's an obvious special-case), so we | 2862 | * of the scheduler it's an obvious special-case), so we |
2894 | * do an early lockdep release here: | 2863 | * do an early lockdep release here: |
2895 | */ | 2864 | */ |
2896 | lockdep_unpin_lock(&rq->lock, cookie); | 2865 | rq_unpin_lock(rq, rf); |
2897 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | 2866 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); |
2898 | 2867 | ||
2899 | /* Here we just switch the register state and the stack. */ | 2868 | /* Here we just switch the register state and the stack. */ |
@@ -2920,7 +2889,7 @@ unsigned long nr_running(void) | |||
2920 | } | 2889 | } |
2921 | 2890 | ||
2922 | /* | 2891 | /* |
2923 | * Check if only the current task is running on the cpu. | 2892 | * Check if only the current task is running on the CPU. |
2924 | * | 2893 | * |
2925 | * Caution: this function does not check that the caller has disabled | 2894 | * Caution: this function does not check that the caller has disabled |
2926 | * preemption, thus the result might have a time-of-check-to-time-of-use | 2895 | * preemption, thus the result might have a time-of-check-to-time-of-use |
@@ -2949,6 +2918,36 @@ unsigned long long nr_context_switches(void) | |||
2949 | return sum; | 2918 | return sum; |
2950 | } | 2919 | } |
2951 | 2920 | ||
2921 | /* | ||
2922 | * IO-wait accounting, and how its mostly bollocks (on SMP). | ||
2923 | * | ||
2924 | * The idea behind IO-wait account is to account the idle time that we could | ||
2925 | * have spend running if it were not for IO. That is, if we were to improve the | ||
2926 | * storage performance, we'd have a proportional reduction in IO-wait time. | ||
2927 | * | ||
2928 | * This all works nicely on UP, where, when a task blocks on IO, we account | ||
2929 | * idle time as IO-wait, because if the storage were faster, it could've been | ||
2930 | * running and we'd not be idle. | ||
2931 | * | ||
2932 | * This has been extended to SMP, by doing the same for each CPU. This however | ||
2933 | * is broken. | ||
2934 | * | ||
2935 | * Imagine for instance the case where two tasks block on one CPU, only the one | ||
2936 | * CPU will have IO-wait accounted, while the other has regular idle. Even | ||
2937 | * though, if the storage were faster, both could've ran at the same time, | ||
2938 | * utilising both CPUs. | ||
2939 | * | ||
2940 | * This means, that when looking globally, the current IO-wait accounting on | ||
2941 | * SMP is a lower bound, by reason of under accounting. | ||
2942 | * | ||
2943 | * Worse, since the numbers are provided per CPU, they are sometimes | ||
2944 | * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly | ||
2945 | * associated with any one particular CPU, it can wake to another CPU than it | ||
2946 | * blocked on. This means the per CPU IO-wait number is meaningless. | ||
2947 | * | ||
2948 | * Task CPU affinities can make all that even more 'interesting'. | ||
2949 | */ | ||
2950 | |||
2952 | unsigned long nr_iowait(void) | 2951 | unsigned long nr_iowait(void) |
2953 | { | 2952 | { |
2954 | unsigned long i, sum = 0; | 2953 | unsigned long i, sum = 0; |
@@ -2959,6 +2958,13 @@ unsigned long nr_iowait(void) | |||
2959 | return sum; | 2958 | return sum; |
2960 | } | 2959 | } |
2961 | 2960 | ||
2961 | /* | ||
2962 | * Consumers of these two interfaces, like for example the cpufreq menu | ||
2963 | * governor are using nonsensical data. Boosting frequency for a CPU that has | ||
2964 | * IO-wait which might not even end up running the task when it does become | ||
2965 | * runnable. | ||
2966 | */ | ||
2967 | |||
2962 | unsigned long nr_iowait_cpu(int cpu) | 2968 | unsigned long nr_iowait_cpu(int cpu) |
2963 | { | 2969 | { |
2964 | struct rq *this = cpu_rq(cpu); | 2970 | struct rq *this = cpu_rq(cpu); |
@@ -3042,8 +3048,8 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
3042 | * So we have a optimization chance when the task's delta_exec is 0. | 3048 | * So we have a optimization chance when the task's delta_exec is 0. |
3043 | * Reading ->on_cpu is racy, but this is ok. | 3049 | * Reading ->on_cpu is racy, but this is ok. |
3044 | * | 3050 | * |
3045 | * If we race with it leaving cpu, we'll take a lock. So we're correct. | 3051 | * If we race with it leaving CPU, we'll take a lock. So we're correct. |
3046 | * If we race with it entering cpu, unaccounted time is 0. This is | 3052 | * If we race with it entering CPU, unaccounted time is 0. This is |
3047 | * indistinguishable from the read occurring a few cycles earlier. | 3053 | * indistinguishable from the read occurring a few cycles earlier. |
3048 | * If we see ->on_cpu without ->on_rq, the task is leaving, and has | 3054 | * If we see ->on_cpu without ->on_rq, the task is leaving, and has |
3049 | * been accounted, so we're correct here as well. | 3055 | * been accounted, so we're correct here as well. |
@@ -3257,31 +3263,30 @@ static inline void schedule_debug(struct task_struct *prev) | |||
3257 | * Pick up the highest-prio task: | 3263 | * Pick up the highest-prio task: |
3258 | */ | 3264 | */ |
3259 | static inline struct task_struct * | 3265 | static inline struct task_struct * |
3260 | pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) | 3266 | pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) |
3261 | { | 3267 | { |
3262 | const struct sched_class *class = &fair_sched_class; | 3268 | const struct sched_class *class; |
3263 | struct task_struct *p; | 3269 | struct task_struct *p; |
3264 | 3270 | ||
3265 | /* | 3271 | /* |
3266 | * Optimization: we know that if all tasks are in | 3272 | * Optimization: we know that if all tasks are in |
3267 | * the fair class we can call that function directly: | 3273 | * the fair class we can call that function directly: |
3268 | */ | 3274 | */ |
3269 | if (likely(prev->sched_class == class && | 3275 | if (likely(rq->nr_running == rq->cfs.h_nr_running)) { |
3270 | rq->nr_running == rq->cfs.h_nr_running)) { | 3276 | p = fair_sched_class.pick_next_task(rq, prev, rf); |
3271 | p = fair_sched_class.pick_next_task(rq, prev, cookie); | ||
3272 | if (unlikely(p == RETRY_TASK)) | 3277 | if (unlikely(p == RETRY_TASK)) |
3273 | goto again; | 3278 | goto again; |
3274 | 3279 | ||
3275 | /* assumes fair_sched_class->next == idle_sched_class */ | 3280 | /* Assumes fair_sched_class->next == idle_sched_class */ |
3276 | if (unlikely(!p)) | 3281 | if (unlikely(!p)) |
3277 | p = idle_sched_class.pick_next_task(rq, prev, cookie); | 3282 | p = idle_sched_class.pick_next_task(rq, prev, rf); |
3278 | 3283 | ||
3279 | return p; | 3284 | return p; |
3280 | } | 3285 | } |
3281 | 3286 | ||
3282 | again: | 3287 | again: |
3283 | for_each_class(class) { | 3288 | for_each_class(class) { |
3284 | p = class->pick_next_task(rq, prev, cookie); | 3289 | p = class->pick_next_task(rq, prev, rf); |
3285 | if (p) { | 3290 | if (p) { |
3286 | if (unlikely(p == RETRY_TASK)) | 3291 | if (unlikely(p == RETRY_TASK)) |
3287 | goto again; | 3292 | goto again; |
@@ -3289,7 +3294,8 @@ again: | |||
3289 | } | 3294 | } |
3290 | } | 3295 | } |
3291 | 3296 | ||
3292 | BUG(); /* the idle class will always have a runnable task */ | 3297 | /* The idle class should always have a runnable task: */ |
3298 | BUG(); | ||
3293 | } | 3299 | } |
3294 | 3300 | ||
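The rewritten fast path in pick_next_task() above now keys purely off rq->nr_running == rq->cfs.h_nr_running, i.e. 'every runnable task is a fair-class task', rather than additionally requiring prev itself to have been running in the fair class. RETRY_TASK is the sentinel a scheduling class returns when the pick has to be restarted from the highest class; it is defined in kernel/sched/sched.h, presumably along these lines:

/* Assumed definition (kernel/sched/sched.h, not part of this diff): */
#define RETRY_TASK      ((void *)-1UL)

The fall-through to idle_sched_class when the fair pick comes back NULL relies on the class ordering called out in the comment above, so an empty fair pick in the fast path can only mean the runqueue has gone idle.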
3295 | /* | 3301 | /* |
@@ -3335,7 +3341,7 @@ static void __sched notrace __schedule(bool preempt) | |||
3335 | { | 3341 | { |
3336 | struct task_struct *prev, *next; | 3342 | struct task_struct *prev, *next; |
3337 | unsigned long *switch_count; | 3343 | unsigned long *switch_count; |
3338 | struct pin_cookie cookie; | 3344 | struct rq_flags rf; |
3339 | struct rq *rq; | 3345 | struct rq *rq; |
3340 | int cpu; | 3346 | int cpu; |
3341 | 3347 | ||
@@ -3358,9 +3364,10 @@ static void __sched notrace __schedule(bool preempt) | |||
3358 | */ | 3364 | */ |
3359 | smp_mb__before_spinlock(); | 3365 | smp_mb__before_spinlock(); |
3360 | raw_spin_lock(&rq->lock); | 3366 | raw_spin_lock(&rq->lock); |
3361 | cookie = lockdep_pin_lock(&rq->lock); | 3367 | rq_pin_lock(rq, &rf); |
3362 | 3368 | ||
3363 | rq->clock_skip_update <<= 1; /* promote REQ to ACT */ | 3369 | /* Promote REQ to ACT */ |
3370 | rq->clock_update_flags <<= 1; | ||
3364 | 3371 | ||
3365 | switch_count = &prev->nivcsw; | 3372 | switch_count = &prev->nivcsw; |
3366 | if (!preempt && prev->state) { | 3373 | if (!preempt && prev->state) { |
@@ -3370,6 +3377,11 @@ static void __sched notrace __schedule(bool preempt) | |||
3370 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | 3377 | deactivate_task(rq, prev, DEQUEUE_SLEEP); |
3371 | prev->on_rq = 0; | 3378 | prev->on_rq = 0; |
3372 | 3379 | ||
3380 | if (prev->in_iowait) { | ||
3381 | atomic_inc(&rq->nr_iowait); | ||
3382 | delayacct_blkio_start(); | ||
3383 | } | ||
3384 | |||
3373 | /* | 3385 | /* |
3374 | * If a worker went to sleep, notify and ask workqueue | 3386 | * If a worker went to sleep, notify and ask workqueue |
3375 | * whether it wants to wake up a task to maintain | 3387 | * whether it wants to wake up a task to maintain |
@@ -3380,7 +3392,7 @@ static void __sched notrace __schedule(bool preempt) | |||
3380 | 3392 | ||
3381 | to_wakeup = wq_worker_sleeping(prev); | 3393 | to_wakeup = wq_worker_sleeping(prev); |
3382 | if (to_wakeup) | 3394 | if (to_wakeup) |
3383 | try_to_wake_up_local(to_wakeup, cookie); | 3395 | try_to_wake_up_local(to_wakeup, &rf); |
3384 | } | 3396 | } |
3385 | } | 3397 | } |
3386 | switch_count = &prev->nvcsw; | 3398 | switch_count = &prev->nvcsw; |
@@ -3389,10 +3401,9 @@ static void __sched notrace __schedule(bool preempt) | |||
3389 | if (task_on_rq_queued(prev)) | 3401 | if (task_on_rq_queued(prev)) |
3390 | update_rq_clock(rq); | 3402 | update_rq_clock(rq); |
3391 | 3403 | ||
3392 | next = pick_next_task(rq, prev, cookie); | 3404 | next = pick_next_task(rq, prev, &rf); |
3393 | clear_tsk_need_resched(prev); | 3405 | clear_tsk_need_resched(prev); |
3394 | clear_preempt_need_resched(); | 3406 | clear_preempt_need_resched(); |
3395 | rq->clock_skip_update = 0; | ||
3396 | 3407 | ||
3397 | if (likely(prev != next)) { | 3408 | if (likely(prev != next)) { |
3398 | rq->nr_switches++; | 3409 | rq->nr_switches++; |
@@ -3400,9 +3411,12 @@ static void __sched notrace __schedule(bool preempt) | |||
3400 | ++*switch_count; | 3411 | ++*switch_count; |
3401 | 3412 | ||
3402 | trace_sched_switch(preempt, prev, next); | 3413 | trace_sched_switch(preempt, prev, next); |
3403 | rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */ | 3414 | |
3415 | /* Also unlocks the rq: */ | ||
3416 | rq = context_switch(rq, prev, next, &rf); | ||
3404 | } else { | 3417 | } else { |
3405 | lockdep_unpin_lock(&rq->lock, cookie); | 3418 | rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); |
3419 | rq_unpin_lock(rq, &rf); | ||
3406 | raw_spin_unlock_irq(&rq->lock); | 3420 | raw_spin_unlock_irq(&rq->lock); |
3407 | } | 3421 | } |
3408 | 3422 | ||
@@ -3426,14 +3440,18 @@ void __noreturn do_task_dead(void) | |||
3426 | smp_mb(); | 3440 | smp_mb(); |
3427 | raw_spin_unlock_wait(¤t->pi_lock); | 3441 | raw_spin_unlock_wait(¤t->pi_lock); |
3428 | 3442 | ||
3429 | /* causes final put_task_struct in finish_task_switch(). */ | 3443 | /* Causes final put_task_struct in finish_task_switch(): */ |
3430 | __set_current_state(TASK_DEAD); | 3444 | __set_current_state(TASK_DEAD); |
3431 | current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ | 3445 | |
3446 | /* Tell freezer to ignore us: */ | ||
3447 | current->flags |= PF_NOFREEZE; | ||
3448 | |||
3432 | __schedule(false); | 3449 | __schedule(false); |
3433 | BUG(); | 3450 | BUG(); |
3434 | /* Avoid "noreturn function does return". */ | 3451 | |
3452 | /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ | ||
3435 | for (;;) | 3453 | for (;;) |
3436 | cpu_relax(); /* For when BUG is null */ | 3454 | cpu_relax(); |
3437 | } | 3455 | } |
3438 | 3456 | ||
3439 | static inline void sched_submit_work(struct task_struct *tsk) | 3457 | static inline void sched_submit_work(struct task_struct *tsk) |
@@ -3651,6 +3669,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3651 | BUG_ON(prio > MAX_PRIO); | 3669 | BUG_ON(prio > MAX_PRIO); |
3652 | 3670 | ||
3653 | rq = __task_rq_lock(p, &rf); | 3671 | rq = __task_rq_lock(p, &rf); |
3672 | update_rq_clock(rq); | ||
3654 | 3673 | ||
3655 | /* | 3674 | /* |
3656 | * Idle task boosting is a nono in general. There is one | 3675 | * Idle task boosting is a nono in general. There is one |
@@ -3725,7 +3744,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3725 | 3744 | ||
3726 | check_class_changed(rq, p, prev_class, oldprio); | 3745 | check_class_changed(rq, p, prev_class, oldprio); |
3727 | out_unlock: | 3746 | out_unlock: |
3728 | preempt_disable(); /* avoid rq from going away on us */ | 3747 | /* Avoid rq from going away on us: */ |
3748 | preempt_disable(); | ||
3729 | __task_rq_unlock(rq, &rf); | 3749 | __task_rq_unlock(rq, &rf); |
3730 | 3750 | ||
3731 | balance_callback(rq); | 3751 | balance_callback(rq); |
@@ -3747,6 +3767,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
3747 | * the task might be in the middle of scheduling on another CPU. | 3767 | * the task might be in the middle of scheduling on another CPU. |
3748 | */ | 3768 | */ |
3749 | rq = task_rq_lock(p, &rf); | 3769 | rq = task_rq_lock(p, &rf); |
3770 | update_rq_clock(rq); | ||
3771 | |||
3750 | /* | 3772 | /* |
3751 | * The RT priorities are set via sched_setscheduler(), but we still | 3773 | * The RT priorities are set via sched_setscheduler(), but we still |
3752 | * allow the 'normal' nice value to be set - but as expected | 3774 | * allow the 'normal' nice value to be set - but as expected |
@@ -3793,7 +3815,7 @@ EXPORT_SYMBOL(set_user_nice); | |||
3793 | */ | 3815 | */ |
3794 | int can_nice(const struct task_struct *p, const int nice) | 3816 | int can_nice(const struct task_struct *p, const int nice) |
3795 | { | 3817 | { |
3796 | /* convert nice value [19,-20] to rlimit style value [1,40] */ | 3818 | /* Convert nice value [19,-20] to rlimit style value [1,40]: */ |
3797 | int nice_rlim = nice_to_rlimit(nice); | 3819 | int nice_rlim = nice_to_rlimit(nice); |
3798 | 3820 | ||
3799 | return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || | 3821 | return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || |
@@ -3849,7 +3871,7 @@ int task_prio(const struct task_struct *p) | |||
3849 | } | 3871 | } |
3850 | 3872 | ||
3851 | /** | 3873 | /** |
3852 | * idle_cpu - is a given cpu idle currently? | 3874 | * idle_cpu - is a given CPU idle currently? |
3853 | * @cpu: the processor in question. | 3875 | * @cpu: the processor in question. |
3854 | * | 3876 | * |
3855 | * Return: 1 if the CPU is currently idle. 0 otherwise. | 3877 | * Return: 1 if the CPU is currently idle. 0 otherwise. |
@@ -3873,10 +3895,10 @@ int idle_cpu(int cpu) | |||
3873 | } | 3895 | } |
3874 | 3896 | ||
3875 | /** | 3897 | /** |
3876 | * idle_task - return the idle task for a given cpu. | 3898 | * idle_task - return the idle task for a given CPU. |
3877 | * @cpu: the processor in question. | 3899 | * @cpu: the processor in question. |
3878 | * | 3900 | * |
3879 | * Return: The idle task for the cpu @cpu. | 3901 | * Return: The idle task for the CPU @cpu. |
3880 | */ | 3902 | */ |
3881 | struct task_struct *idle_task(int cpu) | 3903 | struct task_struct *idle_task(int cpu) |
3882 | { | 3904 | { |
@@ -4042,7 +4064,7 @@ __checkparam_dl(const struct sched_attr *attr) | |||
4042 | } | 4064 | } |
4043 | 4065 | ||
4044 | /* | 4066 | /* |
4045 | * check the target process has a UID that matches the current process's | 4067 | * Check the target process has a UID that matches the current process's: |
4046 | */ | 4068 | */ |
4047 | static bool check_same_owner(struct task_struct *p) | 4069 | static bool check_same_owner(struct task_struct *p) |
4048 | { | 4070 | { |
@@ -4057,8 +4079,7 @@ static bool check_same_owner(struct task_struct *p) | |||
4057 | return match; | 4079 | return match; |
4058 | } | 4080 | } |
4059 | 4081 | ||
4060 | static bool dl_param_changed(struct task_struct *p, | 4082 | static bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) |
4061 | const struct sched_attr *attr) | ||
4062 | { | 4083 | { |
4063 | struct sched_dl_entity *dl_se = &p->dl; | 4084 | struct sched_dl_entity *dl_se = &p->dl; |
4064 | 4085 | ||
@@ -4085,10 +4106,10 @@ static int __sched_setscheduler(struct task_struct *p, | |||
4085 | int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; | 4106 | int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; |
4086 | struct rq *rq; | 4107 | struct rq *rq; |
4087 | 4108 | ||
4088 | /* may grab non-irq protected spin_locks */ | 4109 | /* May grab non-irq protected spin_locks: */ |
4089 | BUG_ON(in_interrupt()); | 4110 | BUG_ON(in_interrupt()); |
4090 | recheck: | 4111 | recheck: |
4091 | /* double check policy once rq lock held */ | 4112 | /* Double check policy once rq lock held: */ |
4092 | if (policy < 0) { | 4113 | if (policy < 0) { |
4093 | reset_on_fork = p->sched_reset_on_fork; | 4114 | reset_on_fork = p->sched_reset_on_fork; |
4094 | policy = oldpolicy = p->policy; | 4115 | policy = oldpolicy = p->policy; |
@@ -4128,11 +4149,11 @@ recheck: | |||
4128 | unsigned long rlim_rtprio = | 4149 | unsigned long rlim_rtprio = |
4129 | task_rlimit(p, RLIMIT_RTPRIO); | 4150 | task_rlimit(p, RLIMIT_RTPRIO); |
4130 | 4151 | ||
4131 | /* can't set/change the rt policy */ | 4152 | /* Can't set/change the rt policy: */ |
4132 | if (policy != p->policy && !rlim_rtprio) | 4153 | if (policy != p->policy && !rlim_rtprio) |
4133 | return -EPERM; | 4154 | return -EPERM; |
4134 | 4155 | ||
4135 | /* can't increase priority */ | 4156 | /* Can't increase priority: */ |
4136 | if (attr->sched_priority > p->rt_priority && | 4157 | if (attr->sched_priority > p->rt_priority && |
4137 | attr->sched_priority > rlim_rtprio) | 4158 | attr->sched_priority > rlim_rtprio) |
4138 | return -EPERM; | 4159 | return -EPERM; |
@@ -4156,11 +4177,11 @@ recheck: | |||
4156 | return -EPERM; | 4177 | return -EPERM; |
4157 | } | 4178 | } |
4158 | 4179 | ||
4159 | /* can't change other user's priorities */ | 4180 | /* Can't change other user's priorities: */ |
4160 | if (!check_same_owner(p)) | 4181 | if (!check_same_owner(p)) |
4161 | return -EPERM; | 4182 | return -EPERM; |
4162 | 4183 | ||
4163 | /* Normal users shall not reset the sched_reset_on_fork flag */ | 4184 | /* Normal users shall not reset the sched_reset_on_fork flag: */ |
4164 | if (p->sched_reset_on_fork && !reset_on_fork) | 4185 | if (p->sched_reset_on_fork && !reset_on_fork) |
4165 | return -EPERM; | 4186 | return -EPERM; |
4166 | } | 4187 | } |
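The permission block above caps unprivileged SCHED_FIFO/SCHED_RR changes at RLIMIT_RTPRIO and restricts them to tasks the caller owns. A small user-space illustration (not from this patch) that stays within the RT-priority rlimit when requesting SCHED_FIFO, so the kernel's -EPERM checks are satisfied:

	/* Build: cc -o set_fifo set_fifo.c */
	#include <sched.h>
	#include <stdio.h>
	#include <sys/resource.h>

	int main(void)
	{
		struct rlimit rl;
		struct sched_param sp = { 0 };

		if (getrlimit(RLIMIT_RTPRIO, &rl) != 0) {
			perror("getrlimit(RLIMIT_RTPRIO)");
			return 1;
		}
		if (rl.rlim_cur == 0) {
			fprintf(stderr, "RLIMIT_RTPRIO is 0: SCHED_FIFO needs privilege\n");
			return 1;
		}

		/* Any priority up to rl.rlim_cur passes the rlimit check above. */
		sp.sched_priority = 1;
		if (sched_setscheduler(0, SCHED_FIFO, &sp) != 0) {
			perror("sched_setscheduler");
			return 1;
		}
		printf("now SCHED_FIFO at priority %d\n", sp.sched_priority);
		return 0;
	}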
@@ -4172,16 +4193,17 @@ recheck: | |||
4172 | } | 4193 | } |
4173 | 4194 | ||
4174 | /* | 4195 | /* |
4175 | * make sure no PI-waiters arrive (or leave) while we are | 4196 | * Make sure no PI-waiters arrive (or leave) while we are |
4176 | * changing the priority of the task: | 4197 | * changing the priority of the task: |
4177 | * | 4198 | * |
4178 | * To be able to change p->policy safely, the appropriate | 4199 | * To be able to change p->policy safely, the appropriate |
4179 | * runqueue lock must be held. | 4200 | * runqueue lock must be held. |
4180 | */ | 4201 | */ |
4181 | rq = task_rq_lock(p, &rf); | 4202 | rq = task_rq_lock(p, &rf); |
4203 | update_rq_clock(rq); | ||
4182 | 4204 | ||
4183 | /* | 4205 | /* |
4184 | * Changing the policy of the stop threads is a very bad idea | 4206 | * Changing the policy of the stop threads is a very bad idea: |
4185 | */ | 4207 | */ |
4186 | if (p == rq->stop) { | 4208 | if (p == rq->stop) { |
4187 | task_rq_unlock(rq, p, &rf); | 4209 | task_rq_unlock(rq, p, &rf); |
@@ -4237,7 +4259,7 @@ change: | |||
4237 | #endif | 4259 | #endif |
4238 | } | 4260 | } |
4239 | 4261 | ||
4240 | /* recheck policy now with rq lock held */ | 4262 | /* Re-check policy now with rq lock held: */ |
4241 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 4263 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
4242 | policy = oldpolicy = -1; | 4264 | policy = oldpolicy = -1; |
4243 | task_rq_unlock(rq, p, &rf); | 4265 | task_rq_unlock(rq, p, &rf); |
@@ -4294,15 +4316,15 @@ change: | |||
4294 | set_curr_task(rq, p); | 4316 | set_curr_task(rq, p); |
4295 | 4317 | ||
4296 | check_class_changed(rq, p, prev_class, oldprio); | 4318 | check_class_changed(rq, p, prev_class, oldprio); |
4297 | preempt_disable(); /* avoid rq from going away on us */ | 4319 | |
4320 | /* Avoid rq from going away on us: */ | ||
4321 | preempt_disable(); | ||
4298 | task_rq_unlock(rq, p, &rf); | 4322 | task_rq_unlock(rq, p, &rf); |
4299 | 4323 | ||
4300 | if (pi) | 4324 | if (pi) |
4301 | rt_mutex_adjust_pi(p); | 4325 | rt_mutex_adjust_pi(p); |
4302 | 4326 | ||
4303 | /* | 4327 | /* Run balance callbacks after we've adjusted the PI chain: */ |
4304 | * Run balance callbacks after we've adjusted the PI chain. | ||
4305 | */ | ||
4306 | balance_callback(rq); | 4328 | balance_callback(rq); |
4307 | preempt_enable(); | 4329 | preempt_enable(); |
4308 | 4330 | ||
@@ -4395,8 +4417,7 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | |||
4395 | /* | 4417 | /* |
4396 | * Mimics kernel/events/core.c perf_copy_attr(). | 4418 | * Mimics kernel/events/core.c perf_copy_attr(). |
4397 | */ | 4419 | */ |
4398 | static int sched_copy_attr(struct sched_attr __user *uattr, | 4420 | static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) |
4399 | struct sched_attr *attr) | ||
4400 | { | 4421 | { |
4401 | u32 size; | 4422 | u32 size; |
4402 | int ret; | 4423 | int ret; |
@@ -4404,19 +4425,19 @@ static int sched_copy_attr(struct sched_attr __user *uattr, | |||
4404 | if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) | 4425 | if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) |
4405 | return -EFAULT; | 4426 | return -EFAULT; |
4406 | 4427 | ||
4407 | /* | 4428 | /* Zero the full structure, so that a short copy will be nice: */ |
4408 | * zero the full structure, so that a short copy will be nice. | ||
4409 | */ | ||
4410 | memset(attr, 0, sizeof(*attr)); | 4429 | memset(attr, 0, sizeof(*attr)); |
4411 | 4430 | ||
4412 | ret = get_user(size, &uattr->size); | 4431 | ret = get_user(size, &uattr->size); |
4413 | if (ret) | 4432 | if (ret) |
4414 | return ret; | 4433 | return ret; |
4415 | 4434 | ||
4416 | if (size > PAGE_SIZE) /* silly large */ | 4435 | /* Bail out on silly large: */ |
4436 | if (size > PAGE_SIZE) | ||
4417 | goto err_size; | 4437 | goto err_size; |
4418 | 4438 | ||
4419 | if (!size) /* abi compat */ | 4439 | /* ABI compatibility quirk: */ |
4440 | if (!size) | ||
4420 | size = SCHED_ATTR_SIZE_VER0; | 4441 | size = SCHED_ATTR_SIZE_VER0; |
4421 | 4442 | ||
4422 | if (size < SCHED_ATTR_SIZE_VER0) | 4443 | if (size < SCHED_ATTR_SIZE_VER0) |
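sched_copy_attr() above accepts both older (smaller) and newer (larger) struct sched_attr layouts by reading the user-supplied size field first, just as perf_copy_attr() does. The following user-space sketch shows the caller side; since glibc historically provides no sched_setattr() wrapper, the syscall is invoked directly and the structure layout is written out by hand to mirror the kernel ABI. Treat the hand-written struct and the availability of SYS_sched_setattr as assumptions of this example, not something introduced by the patch:

	/* Build: cc -o setattr_nice setattr_nice.c */
	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* Hand-written mirror of the sched_attr ABI; the authoritative definition lives in the kernel UAPI headers. */
	struct sched_attr {
		uint32_t size;		/* tells the kernel which struct version is passed */
		uint32_t sched_policy;
		uint64_t sched_flags;
		int32_t  sched_nice;	/* SCHED_NORMAL / SCHED_BATCH */
		uint32_t sched_priority;/* SCHED_FIFO / SCHED_RR */
		uint64_t sched_runtime;	/* SCHED_DEADLINE */
		uint64_t sched_deadline;
		uint64_t sched_period;
	};

	int main(void)
	{
		struct sched_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);	/* the kernel trims or zero-extends based on this */
		attr.sched_policy = SCHED_OTHER;
		attr.sched_nice = 5;

		if (syscall(SYS_sched_setattr, 0, &attr, 0) != 0) {
			perror("sched_setattr");
			return 1;
		}
		return 0;
	}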
@@ -4451,7 +4472,7 @@ static int sched_copy_attr(struct sched_attr __user *uattr, | |||
4451 | return -EFAULT; | 4472 | return -EFAULT; |
4452 | 4473 | ||
4453 | /* | 4474 | /* |
4454 | * XXX: do we want to be lenient like existing syscalls; or do we want | 4475 | * XXX: Do we want to be lenient like existing syscalls; or do we want |
4455 | * to be strict and return an error on out-of-bounds values? | 4476 | * to be strict and return an error on out-of-bounds values? |
4456 | */ | 4477 | */ |
4457 | attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); | 4478 | attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); |
@@ -4471,10 +4492,8 @@ err_size: | |||
4471 | * | 4492 | * |
4472 | * Return: 0 on success. An error code otherwise. | 4493 | * Return: 0 on success. An error code otherwise. |
4473 | */ | 4494 | */ |
4474 | SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, | 4495 | SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) |
4475 | struct sched_param __user *, param) | ||
4476 | { | 4496 | { |
4477 | /* negative values for policy are not valid */ | ||
4478 | if (policy < 0) | 4497 | if (policy < 0) |
4479 | return -EINVAL; | 4498 | return -EINVAL; |
4480 | 4499 | ||
@@ -4784,10 +4803,10 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, | |||
4784 | } | 4803 | } |
4785 | 4804 | ||
4786 | /** | 4805 | /** |
4787 | * sys_sched_setaffinity - set the cpu affinity of a process | 4806 | * sys_sched_setaffinity - set the CPU affinity of a process |
4788 | * @pid: pid of the process | 4807 | * @pid: pid of the process |
4789 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | 4808 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr |
4790 | * @user_mask_ptr: user-space pointer to the new cpu mask | 4809 | * @user_mask_ptr: user-space pointer to the new CPU mask |
4791 | * | 4810 | * |
4792 | * Return: 0 on success. An error code otherwise. | 4811 | * Return: 0 on success. An error code otherwise. |
4793 | */ | 4812 | */ |
@@ -4835,10 +4854,10 @@ out_unlock: | |||
4835 | } | 4854 | } |
4836 | 4855 | ||
4837 | /** | 4856 | /** |
4838 | * sys_sched_getaffinity - get the cpu affinity of a process | 4857 | * sys_sched_getaffinity - get the CPU affinity of a process |
4839 | * @pid: pid of the process | 4858 | * @pid: pid of the process |
4840 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | 4859 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr |
4841 | * @user_mask_ptr: user-space pointer to hold the current cpu mask | 4860 | * @user_mask_ptr: user-space pointer to hold the current CPU mask |
4842 | * | 4861 | * |
4843 | * Return: size of CPU mask copied to user_mask_ptr on success. An | 4862 | * Return: size of CPU mask copied to user_mask_ptr on success. An |
4844 | * error code otherwise. | 4863 | * error code otherwise. |
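For reference, the user-space side of these two affinity syscalls passes a fixed-size cpu_set_t bitmap. A minimal example (not part of the patch) that pins the calling process to CPU 0 and reads the mask back:

	/* Build: cc -o pin0 pin0.c */
	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		cpu_set_t set;

		CPU_ZERO(&set);
		CPU_SET(0, &set);
		if (sched_setaffinity(0, sizeof(set), &set) != 0) {
			perror("sched_setaffinity");
			return 1;
		}

		CPU_ZERO(&set);
		if (sched_getaffinity(0, sizeof(set), &set) != 0) {
			perror("sched_getaffinity");
			return 1;
		}
		printf("CPU 0 in mask: %s\n", CPU_ISSET(0, &set) ? "yes" : "no");
		return 0;
	}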
@@ -4966,7 +4985,7 @@ EXPORT_SYMBOL(__cond_resched_softirq); | |||
4966 | * Typical broken usage is: | 4985 | * Typical broken usage is: |
4967 | * | 4986 | * |
4968 | * while (!event) | 4987 | * while (!event) |
4969 | * yield(); | 4988 | * yield(); |
4970 | * | 4989 | * |
4971 | * where one assumes that yield() will let 'the other' process run that will | 4990 | * where one assumes that yield() will let 'the other' process run that will |
4972 | * make event true. If the current task is a SCHED_FIFO task that will never | 4991 | * make event true. If the current task is a SCHED_FIFO task that will never |
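The comment above warns against polling with yield(): a SCHED_FIFO poller can spin forever because nothing lower-priority ever runs to make the event true. The sketch below (illustrative only, compile with -pthread) contrasts the broken form with blocking on a condition variable:

	/* Build: cc -pthread -o waiter waiter.c */
	#include <pthread.h>
	#include <sched.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <unistd.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
	static bool event;

	/* The broken pattern from the comment: burns CPU, can livelock under SCHED_FIFO. */
	static void wait_broken(void)
	{
		while (!event)
			sched_yield();
	}

	static void *producer(void *arg)
	{
		sleep(1);
		pthread_mutex_lock(&lock);
		event = true;
		pthread_cond_signal(&cond);
		pthread_mutex_unlock(&lock);
		return arg;
	}

	int main(void)
	{
		pthread_t t;

		pthread_create(&t, NULL, producer, NULL);

		/* Preferred: block until signalled instead of calling wait_broken(). */
		pthread_mutex_lock(&lock);
		while (!event)
			pthread_cond_wait(&cond, &lock);
		pthread_mutex_unlock(&lock);

		pthread_join(t, NULL);
		puts("event observed");
		(void)wait_broken;	/* kept only to show the anti-pattern */
		return 0;
	}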
@@ -5057,31 +5076,48 @@ out_irq: | |||
5057 | } | 5076 | } |
5058 | EXPORT_SYMBOL_GPL(yield_to); | 5077 | EXPORT_SYMBOL_GPL(yield_to); |
5059 | 5078 | ||
5079 | int io_schedule_prepare(void) | ||
5080 | { | ||
5081 | int old_iowait = current->in_iowait; | ||
5082 | |||
5083 | current->in_iowait = 1; | ||
5084 | blk_schedule_flush_plug(current); | ||
5085 | |||
5086 | return old_iowait; | ||
5087 | } | ||
5088 | |||
5089 | void io_schedule_finish(int token) | ||
5090 | { | ||
5091 | current->in_iowait = token; | ||
5092 | } | ||
5093 | |||
5060 | /* | 5094 | /* |
5061 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so | 5095 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so |
5062 | * that process accounting knows that this is a task in IO wait state. | 5096 | * that process accounting knows that this is a task in IO wait state. |
5063 | */ | 5097 | */ |
5064 | long __sched io_schedule_timeout(long timeout) | 5098 | long __sched io_schedule_timeout(long timeout) |
5065 | { | 5099 | { |
5066 | int old_iowait = current->in_iowait; | 5100 | int token; |
5067 | struct rq *rq; | ||
5068 | long ret; | 5101 | long ret; |
5069 | 5102 | ||
5070 | current->in_iowait = 1; | 5103 | token = io_schedule_prepare(); |
5071 | blk_schedule_flush_plug(current); | ||
5072 | |||
5073 | delayacct_blkio_start(); | ||
5074 | rq = raw_rq(); | ||
5075 | atomic_inc(&rq->nr_iowait); | ||
5076 | ret = schedule_timeout(timeout); | 5104 | ret = schedule_timeout(timeout); |
5077 | current->in_iowait = old_iowait; | 5105 | io_schedule_finish(token); |
5078 | atomic_dec(&rq->nr_iowait); | ||
5079 | delayacct_blkio_end(); | ||
5080 | 5106 | ||
5081 | return ret; | 5107 | return ret; |
5082 | } | 5108 | } |
5083 | EXPORT_SYMBOL(io_schedule_timeout); | 5109 | EXPORT_SYMBOL(io_schedule_timeout); |
5084 | 5110 | ||
5111 | void io_schedule(void) | ||
5112 | { | ||
5113 | int token; | ||
5114 | |||
5115 | token = io_schedule_prepare(); | ||
5116 | schedule(); | ||
5117 | io_schedule_finish(token); | ||
5118 | } | ||
5119 | EXPORT_SYMBOL(io_schedule); | ||
5120 | |||
5085 | /** | 5121 | /** |
5086 | * sys_sched_get_priority_max - return maximum RT priority. | 5122 | * sys_sched_get_priority_max - return maximum RT priority. |
5087 | * @policy: scheduling class. | 5123 | * @policy: scheduling class. |
@@ -5264,7 +5300,7 @@ void init_idle_bootup_task(struct task_struct *idle) | |||
5264 | /** | 5300 | /** |
5265 | * init_idle - set up an idle thread for a given CPU | 5301 | * init_idle - set up an idle thread for a given CPU |
5266 | * @idle: task in question | 5302 | * @idle: task in question |
5267 | * @cpu: cpu the idle task belongs to | 5303 | * @cpu: CPU the idle task belongs to |
5268 | * | 5304 | * |
5269 | * NOTE: this function does not set the idle thread's NEED_RESCHED | 5305 | * NOTE: this function does not set the idle thread's NEED_RESCHED |
5270 | * flag, to make booting more robust. | 5306 | * flag, to make booting more robust. |
@@ -5295,7 +5331,7 @@ void init_idle(struct task_struct *idle, int cpu) | |||
5295 | #endif | 5331 | #endif |
5296 | /* | 5332 | /* |
5297 | * We're having a chicken and egg problem, even though we are | 5333 | * We're having a chicken and egg problem, even though we are |
5298 | * holding rq->lock, the cpu isn't yet set to this cpu so the | 5334 | * holding rq->lock, the CPU isn't yet set to this CPU so the |
5299 | * lockdep check in task_group() will fail. | 5335 | * lockdep check in task_group() will fail. |
5300 | * | 5336 | * |
5301 | * Similar case to sched_fork(). / Alternatively we could | 5337 | * Similar case to sched_fork(). / Alternatively we could |
@@ -5360,7 +5396,7 @@ int task_can_attach(struct task_struct *p, | |||
5360 | 5396 | ||
5361 | /* | 5397 | /* |
5362 | * Kthreads which disallow setaffinity shouldn't be moved | 5398 | * Kthreads which disallow setaffinity shouldn't be moved |
5363 | * to a new cpuset; we don't want to change their cpu | 5399 | * to a new cpuset; we don't want to change their CPU |
5364 | * affinity and isolating such threads by their set of | 5400 | * affinity and isolating such threads by their set of |
5365 | * allowed nodes is unnecessary. Thus, cpusets are not | 5401 | * allowed nodes is unnecessary. Thus, cpusets are not |
5366 | * applicable for such threads. This prevents checking for | 5402 | * applicable for such threads. This prevents checking for |
@@ -5409,7 +5445,7 @@ out: | |||
5409 | 5445 | ||
5410 | #ifdef CONFIG_SMP | 5446 | #ifdef CONFIG_SMP |
5411 | 5447 | ||
5412 | static bool sched_smp_initialized __read_mostly; | 5448 | bool sched_smp_initialized __read_mostly; |
5413 | 5449 | ||
5414 | #ifdef CONFIG_NUMA_BALANCING | 5450 | #ifdef CONFIG_NUMA_BALANCING |
5415 | /* Migrate current task p to target_cpu */ | 5451 | /* Migrate current task p to target_cpu */ |
@@ -5461,7 +5497,7 @@ void sched_setnuma(struct task_struct *p, int nid) | |||
5461 | 5497 | ||
5462 | #ifdef CONFIG_HOTPLUG_CPU | 5498 | #ifdef CONFIG_HOTPLUG_CPU |
5463 | /* | 5499 | /* |
5464 | * Ensures that the idle task is using init_mm right before its cpu goes | 5500 | * Ensure that the idle task is using init_mm right before its CPU goes |
5465 | * offline. | 5501 | * offline. |
5466 | */ | 5502 | */ |
5467 | void idle_task_exit(void) | 5503 | void idle_task_exit(void) |
@@ -5521,7 +5557,7 @@ static void migrate_tasks(struct rq *dead_rq) | |||
5521 | { | 5557 | { |
5522 | struct rq *rq = dead_rq; | 5558 | struct rq *rq = dead_rq; |
5523 | struct task_struct *next, *stop = rq->stop; | 5559 | struct task_struct *next, *stop = rq->stop; |
5524 | struct pin_cookie cookie; | 5560 | struct rq_flags rf, old_rf; |
5525 | int dest_cpu; | 5561 | int dest_cpu; |
5526 | 5562 | ||
5527 | /* | 5563 | /* |
@@ -5545,16 +5581,16 @@ static void migrate_tasks(struct rq *dead_rq) | |||
5545 | for (;;) { | 5581 | for (;;) { |
5546 | /* | 5582 | /* |
5547 | * There's this thread running, bail when that's the only | 5583 | * There's this thread running, bail when that's the only |
5548 | * remaining thread. | 5584 | * remaining thread: |
5549 | */ | 5585 | */ |
5550 | if (rq->nr_running == 1) | 5586 | if (rq->nr_running == 1) |
5551 | break; | 5587 | break; |
5552 | 5588 | ||
5553 | /* | 5589 | /* |
5554 | * pick_next_task assumes pinned rq->lock. | 5590 | * pick_next_task() assumes pinned rq->lock: |
5555 | */ | 5591 | */ |
5556 | cookie = lockdep_pin_lock(&rq->lock); | 5592 | rq_pin_lock(rq, &rf); |
5557 | next = pick_next_task(rq, &fake_task, cookie); | 5593 | next = pick_next_task(rq, &fake_task, &rf); |
5558 | BUG_ON(!next); | 5594 | BUG_ON(!next); |
5559 | next->sched_class->put_prev_task(rq, next); | 5595 | next->sched_class->put_prev_task(rq, next); |
5560 | 5596 | ||
@@ -5567,7 +5603,7 @@ static void migrate_tasks(struct rq *dead_rq) | |||
5567 | * because !cpu_active at this point, which means load-balance | 5603 | * because !cpu_active at this point, which means load-balance |
5568 | * will not interfere. Also, stop-machine. | 5604 | * will not interfere. Also, stop-machine. |
5569 | */ | 5605 | */ |
5570 | lockdep_unpin_lock(&rq->lock, cookie); | 5606 | rq_unpin_lock(rq, &rf); |
5571 | raw_spin_unlock(&rq->lock); | 5607 | raw_spin_unlock(&rq->lock); |
5572 | raw_spin_lock(&next->pi_lock); | 5608 | raw_spin_lock(&next->pi_lock); |
5573 | raw_spin_lock(&rq->lock); | 5609 | raw_spin_lock(&rq->lock); |
@@ -5582,6 +5618,13 @@ static void migrate_tasks(struct rq *dead_rq) | |||
5582 | continue; | 5618 | continue; |
5583 | } | 5619 | } |
5584 | 5620 | ||
5621 | /* | ||
5622 | * __migrate_task() may return with a different | ||
5623 | * rq->lock held and a new cookie in 'rf', but we need | ||
5624 | * to preserve rf::clock_update_flags for 'dead_rq'. | ||
5625 | */ | ||
5626 | old_rf = rf; | ||
5627 | |||
5585 | /* Find suitable destination for @next, with force if needed. */ | 5628 | /* Find suitable destination for @next, with force if needed. */ |
5586 | dest_cpu = select_fallback_rq(dead_rq->cpu, next); | 5629 | dest_cpu = select_fallback_rq(dead_rq->cpu, next); |
5587 | 5630 | ||
@@ -5590,6 +5633,7 @@ static void migrate_tasks(struct rq *dead_rq) | |||
5590 | raw_spin_unlock(&rq->lock); | 5633 | raw_spin_unlock(&rq->lock); |
5591 | rq = dead_rq; | 5634 | rq = dead_rq; |
5592 | raw_spin_lock(&rq->lock); | 5635 | raw_spin_lock(&rq->lock); |
5636 | rf = old_rf; | ||
5593 | } | 5637 | } |
5594 | raw_spin_unlock(&next->pi_lock); | 5638 | raw_spin_unlock(&next->pi_lock); |
5595 | } | 5639 | } |
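migrate_tasks() now carries its lockdep pin state in struct rq_flags instead of a bare pin_cookie, and saves/restores it around __migrate_task() so the dying runqueue's clock-update flags survive a repin. A generic, hedged sketch of that "save the cookie before a call that may replace it, restore it afterwards" shape; every name below is illustrative, not the kernel's:

	#include <stdio.h>

	/* Illustrative only: stands in for the pin state kept in struct rq_flags. */
	struct pin_state {
		unsigned int clock_update_flags;
	};

	/* Stands in for __migrate_task(): may come back with a brand new pin state. */
	static void call_that_repins(struct pin_state *rf)
	{
		rf->clock_update_flags = 0;
	}

	int main(void)
	{
		struct pin_state rf = { .clock_update_flags = 0x3 };
		struct pin_state old_rf;

		old_rf = rf;		/* old_rf = rf, as in migrate_tasks() */
		call_that_repins(&rf);
		rf = old_rf;		/* restore the saved state for the dying rq */

		printf("clock_update_flags preserved: 0x%x\n", rf.clock_update_flags);
		return 0;
	}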
@@ -5598,7 +5642,7 @@ static void migrate_tasks(struct rq *dead_rq) | |||
5598 | } | 5642 | } |
5599 | #endif /* CONFIG_HOTPLUG_CPU */ | 5643 | #endif /* CONFIG_HOTPLUG_CPU */ |
5600 | 5644 | ||
5601 | static void set_rq_online(struct rq *rq) | 5645 | void set_rq_online(struct rq *rq) |
5602 | { | 5646 | { |
5603 | if (!rq->online) { | 5647 | if (!rq->online) { |
5604 | const struct sched_class *class; | 5648 | const struct sched_class *class; |
@@ -5613,7 +5657,7 @@ static void set_rq_online(struct rq *rq) | |||
5613 | } | 5657 | } |
5614 | } | 5658 | } |
5615 | 5659 | ||
5616 | static void set_rq_offline(struct rq *rq) | 5660 | void set_rq_offline(struct rq *rq) |
5617 | { | 5661 | { |
5618 | if (rq->online) { | 5662 | if (rq->online) { |
5619 | const struct sched_class *class; | 5663 | const struct sched_class *class; |
@@ -5635,1647 +5679,10 @@ static void set_cpu_rq_start_time(unsigned int cpu) | |||
5635 | rq->age_stamp = sched_clock_cpu(cpu); | 5679 | rq->age_stamp = sched_clock_cpu(cpu); |
5636 | } | 5680 | } |
5637 | 5681 | ||
5638 | static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ | ||
5639 | |||
5640 | #ifdef CONFIG_SCHED_DEBUG | ||
5641 | |||
5642 | static __read_mostly int sched_debug_enabled; | ||
5643 | |||
5644 | static int __init sched_debug_setup(char *str) | ||
5645 | { | ||
5646 | sched_debug_enabled = 1; | ||
5647 | |||
5648 | return 0; | ||
5649 | } | ||
5650 | early_param("sched_debug", sched_debug_setup); | ||
5651 | |||
5652 | static inline bool sched_debug(void) | ||
5653 | { | ||
5654 | return sched_debug_enabled; | ||
5655 | } | ||
5656 | |||
5657 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | ||
5658 | struct cpumask *groupmask) | ||
5659 | { | ||
5660 | struct sched_group *group = sd->groups; | ||
5661 | |||
5662 | cpumask_clear(groupmask); | ||
5663 | |||
5664 | printk(KERN_DEBUG "%*s domain %d: ", level, "", level); | ||
5665 | |||
5666 | if (!(sd->flags & SD_LOAD_BALANCE)) { | ||
5667 | printk("does not load-balance\n"); | ||
5668 | if (sd->parent) | ||
5669 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" | ||
5670 | " has parent"); | ||
5671 | return -1; | ||
5672 | } | ||
5673 | |||
5674 | printk(KERN_CONT "span %*pbl level %s\n", | ||
5675 | cpumask_pr_args(sched_domain_span(sd)), sd->name); | ||
5676 | |||
5677 | if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
5678 | printk(KERN_ERR "ERROR: domain->span does not contain " | ||
5679 | "CPU%d\n", cpu); | ||
5680 | } | ||
5681 | if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { | ||
5682 | printk(KERN_ERR "ERROR: domain->groups does not contain" | ||
5683 | " CPU%d\n", cpu); | ||
5684 | } | ||
5685 | |||
5686 | printk(KERN_DEBUG "%*s groups:", level + 1, ""); | ||
5687 | do { | ||
5688 | if (!group) { | ||
5689 | printk("\n"); | ||
5690 | printk(KERN_ERR "ERROR: group is NULL\n"); | ||
5691 | break; | ||
5692 | } | ||
5693 | |||
5694 | if (!cpumask_weight(sched_group_cpus(group))) { | ||
5695 | printk(KERN_CONT "\n"); | ||
5696 | printk(KERN_ERR "ERROR: empty group\n"); | ||
5697 | break; | ||
5698 | } | ||
5699 | |||
5700 | if (!(sd->flags & SD_OVERLAP) && | ||
5701 | cpumask_intersects(groupmask, sched_group_cpus(group))) { | ||
5702 | printk(KERN_CONT "\n"); | ||
5703 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | ||
5704 | break; | ||
5705 | } | ||
5706 | |||
5707 | cpumask_or(groupmask, groupmask, sched_group_cpus(group)); | ||
5708 | |||
5709 | printk(KERN_CONT " %*pbl", | ||
5710 | cpumask_pr_args(sched_group_cpus(group))); | ||
5711 | if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { | ||
5712 | printk(KERN_CONT " (cpu_capacity = %lu)", | ||
5713 | group->sgc->capacity); | ||
5714 | } | ||
5715 | |||
5716 | group = group->next; | ||
5717 | } while (group != sd->groups); | ||
5718 | printk(KERN_CONT "\n"); | ||
5719 | |||
5720 | if (!cpumask_equal(sched_domain_span(sd), groupmask)) | ||
5721 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); | ||
5722 | |||
5723 | if (sd->parent && | ||
5724 | !cpumask_subset(groupmask, sched_domain_span(sd->parent))) | ||
5725 | printk(KERN_ERR "ERROR: parent span is not a superset " | ||
5726 | "of domain->span\n"); | ||
5727 | return 0; | ||
5728 | } | ||
5729 | |||
5730 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | ||
5731 | { | ||
5732 | int level = 0; | ||
5733 | |||
5734 | if (!sched_debug_enabled) | ||
5735 | return; | ||
5736 | |||
5737 | if (!sd) { | ||
5738 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); | ||
5739 | return; | ||
5740 | } | ||
5741 | |||
5742 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | ||
5743 | |||
5744 | for (;;) { | ||
5745 | if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) | ||
5746 | break; | ||
5747 | level++; | ||
5748 | sd = sd->parent; | ||
5749 | if (!sd) | ||
5750 | break; | ||
5751 | } | ||
5752 | } | ||
5753 | #else /* !CONFIG_SCHED_DEBUG */ | ||
5754 | |||
5755 | # define sched_debug_enabled 0 | ||
5756 | # define sched_domain_debug(sd, cpu) do { } while (0) | ||
5757 | static inline bool sched_debug(void) | ||
5758 | { | ||
5759 | return false; | ||
5760 | } | ||
5761 | #endif /* CONFIG_SCHED_DEBUG */ | ||
5762 | |||
5763 | static int sd_degenerate(struct sched_domain *sd) | ||
5764 | { | ||
5765 | if (cpumask_weight(sched_domain_span(sd)) == 1) | ||
5766 | return 1; | ||
5767 | |||
5768 | /* Following flags need at least 2 groups */ | ||
5769 | if (sd->flags & (SD_LOAD_BALANCE | | ||
5770 | SD_BALANCE_NEWIDLE | | ||
5771 | SD_BALANCE_FORK | | ||
5772 | SD_BALANCE_EXEC | | ||
5773 | SD_SHARE_CPUCAPACITY | | ||
5774 | SD_ASYM_CPUCAPACITY | | ||
5775 | SD_SHARE_PKG_RESOURCES | | ||
5776 | SD_SHARE_POWERDOMAIN)) { | ||
5777 | if (sd->groups != sd->groups->next) | ||
5778 | return 0; | ||
5779 | } | ||
5780 | |||
5781 | /* Following flags don't use groups */ | ||
5782 | if (sd->flags & (SD_WAKE_AFFINE)) | ||
5783 | return 0; | ||
5784 | |||
5785 | return 1; | ||
5786 | } | ||
5787 | |||
5788 | static int | ||
5789 | sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | ||
5790 | { | ||
5791 | unsigned long cflags = sd->flags, pflags = parent->flags; | ||
5792 | |||
5793 | if (sd_degenerate(parent)) | ||
5794 | return 1; | ||
5795 | |||
5796 | if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) | ||
5797 | return 0; | ||
5798 | |||
5799 | /* Flags needing groups don't count if only 1 group in parent */ | ||
5800 | if (parent->groups == parent->groups->next) { | ||
5801 | pflags &= ~(SD_LOAD_BALANCE | | ||
5802 | SD_BALANCE_NEWIDLE | | ||
5803 | SD_BALANCE_FORK | | ||
5804 | SD_BALANCE_EXEC | | ||
5805 | SD_ASYM_CPUCAPACITY | | ||
5806 | SD_SHARE_CPUCAPACITY | | ||
5807 | SD_SHARE_PKG_RESOURCES | | ||
5808 | SD_PREFER_SIBLING | | ||
5809 | SD_SHARE_POWERDOMAIN); | ||
5810 | if (nr_node_ids == 1) | ||
5811 | pflags &= ~SD_SERIALIZE; | ||
5812 | } | ||
5813 | if (~cflags & pflags) | ||
5814 | return 0; | ||
5815 | |||
5816 | return 1; | ||
5817 | } | ||
5818 | |||
5819 | static void free_rootdomain(struct rcu_head *rcu) | ||
5820 | { | ||
5821 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); | ||
5822 | |||
5823 | cpupri_cleanup(&rd->cpupri); | ||
5824 | cpudl_cleanup(&rd->cpudl); | ||
5825 | free_cpumask_var(rd->dlo_mask); | ||
5826 | free_cpumask_var(rd->rto_mask); | ||
5827 | free_cpumask_var(rd->online); | ||
5828 | free_cpumask_var(rd->span); | ||
5829 | kfree(rd); | ||
5830 | } | ||
5831 | |||
5832 | static void rq_attach_root(struct rq *rq, struct root_domain *rd) | ||
5833 | { | ||
5834 | struct root_domain *old_rd = NULL; | ||
5835 | unsigned long flags; | ||
5836 | |||
5837 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
5838 | |||
5839 | if (rq->rd) { | ||
5840 | old_rd = rq->rd; | ||
5841 | |||
5842 | if (cpumask_test_cpu(rq->cpu, old_rd->online)) | ||
5843 | set_rq_offline(rq); | ||
5844 | |||
5845 | cpumask_clear_cpu(rq->cpu, old_rd->span); | ||
5846 | |||
5847 | /* | ||
5848 | * If we dont want to free the old_rd yet then | ||
5849 | * set old_rd to NULL to skip the freeing later | ||
5850 | * in this function: | ||
5851 | */ | ||
5852 | if (!atomic_dec_and_test(&old_rd->refcount)) | ||
5853 | old_rd = NULL; | ||
5854 | } | ||
5855 | |||
5856 | atomic_inc(&rd->refcount); | ||
5857 | rq->rd = rd; | ||
5858 | |||
5859 | cpumask_set_cpu(rq->cpu, rd->span); | ||
5860 | if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) | ||
5861 | set_rq_online(rq); | ||
5862 | |||
5863 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
5864 | |||
5865 | if (old_rd) | ||
5866 | call_rcu_sched(&old_rd->rcu, free_rootdomain); | ||
5867 | } | ||
5868 | |||
5869 | static int init_rootdomain(struct root_domain *rd) | ||
5870 | { | ||
5871 | memset(rd, 0, sizeof(*rd)); | ||
5872 | |||
5873 | if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) | ||
5874 | goto out; | ||
5875 | if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) | ||
5876 | goto free_span; | ||
5877 | if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) | ||
5878 | goto free_online; | ||
5879 | if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) | ||
5880 | goto free_dlo_mask; | ||
5881 | |||
5882 | init_dl_bw(&rd->dl_bw); | ||
5883 | if (cpudl_init(&rd->cpudl) != 0) | ||
5884 | goto free_dlo_mask; | ||
5885 | |||
5886 | if (cpupri_init(&rd->cpupri) != 0) | ||
5887 | goto free_rto_mask; | ||
5888 | return 0; | ||
5889 | |||
5890 | free_rto_mask: | ||
5891 | free_cpumask_var(rd->rto_mask); | ||
5892 | free_dlo_mask: | ||
5893 | free_cpumask_var(rd->dlo_mask); | ||
5894 | free_online: | ||
5895 | free_cpumask_var(rd->online); | ||
5896 | free_span: | ||
5897 | free_cpumask_var(rd->span); | ||
5898 | out: | ||
5899 | return -ENOMEM; | ||
5900 | } | ||
5901 | |||
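init_rootdomain() above uses the usual kernel error-unwind ladder: each allocation that fails jumps to a label that frees everything allocated so far, in reverse order. A small stand-alone example of the same idiom with invented names:

	#include <stdlib.h>

	struct three_buffers {
		char *a, *b, *c;
	};

	/* Returns 0 on success, -1 with nothing leaked on failure. */
	int three_buffers_init(struct three_buffers *t, size_t n)
	{
		t->a = malloc(n);
		if (!t->a)
			goto out;
		t->b = malloc(n);
		if (!t->b)
			goto free_a;
		t->c = malloc(n);
		if (!t->c)
			goto free_b;
		return 0;

	free_b:
		free(t->b);
	free_a:
		free(t->a);
	out:
		return -1;
	}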
5902 | /* | ||
5903 | * By default the system creates a single root-domain with all cpus as | ||
5904 | * members (mimicking the global state we have today). | ||
5905 | */ | ||
5906 | struct root_domain def_root_domain; | ||
5907 | |||
5908 | static void init_defrootdomain(void) | ||
5909 | { | ||
5910 | init_rootdomain(&def_root_domain); | ||
5911 | |||
5912 | atomic_set(&def_root_domain.refcount, 1); | ||
5913 | } | ||
5914 | |||
5915 | static struct root_domain *alloc_rootdomain(void) | ||
5916 | { | ||
5917 | struct root_domain *rd; | ||
5918 | |||
5919 | rd = kmalloc(sizeof(*rd), GFP_KERNEL); | ||
5920 | if (!rd) | ||
5921 | return NULL; | ||
5922 | |||
5923 | if (init_rootdomain(rd) != 0) { | ||
5924 | kfree(rd); | ||
5925 | return NULL; | ||
5926 | } | ||
5927 | |||
5928 | return rd; | ||
5929 | } | ||
5930 | |||
5931 | static void free_sched_groups(struct sched_group *sg, int free_sgc) | ||
5932 | { | ||
5933 | struct sched_group *tmp, *first; | ||
5934 | |||
5935 | if (!sg) | ||
5936 | return; | ||
5937 | |||
5938 | first = sg; | ||
5939 | do { | ||
5940 | tmp = sg->next; | ||
5941 | |||
5942 | if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) | ||
5943 | kfree(sg->sgc); | ||
5944 | |||
5945 | kfree(sg); | ||
5946 | sg = tmp; | ||
5947 | } while (sg != first); | ||
5948 | } | ||
5949 | |||
5950 | static void destroy_sched_domain(struct sched_domain *sd) | ||
5951 | { | ||
5952 | /* | ||
5953 | * If its an overlapping domain it has private groups, iterate and | ||
5954 | * nuke them all. | ||
5955 | */ | ||
5956 | if (sd->flags & SD_OVERLAP) { | ||
5957 | free_sched_groups(sd->groups, 1); | ||
5958 | } else if (atomic_dec_and_test(&sd->groups->ref)) { | ||
5959 | kfree(sd->groups->sgc); | ||
5960 | kfree(sd->groups); | ||
5961 | } | ||
5962 | if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) | ||
5963 | kfree(sd->shared); | ||
5964 | kfree(sd); | ||
5965 | } | ||
5966 | |||
5967 | static void destroy_sched_domains_rcu(struct rcu_head *rcu) | ||
5968 | { | ||
5969 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); | ||
5970 | |||
5971 | while (sd) { | ||
5972 | struct sched_domain *parent = sd->parent; | ||
5973 | destroy_sched_domain(sd); | ||
5974 | sd = parent; | ||
5975 | } | ||
5976 | } | ||
5977 | |||
5978 | static void destroy_sched_domains(struct sched_domain *sd) | ||
5979 | { | ||
5980 | if (sd) | ||
5981 | call_rcu(&sd->rcu, destroy_sched_domains_rcu); | ||
5982 | } | ||
5983 | |||
5984 | /* | ||
5985 | * Keep a special pointer to the highest sched_domain that has | ||
5986 | * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this | ||
5987 | * allows us to avoid some pointer chasing select_idle_sibling(). | ||
5988 | * | ||
5989 | * Also keep a unique ID per domain (we use the first cpu number in | ||
5990 | * the cpumask of the domain), this allows us to quickly tell if | ||
5991 | * two cpus are in the same cache domain, see cpus_share_cache(). | ||
5992 | */ | ||
5993 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); | ||
5994 | DEFINE_PER_CPU(int, sd_llc_size); | ||
5995 | DEFINE_PER_CPU(int, sd_llc_id); | ||
5996 | DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); | ||
5997 | DEFINE_PER_CPU(struct sched_domain *, sd_numa); | ||
5998 | DEFINE_PER_CPU(struct sched_domain *, sd_asym); | ||
5999 | |||
6000 | static void update_top_cache_domain(int cpu) | ||
6001 | { | ||
6002 | struct sched_domain_shared *sds = NULL; | ||
6003 | struct sched_domain *sd; | ||
6004 | int id = cpu; | ||
6005 | int size = 1; | ||
6006 | |||
6007 | sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); | ||
6008 | if (sd) { | ||
6009 | id = cpumask_first(sched_domain_span(sd)); | ||
6010 | size = cpumask_weight(sched_domain_span(sd)); | ||
6011 | sds = sd->shared; | ||
6012 | } | ||
6013 | |||
6014 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); | ||
6015 | per_cpu(sd_llc_size, cpu) = size; | ||
6016 | per_cpu(sd_llc_id, cpu) = id; | ||
6017 | rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); | ||
6018 | |||
6019 | sd = lowest_flag_domain(cpu, SD_NUMA); | ||
6020 | rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); | ||
6021 | |||
6022 | sd = highest_flag_domain(cpu, SD_ASYM_PACKING); | ||
6023 | rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); | ||
6024 | } | ||
6025 | |||
6026 | /* | ||
6027 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | ||
6028 | * hold the hotplug lock. | ||
6029 | */ | ||
6030 | static void | ||
6031 | cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | ||
6032 | { | ||
6033 | struct rq *rq = cpu_rq(cpu); | ||
6034 | struct sched_domain *tmp; | ||
6035 | |||
6036 | /* Remove the sched domains which do not contribute to scheduling. */ | ||
6037 | for (tmp = sd; tmp; ) { | ||
6038 | struct sched_domain *parent = tmp->parent; | ||
6039 | if (!parent) | ||
6040 | break; | ||
6041 | |||
6042 | if (sd_parent_degenerate(tmp, parent)) { | ||
6043 | tmp->parent = parent->parent; | ||
6044 | if (parent->parent) | ||
6045 | parent->parent->child = tmp; | ||
6046 | /* | ||
6047 | * Transfer SD_PREFER_SIBLING down in case of a | ||
6048 | * degenerate parent; the spans match for this | ||
6049 | * so the property transfers. | ||
6050 | */ | ||
6051 | if (parent->flags & SD_PREFER_SIBLING) | ||
6052 | tmp->flags |= SD_PREFER_SIBLING; | ||
6053 | destroy_sched_domain(parent); | ||
6054 | } else | ||
6055 | tmp = tmp->parent; | ||
6056 | } | ||
6057 | |||
6058 | if (sd && sd_degenerate(sd)) { | ||
6059 | tmp = sd; | ||
6060 | sd = sd->parent; | ||
6061 | destroy_sched_domain(tmp); | ||
6062 | if (sd) | ||
6063 | sd->child = NULL; | ||
6064 | } | ||
6065 | |||
6066 | sched_domain_debug(sd, cpu); | ||
6067 | |||
6068 | rq_attach_root(rq, rd); | ||
6069 | tmp = rq->sd; | ||
6070 | rcu_assign_pointer(rq->sd, sd); | ||
6071 | destroy_sched_domains(tmp); | ||
6072 | |||
6073 | update_top_cache_domain(cpu); | ||
6074 | } | ||
6075 | |||
6076 | /* Setup the mask of cpus configured for isolated domains */ | ||
6077 | static int __init isolated_cpu_setup(char *str) | ||
6078 | { | ||
6079 | int ret; | ||
6080 | |||
6081 | alloc_bootmem_cpumask_var(&cpu_isolated_map); | ||
6082 | ret = cpulist_parse(str, cpu_isolated_map); | ||
6083 | if (ret) { | ||
6084 | pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); | ||
6085 | return 0; | ||
6086 | } | ||
6087 | return 1; | ||
6088 | } | ||
6089 | __setup("isolcpus=", isolated_cpu_setup); | ||
6090 | |||
6091 | struct s_data { | ||
6092 | struct sched_domain ** __percpu sd; | ||
6093 | struct root_domain *rd; | ||
6094 | }; | ||
6095 | |||
6096 | enum s_alloc { | ||
6097 | sa_rootdomain, | ||
6098 | sa_sd, | ||
6099 | sa_sd_storage, | ||
6100 | sa_none, | ||
6101 | }; | ||
6102 | |||
6103 | /* | ||
6104 | * Build an iteration mask that can exclude certain CPUs from the upwards | ||
6105 | * domain traversal. | ||
6106 | * | ||
6107 | * Asymmetric node setups can result in situations where the domain tree is of | ||
6108 | * unequal depth, make sure to skip domains that already cover the entire | ||
6109 | * range. | ||
6110 | * | ||
6111 | * In that case build_sched_domains() will have terminated the iteration early | ||
6112 | * and our sibling sd spans will be empty. Domains should always include the | ||
6113 | * cpu they're built on, so check that. | ||
6114 | * | ||
6115 | */ | ||
6116 | static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) | ||
6117 | { | ||
6118 | const struct cpumask *span = sched_domain_span(sd); | ||
6119 | struct sd_data *sdd = sd->private; | ||
6120 | struct sched_domain *sibling; | ||
6121 | int i; | ||
6122 | |||
6123 | for_each_cpu(i, span) { | ||
6124 | sibling = *per_cpu_ptr(sdd->sd, i); | ||
6125 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) | ||
6126 | continue; | ||
6127 | |||
6128 | cpumask_set_cpu(i, sched_group_mask(sg)); | ||
6129 | } | ||
6130 | } | ||
6131 | |||
6132 | /* | ||
6133 | * Return the canonical balance cpu for this group, this is the first cpu | ||
6134 | * of this group that's also in the iteration mask. | ||
6135 | */ | ||
6136 | int group_balance_cpu(struct sched_group *sg) | ||
6137 | { | ||
6138 | return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); | ||
6139 | } | ||
6140 | |||
6141 | static int | ||
6142 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) | ||
6143 | { | ||
6144 | struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; | ||
6145 | const struct cpumask *span = sched_domain_span(sd); | ||
6146 | struct cpumask *covered = sched_domains_tmpmask; | ||
6147 | struct sd_data *sdd = sd->private; | ||
6148 | struct sched_domain *sibling; | ||
6149 | int i; | ||
6150 | |||
6151 | cpumask_clear(covered); | ||
6152 | |||
6153 | for_each_cpu(i, span) { | ||
6154 | struct cpumask *sg_span; | ||
6155 | |||
6156 | if (cpumask_test_cpu(i, covered)) | ||
6157 | continue; | ||
6158 | |||
6159 | sibling = *per_cpu_ptr(sdd->sd, i); | ||
6160 | |||
6161 | /* See the comment near build_group_mask(). */ | ||
6162 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) | ||
6163 | continue; | ||
6164 | |||
6165 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
6166 | GFP_KERNEL, cpu_to_node(cpu)); | ||
6167 | |||
6168 | if (!sg) | ||
6169 | goto fail; | ||
6170 | |||
6171 | sg_span = sched_group_cpus(sg); | ||
6172 | if (sibling->child) | ||
6173 | cpumask_copy(sg_span, sched_domain_span(sibling->child)); | ||
6174 | else | ||
6175 | cpumask_set_cpu(i, sg_span); | ||
6176 | |||
6177 | cpumask_or(covered, covered, sg_span); | ||
6178 | |||
6179 | sg->sgc = *per_cpu_ptr(sdd->sgc, i); | ||
6180 | if (atomic_inc_return(&sg->sgc->ref) == 1) | ||
6181 | build_group_mask(sd, sg); | ||
6182 | |||
6183 | /* | ||
6184 | * Initialize sgc->capacity such that even if we mess up the | ||
6185 | * domains and no possible iteration will get us here, we won't | ||
6186 | * die on a /0 trap. | ||
6187 | */ | ||
6188 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); | ||
6189 | sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; | ||
6190 | |||
6191 | /* | ||
6192 | * Make sure the first group of this domain contains the | ||
6193 | * canonical balance cpu. Otherwise the sched_domain iteration | ||
6194 | * breaks. See update_sg_lb_stats(). | ||
6195 | */ | ||
6196 | if ((!groups && cpumask_test_cpu(cpu, sg_span)) || | ||
6197 | group_balance_cpu(sg) == cpu) | ||
6198 | groups = sg; | ||
6199 | |||
6200 | if (!first) | ||
6201 | first = sg; | ||
6202 | if (last) | ||
6203 | last->next = sg; | ||
6204 | last = sg; | ||
6205 | last->next = first; | ||
6206 | } | ||
6207 | sd->groups = groups; | ||
6208 | |||
6209 | return 0; | ||
6210 | |||
6211 | fail: | ||
6212 | free_sched_groups(first, 0); | ||
6213 | |||
6214 | return -ENOMEM; | ||
6215 | } | ||
6216 | |||
6217 | static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) | ||
6218 | { | ||
6219 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); | ||
6220 | struct sched_domain *child = sd->child; | ||
6221 | |||
6222 | if (child) | ||
6223 | cpu = cpumask_first(sched_domain_span(child)); | ||
6224 | |||
6225 | if (sg) { | ||
6226 | *sg = *per_cpu_ptr(sdd->sg, cpu); | ||
6227 | (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); | ||
6228 | atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */ | ||
6229 | } | ||
6230 | |||
6231 | return cpu; | ||
6232 | } | ||
6233 | |||
6234 | /* | ||
6235 | * build_sched_groups will build a circular linked list of the groups | ||
6236 | * covered by the given span, and will set each group's ->cpumask correctly, | ||
6237 | * and ->cpu_capacity to 0. | ||
6238 | * | ||
6239 | * Assumes the sched_domain tree is fully constructed | ||
6240 | */ | ||
6241 | static int | ||
6242 | build_sched_groups(struct sched_domain *sd, int cpu) | ||
6243 | { | ||
6244 | struct sched_group *first = NULL, *last = NULL; | ||
6245 | struct sd_data *sdd = sd->private; | ||
6246 | const struct cpumask *span = sched_domain_span(sd); | ||
6247 | struct cpumask *covered; | ||
6248 | int i; | ||
6249 | |||
6250 | get_group(cpu, sdd, &sd->groups); | ||
6251 | atomic_inc(&sd->groups->ref); | ||
6252 | |||
6253 | if (cpu != cpumask_first(span)) | ||
6254 | return 0; | ||
6255 | |||
6256 | lockdep_assert_held(&sched_domains_mutex); | ||
6257 | covered = sched_domains_tmpmask; | ||
6258 | |||
6259 | cpumask_clear(covered); | ||
6260 | |||
6261 | for_each_cpu(i, span) { | ||
6262 | struct sched_group *sg; | ||
6263 | int group, j; | ||
6264 | |||
6265 | if (cpumask_test_cpu(i, covered)) | ||
6266 | continue; | ||
6267 | |||
6268 | group = get_group(i, sdd, &sg); | ||
6269 | cpumask_setall(sched_group_mask(sg)); | ||
6270 | |||
6271 | for_each_cpu(j, span) { | ||
6272 | if (get_group(j, sdd, NULL) != group) | ||
6273 | continue; | ||
6274 | |||
6275 | cpumask_set_cpu(j, covered); | ||
6276 | cpumask_set_cpu(j, sched_group_cpus(sg)); | ||
6277 | } | ||
6278 | |||
6279 | if (!first) | ||
6280 | first = sg; | ||
6281 | if (last) | ||
6282 | last->next = sg; | ||
6283 | last = sg; | ||
6284 | } | ||
6285 | last->next = first; | ||
6286 | |||
6287 | return 0; | ||
6288 | } | ||
6289 | |||
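build_sched_groups() strings the groups it creates into a circular singly-linked list, tracking first/last pointers and closing the ring with last->next = first. The same construction in miniature (self-contained, illustrative only):

	#include <stdio.h>
	#include <stdlib.h>

	struct group {
		int id;
		struct group *next;
	};

	/* Build a ring of n groups and return the first one. */
	static struct group *build_ring(int n)
	{
		struct group *first = NULL, *last = NULL;

		for (int i = 0; i < n; i++) {
			struct group *g = calloc(1, sizeof(*g));

			if (!g)
				exit(1);
			g->id = i;
			if (!first)
				first = g;
			if (last)
				last->next = g;
			last = g;
		}
		if (last)
			last->next = first;	/* close the circle */
		return first;
	}

	int main(void)
	{
		struct group *first = build_ring(3);
		struct group *g = first;

		do {
			printf("group %d\n", g->id);
			g = g->next;
		} while (g != first);
		return 0;
	}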
6290 | /* | ||
6291 | * Initialize sched groups cpu_capacity. | ||
6292 | * | ||
6293 | * cpu_capacity indicates the capacity of sched group, which is used while | ||
6294 | * distributing the load between different sched groups in a sched domain. | ||
6295 | * Typically cpu_capacity for all the groups in a sched domain will be same | ||
6296 | * unless there are asymmetries in the topology. If there are asymmetries, | ||
6297 | * group having more cpu_capacity will pickup more load compared to the | ||
6298 | * group having less cpu_capacity. | ||
6299 | */ | ||
6300 | static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) | ||
6301 | { | ||
6302 | struct sched_group *sg = sd->groups; | ||
6303 | |||
6304 | WARN_ON(!sg); | ||
6305 | |||
6306 | do { | ||
6307 | int cpu, max_cpu = -1; | ||
6308 | |||
6309 | sg->group_weight = cpumask_weight(sched_group_cpus(sg)); | ||
6310 | |||
6311 | if (!(sd->flags & SD_ASYM_PACKING)) | ||
6312 | goto next; | ||
6313 | |||
6314 | for_each_cpu(cpu, sched_group_cpus(sg)) { | ||
6315 | if (max_cpu < 0) | ||
6316 | max_cpu = cpu; | ||
6317 | else if (sched_asym_prefer(cpu, max_cpu)) | ||
6318 | max_cpu = cpu; | ||
6319 | } | ||
6320 | sg->asym_prefer_cpu = max_cpu; | ||
6321 | |||
6322 | next: | ||
6323 | sg = sg->next; | ||
6324 | } while (sg != sd->groups); | ||
6325 | |||
6326 | if (cpu != group_balance_cpu(sg)) | ||
6327 | return; | ||
6328 | |||
6329 | update_group_capacity(sd, cpu); | ||
6330 | } | ||
6331 | |||
6332 | /* | ||
6333 | * Initializers for schedule domains | ||
6334 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains() | ||
6335 | */ | ||
6336 | |||
6337 | static int default_relax_domain_level = -1; | ||
6338 | int sched_domain_level_max; | ||
6339 | |||
6340 | static int __init setup_relax_domain_level(char *str) | ||
6341 | { | ||
6342 | if (kstrtoint(str, 0, &default_relax_domain_level)) | ||
6343 | pr_warn("Unable to set relax_domain_level\n"); | ||
6344 | |||
6345 | return 1; | ||
6346 | } | ||
6347 | __setup("relax_domain_level=", setup_relax_domain_level); | ||
6348 | |||
6349 | static void set_domain_attribute(struct sched_domain *sd, | ||
6350 | struct sched_domain_attr *attr) | ||
6351 | { | ||
6352 | int request; | ||
6353 | |||
6354 | if (!attr || attr->relax_domain_level < 0) { | ||
6355 | if (default_relax_domain_level < 0) | ||
6356 | return; | ||
6357 | else | ||
6358 | request = default_relax_domain_level; | ||
6359 | } else | ||
6360 | request = attr->relax_domain_level; | ||
6361 | if (request < sd->level) { | ||
6362 | /* turn off idle balance on this domain */ | ||
6363 | sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); | ||
6364 | } else { | ||
6365 | /* turn on idle balance on this domain */ | ||
6366 | sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); | ||
6367 | } | ||
6368 | } | ||
6369 | |||
6370 | static void __sdt_free(const struct cpumask *cpu_map); | ||
6371 | static int __sdt_alloc(const struct cpumask *cpu_map); | ||
6372 | |||
6373 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | ||
6374 | const struct cpumask *cpu_map) | ||
6375 | { | ||
6376 | switch (what) { | ||
6377 | case sa_rootdomain: | ||
6378 | if (!atomic_read(&d->rd->refcount)) | ||
6379 | free_rootdomain(&d->rd->rcu); /* fall through */ | ||
6380 | case sa_sd: | ||
6381 | free_percpu(d->sd); /* fall through */ | ||
6382 | case sa_sd_storage: | ||
6383 | __sdt_free(cpu_map); /* fall through */ | ||
6384 | case sa_none: | ||
6385 | break; | ||
6386 | } | ||
6387 | } | ||
6388 | |||
6389 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, | ||
6390 | const struct cpumask *cpu_map) | ||
6391 | { | ||
6392 | memset(d, 0, sizeof(*d)); | ||
6393 | |||
6394 | if (__sdt_alloc(cpu_map)) | ||
6395 | return sa_sd_storage; | ||
6396 | d->sd = alloc_percpu(struct sched_domain *); | ||
6397 | if (!d->sd) | ||
6398 | return sa_sd_storage; | ||
6399 | d->rd = alloc_rootdomain(); | ||
6400 | if (!d->rd) | ||
6401 | return sa_sd; | ||
6402 | return sa_rootdomain; | ||
6403 | } | ||
6404 | |||
6405 | /* | ||
6406 | * NULL the sd_data elements we've used to build the sched_domain and | ||
6407 | * sched_group structure so that the subsequent __free_domain_allocs() | ||
6408 | * will not free the data we're using. | ||
6409 | */ | ||
6410 | static void claim_allocations(int cpu, struct sched_domain *sd) | ||
6411 | { | ||
6412 | struct sd_data *sdd = sd->private; | ||
6413 | |||
6414 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); | ||
6415 | *per_cpu_ptr(sdd->sd, cpu) = NULL; | ||
6416 | |||
6417 | if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) | ||
6418 | *per_cpu_ptr(sdd->sds, cpu) = NULL; | ||
6419 | |||
6420 | if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) | ||
6421 | *per_cpu_ptr(sdd->sg, cpu) = NULL; | ||
6422 | |||
6423 | if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) | ||
6424 | *per_cpu_ptr(sdd->sgc, cpu) = NULL; | ||
6425 | } | ||
6426 | |||
6427 | #ifdef CONFIG_NUMA | ||
6428 | static int sched_domains_numa_levels; | ||
6429 | enum numa_topology_type sched_numa_topology_type; | ||
6430 | static int *sched_domains_numa_distance; | ||
6431 | int sched_max_numa_distance; | ||
6432 | static struct cpumask ***sched_domains_numa_masks; | ||
6433 | static int sched_domains_curr_level; | ||
6434 | #endif | ||
6435 | |||
6436 | /* | ||
6437 | * SD_flags allowed in topology descriptions. | ||
6438 | * | ||
6439 | * These flags are purely descriptive of the topology and do not prescribe | ||
6440 | * behaviour. Behaviour is artificial and mapped in the below sd_init() | ||
6441 | * function: | ||
6442 | * | ||
6443 | * SD_SHARE_CPUCAPACITY - describes SMT topologies | ||
6444 | * SD_SHARE_PKG_RESOURCES - describes shared caches | ||
6445 | * SD_NUMA - describes NUMA topologies | ||
6446 | * SD_SHARE_POWERDOMAIN - describes shared power domain | ||
6447 | * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies | ||
6448 | * | ||
6449 | * Odd one out, which beside describing the topology has a quirk also | ||
6450 | * prescribes the desired behaviour that goes along with it: | ||
6451 | * | ||
6452 | * SD_ASYM_PACKING - describes SMT quirks | ||
6453 | */ | ||
6454 | #define TOPOLOGY_SD_FLAGS \ | ||
6455 | (SD_SHARE_CPUCAPACITY | \ | ||
6456 | SD_SHARE_PKG_RESOURCES | \ | ||
6457 | SD_NUMA | \ | ||
6458 | SD_ASYM_PACKING | \ | ||
6459 | SD_ASYM_CPUCAPACITY | \ | ||
6460 | SD_SHARE_POWERDOMAIN) | ||
6461 | |||
6462 | static struct sched_domain * | ||
6463 | sd_init(struct sched_domain_topology_level *tl, | ||
6464 | const struct cpumask *cpu_map, | ||
6465 | struct sched_domain *child, int cpu) | ||
6466 | { | ||
6467 | struct sd_data *sdd = &tl->data; | ||
6468 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); | ||
6469 | int sd_id, sd_weight, sd_flags = 0; | ||
6470 | |||
6471 | #ifdef CONFIG_NUMA | ||
6472 | /* | ||
6473 | * Ugly hack to pass state to sd_numa_mask()... | ||
6474 | */ | ||
6475 | sched_domains_curr_level = tl->numa_level; | ||
6476 | #endif | ||
6477 | |||
6478 | sd_weight = cpumask_weight(tl->mask(cpu)); | ||
6479 | |||
6480 | if (tl->sd_flags) | ||
6481 | sd_flags = (*tl->sd_flags)(); | ||
6482 | if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, | ||
6483 | "wrong sd_flags in topology description\n")) | ||
6484 | sd_flags &= ~TOPOLOGY_SD_FLAGS; | ||
6485 | |||
6486 | *sd = (struct sched_domain){ | ||
6487 | .min_interval = sd_weight, | ||
6488 | .max_interval = 2*sd_weight, | ||
6489 | .busy_factor = 32, | ||
6490 | .imbalance_pct = 125, | ||
6491 | |||
6492 | .cache_nice_tries = 0, | ||
6493 | .busy_idx = 0, | ||
6494 | .idle_idx = 0, | ||
6495 | .newidle_idx = 0, | ||
6496 | .wake_idx = 0, | ||
6497 | .forkexec_idx = 0, | ||
6498 | |||
6499 | .flags = 1*SD_LOAD_BALANCE | ||
6500 | | 1*SD_BALANCE_NEWIDLE | ||
6501 | | 1*SD_BALANCE_EXEC | ||
6502 | | 1*SD_BALANCE_FORK | ||
6503 | | 0*SD_BALANCE_WAKE | ||
6504 | | 1*SD_WAKE_AFFINE | ||
6505 | | 0*SD_SHARE_CPUCAPACITY | ||
6506 | | 0*SD_SHARE_PKG_RESOURCES | ||
6507 | | 0*SD_SERIALIZE | ||
6508 | | 0*SD_PREFER_SIBLING | ||
6509 | | 0*SD_NUMA | ||
6510 | | sd_flags | ||
6511 | , | ||
6512 | |||
6513 | .last_balance = jiffies, | ||
6514 | .balance_interval = sd_weight, | ||
6515 | .smt_gain = 0, | ||
6516 | .max_newidle_lb_cost = 0, | ||
6517 | .next_decay_max_lb_cost = jiffies, | ||
6518 | .child = child, | ||
6519 | #ifdef CONFIG_SCHED_DEBUG | ||
6520 | .name = tl->name, | ||
6521 | #endif | ||
6522 | }; | ||
6523 | |||
6524 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | ||
6525 | sd_id = cpumask_first(sched_domain_span(sd)); | ||
6526 | |||
6527 | /* | ||
6528 | * Convert topological properties into behaviour. | ||
6529 | */ | ||
6530 | |||
6531 | if (sd->flags & SD_ASYM_CPUCAPACITY) { | ||
6532 | struct sched_domain *t = sd; | ||
6533 | |||
6534 | for_each_lower_domain(t) | ||
6535 | t->flags |= SD_BALANCE_WAKE; | ||
6536 | } | ||
6537 | |||
6538 | if (sd->flags & SD_SHARE_CPUCAPACITY) { | ||
6539 | sd->flags |= SD_PREFER_SIBLING; | ||
6540 | sd->imbalance_pct = 110; | ||
6541 | sd->smt_gain = 1178; /* ~15% */ | ||
6542 | |||
6543 | } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { | ||
6544 | sd->imbalance_pct = 117; | ||
6545 | sd->cache_nice_tries = 1; | ||
6546 | sd->busy_idx = 2; | ||
6547 | |||
6548 | #ifdef CONFIG_NUMA | ||
6549 | } else if (sd->flags & SD_NUMA) { | ||
6550 | sd->cache_nice_tries = 2; | ||
6551 | sd->busy_idx = 3; | ||
6552 | sd->idle_idx = 2; | ||
6553 | |||
6554 | sd->flags |= SD_SERIALIZE; | ||
6555 | if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { | ||
6556 | sd->flags &= ~(SD_BALANCE_EXEC | | ||
6557 | SD_BALANCE_FORK | | ||
6558 | SD_WAKE_AFFINE); | ||
6559 | } | ||
6560 | |||
6561 | #endif | ||
6562 | } else { | ||
6563 | sd->flags |= SD_PREFER_SIBLING; | ||
6564 | sd->cache_nice_tries = 1; | ||
6565 | sd->busy_idx = 2; | ||
6566 | sd->idle_idx = 1; | ||
6567 | } | ||
6568 | |||
6569 | /* | ||
6570 | * For all levels sharing cache; connect a sched_domain_shared | ||
6571 | * instance. | ||
6572 | */ | ||
6573 | if (sd->flags & SD_SHARE_PKG_RESOURCES) { | ||
6574 | sd->shared = *per_cpu_ptr(sdd->sds, sd_id); | ||
6575 | atomic_inc(&sd->shared->ref); | ||
6576 | atomic_set(&sd->shared->nr_busy_cpus, sd_weight); | ||
6577 | } | ||
6578 | |||
6579 | sd->private = sdd; | ||
6580 | |||
6581 | return sd; | ||
6582 | } | ||
6583 | |||
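The .flags initializer in sd_init() above spells every candidate flag as either 1*FLAG or 0*FLAG, so the full set stays visible and a flag is toggled by editing a single digit. A tiny stand-alone example of that idiom; the flag names here are made up:

	#include <stdio.h>

	#define F_BALANCE	0x01
	#define F_WAKE_AFFINE	0x02
	#define F_SERIALIZE	0x04
	#define F_NUMA		0x08

	int main(void)
	{
		/* Every flag appears once; flipping a 0 to a 1 enables it. */
		unsigned int flags = 1 * F_BALANCE
				   | 1 * F_WAKE_AFFINE
				   | 0 * F_SERIALIZE
				   | 0 * F_NUMA;

		printf("flags = 0x%x\n", flags);
		return 0;
	}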
6584 | /* | ||
6585 | * Topology list, bottom-up. | ||
6586 | */ | ||
6587 | static struct sched_domain_topology_level default_topology[] = { | ||
6588 | #ifdef CONFIG_SCHED_SMT | ||
6589 | { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, | ||
6590 | #endif | ||
6591 | #ifdef CONFIG_SCHED_MC | ||
6592 | { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, | ||
6593 | #endif | ||
6594 | { cpu_cpu_mask, SD_INIT_NAME(DIE) }, | ||
6595 | { NULL, }, | ||
6596 | }; | ||
6597 | |||
6598 | static struct sched_domain_topology_level *sched_domain_topology = | ||
6599 | default_topology; | ||
6600 | |||
6601 | #define for_each_sd_topology(tl) \ | ||
6602 | for (tl = sched_domain_topology; tl->mask; tl++) | ||
6603 | |||
6604 | void set_sched_topology(struct sched_domain_topology_level *tl) | ||
6605 | { | ||
6606 | if (WARN_ON_ONCE(sched_smp_initialized)) | ||
6607 | return; | ||
6608 | |||
6609 | sched_domain_topology = tl; | ||
6610 | } | ||
6611 | |||
6612 | #ifdef CONFIG_NUMA | ||
6613 | |||
6614 | static const struct cpumask *sd_numa_mask(int cpu) | ||
6615 | { | ||
6616 | return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; | ||
6617 | } | ||
6618 | |||
6619 | static void sched_numa_warn(const char *str) | ||
6620 | { | ||
6621 | static int done = false; | ||
6622 | int i,j; | ||
6623 | |||
6624 | if (done) | ||
6625 | return; | ||
6626 | |||
6627 | done = true; | ||
6628 | |||
6629 | printk(KERN_WARNING "ERROR: %s\n\n", str); | ||
6630 | |||
6631 | for (i = 0; i < nr_node_ids; i++) { | ||
6632 | printk(KERN_WARNING " "); | ||
6633 | for (j = 0; j < nr_node_ids; j++) | ||
6634 | printk(KERN_CONT "%02d ", node_distance(i,j)); | ||
6635 | printk(KERN_CONT "\n"); | ||
6636 | } | ||
6637 | printk(KERN_WARNING "\n"); | ||
6638 | } | ||
6639 | |||
6640 | bool find_numa_distance(int distance) | ||
6641 | { | ||
6642 | int i; | ||
6643 | |||
6644 | if (distance == node_distance(0, 0)) | ||
6645 | return true; | ||
6646 | |||
6647 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
6648 | if (sched_domains_numa_distance[i] == distance) | ||
6649 | return true; | ||
6650 | } | ||
6651 | |||
6652 | return false; | ||
6653 | } | ||
6654 | |||
6655 | /* | ||
6656 | * A system can have three types of NUMA topology: | ||
6657 | * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system | ||
6658 | * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes | ||
6659 | * NUMA_BACKPLANE: nodes can reach other nodes through a backplane | ||
6660 | * | ||
6661 | * The difference between a glueless mesh topology and a backplane | ||
6662 | * topology lies in whether communication between not directly | ||
6663 | * connected nodes goes through intermediary nodes (where programs | ||
6664 | * could run), or through backplane controllers. This affects | ||
6665 | * placement of programs. | ||
6666 | * | ||
6667 | * The type of topology can be discerned with the following tests: | ||
6668 | * - If the maximum distance between any nodes is 1 hop, the system | ||
6669 | * is directly connected. | ||
6670 | * - If for two nodes A and B, located N > 1 hops away from each other, | ||
6671 | * there is an intermediary node C, which is < N hops away from both | ||
6672 | * nodes A and B, the system is a glueless mesh. | ||
6673 | */ | ||
6674 | static void init_numa_topology_type(void) | ||
6675 | { | ||
6676 | int a, b, c, n; | ||
6677 | |||
6678 | n = sched_max_numa_distance; | ||
6679 | |||
6680 | if (sched_domains_numa_levels <= 1) { | ||
6681 | sched_numa_topology_type = NUMA_DIRECT; | ||
6682 | return; | ||
6683 | } | ||
6684 | |||
6685 | for_each_online_node(a) { | ||
6686 | for_each_online_node(b) { | ||
6687 | /* Find two nodes furthest removed from each other. */ | ||
6688 | if (node_distance(a, b) < n) | ||
6689 | continue; | ||
6690 | |||
6691 | /* Is there an intermediary node between a and b? */ | ||
6692 | for_each_online_node(c) { | ||
6693 | if (node_distance(a, c) < n && | ||
6694 | node_distance(b, c) < n) { | ||
6695 | sched_numa_topology_type = | ||
6696 | NUMA_GLUELESS_MESH; | ||
6697 | return; | ||
6698 | } | ||
6699 | } | ||
6700 | |||
6701 | sched_numa_topology_type = NUMA_BACKPLANE; | ||
6702 | return; | ||
6703 | } | ||
6704 | } | ||
6705 | } | ||
6706 | |||
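init_numa_topology_type() above classifies the machine as direct, glueless mesh, or backplane purely from the node-distance table: one remote distance level means direct; a furthest pair with a strictly closer intermediary means glueless mesh; otherwise backplane. A self-contained sketch applying the same tests to an invented 4-node distance matrix (the matrix is purely illustrative):

	#include <stdbool.h>
	#include <stdio.h>

	#define NODES 4

	/* Invented distances: four nodes in a line, remote traffic relayed via neighbours. */
	static const int dist[NODES][NODES] = {
		{ 10, 20, 30, 40 },
		{ 20, 10, 20, 30 },
		{ 30, 20, 10, 20 },
		{ 40, 30, 20, 10 },
	};

	int main(void)
	{
		int local = dist[0][0], max = local, levels = 0;
		int seen[NODES * NODES];

		/* Count the distinct remote distances and remember the largest. */
		for (int i = 0; i < NODES; i++)
			for (int j = 0; j < NODES; j++) {
				int d = dist[i][j];
				bool is_new = (d != local);

				for (int k = 0; k < levels; k++)
					if (seen[k] == d)
						is_new = false;
				if (is_new)
					seen[levels++] = d;
				if (d > max)
					max = d;
			}

		if (levels <= 1) {
			puts("NUMA_DIRECT");
			return 0;
		}

		/* Furthest pair (a, b): is there a node c strictly closer to both? */
		for (int a = 0; a < NODES; a++)
			for (int b = 0; b < NODES; b++) {
				if (dist[a][b] < max)
					continue;
				for (int c = 0; c < NODES; c++)
					if (dist[a][c] < max && dist[b][c] < max) {
						puts("NUMA_GLUELESS_MESH");
						return 0;
					}
				puts("NUMA_BACKPLANE");
				return 0;
			}
		return 0;
	}

With this matrix the program prints NUMA_GLUELESS_MESH, since node 1 sits between the furthest pair (0, 3).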
6707 | static void sched_init_numa(void) | ||
6708 | { | ||
6709 | int next_distance, curr_distance = node_distance(0, 0); | ||
6710 | struct sched_domain_topology_level *tl; | ||
6711 | int level = 0; | ||
6712 | int i, j, k; | ||
6713 | |||
6714 | sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); | ||
6715 | if (!sched_domains_numa_distance) | ||
6716 | return; | ||
6717 | |||
6718 | /* | ||
6719 | * O(nr_nodes^2) deduplicating selection sort -- in order to find the | ||
6720 | * unique distances in the node_distance() table. | ||
6721 | * | ||
6722 | * Assumes node_distance(0,j) includes all distances in | ||
6723 | * node_distance(i,j) in order to avoid cubic time. | ||
6724 | */ | ||
6725 | next_distance = curr_distance; | ||
6726 | for (i = 0; i < nr_node_ids; i++) { | ||
6727 | for (j = 0; j < nr_node_ids; j++) { | ||
6728 | for (k = 0; k < nr_node_ids; k++) { | ||
6729 | int distance = node_distance(i, k); | ||
6730 | |||
6731 | if (distance > curr_distance && | ||
6732 | (distance < next_distance || | ||
6733 | next_distance == curr_distance)) | ||
6734 | next_distance = distance; | ||
6735 | |||
6736 | /* | ||
6737 | * While not a strong assumption it would be nice to know | ||
6738 | * about cases where if node A is connected to B, B is not | ||
6739 | * equally connected to A. | ||
6740 | */ | ||
6741 | if (sched_debug() && node_distance(k, i) != distance) | ||
6742 | sched_numa_warn("Node-distance not symmetric"); | ||
6743 | |||
6744 | if (sched_debug() && i && !find_numa_distance(distance)) | ||
6745 | sched_numa_warn("Node-0 not representative"); | ||
6746 | } | ||
6747 | if (next_distance != curr_distance) { | ||
6748 | sched_domains_numa_distance[level++] = next_distance; | ||
6749 | sched_domains_numa_levels = level; | ||
6750 | curr_distance = next_distance; | ||
6751 | } else break; | ||
6752 | } | ||
6753 | |||
6754 | /* | ||
6755 | * In case of sched_debug() we verify the above assumption. | ||
6756 | */ | ||
6757 | if (!sched_debug()) | ||
6758 | break; | ||
6759 | } | ||
6760 | |||
6761 | if (!level) | ||
6762 | return; | ||
6763 | |||
6764 | /* | ||
6765 | * 'level' contains the number of unique distances, excluding the | ||
6766 | * identity distance node_distance(i,i). | ||
6767 | * | ||
6768 | * The sched_domains_numa_distance[] array includes the actual distance | ||
6769 | * numbers. | ||
6770 | */ | ||
6771 | |||
6772 | /* | ||
6773 | * Here, we should temporarily reset sched_domains_numa_levels to 0. | ||
6774 | * If it fails to allocate memory for the sched_domains_numa_masks[][] array, | ||
6775 | * the array will contain fewer than 'level' members. This could be | ||
6776 | * dangerous when we use it to iterate the sched_domains_numa_masks[][] array | ||
6777 | * in other functions. | ||
6778 | * | ||
6779 | * We reset it to 'level' at the end of this function. | ||
6780 | */ | ||
6781 | sched_domains_numa_levels = 0; | ||
6782 | |||
6783 | sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); | ||
6784 | if (!sched_domains_numa_masks) | ||
6785 | return; | ||
6786 | |||
6787 | /* | ||
6788 | * Now for each level, construct a mask per node which contains all | ||
6789 | * cpus of nodes that are that many hops away from us. | ||
6790 | */ | ||
6791 | for (i = 0; i < level; i++) { | ||
6792 | sched_domains_numa_masks[i] = | ||
6793 | kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); | ||
6794 | if (!sched_domains_numa_masks[i]) | ||
6795 | return; | ||
6796 | |||
6797 | for (j = 0; j < nr_node_ids; j++) { | ||
6798 | struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); | ||
6799 | if (!mask) | ||
6800 | return; | ||
6801 | |||
6802 | sched_domains_numa_masks[i][j] = mask; | ||
6803 | |||
6804 | for_each_node(k) { | ||
6805 | if (node_distance(j, k) > sched_domains_numa_distance[i]) | ||
6806 | continue; | ||
6807 | |||
6808 | cpumask_or(mask, mask, cpumask_of_node(k)); | ||
6809 | } | ||
6810 | } | ||
6811 | } | ||
6812 | |||
6813 | /* Compute default topology size */ | ||
6814 | for (i = 0; sched_domain_topology[i].mask; i++); | ||
6815 | |||
6816 | tl = kzalloc((i + level + 1) * | ||
6817 | sizeof(struct sched_domain_topology_level), GFP_KERNEL); | ||
6818 | if (!tl) | ||
6819 | return; | ||
6820 | |||
6821 | /* | ||
6822 | * Copy the default topology bits.. | ||
6823 | */ | ||
6824 | for (i = 0; sched_domain_topology[i].mask; i++) | ||
6825 | tl[i] = sched_domain_topology[i]; | ||
6826 | |||
6827 | /* | ||
6828 | * .. and append 'j' levels of NUMA goodness. | ||
6829 | */ | ||
6830 | for (j = 0; j < level; i++, j++) { | ||
6831 | tl[i] = (struct sched_domain_topology_level){ | ||
6832 | .mask = sd_numa_mask, | ||
6833 | .sd_flags = cpu_numa_flags, | ||
6834 | .flags = SDTL_OVERLAP, | ||
6835 | .numa_level = j, | ||
6836 | SD_INIT_NAME(NUMA) | ||
6837 | }; | ||
6838 | } | ||
6839 | |||
6840 | sched_domain_topology = tl; | ||
6841 | |||
6842 | sched_domains_numa_levels = level; | ||
6843 | sched_max_numa_distance = sched_domains_numa_distance[level - 1]; | ||
6844 | |||
6845 | init_numa_topology_type(); | ||
6846 | } | ||
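
The deduplicating selection sort described above is easier to follow on a toy matrix. The sketch below is plain user-space C with an invented dist[][] table; it scans the whole matrix instead of relying on the kernel's assumption that row 0 already contains every distance, but it extracts the same thing that ends up in sched_domains_numa_distance[]: the sorted unique non-local distances.

#include <stdio.h>

#define N 4
static const int dist[N][N] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};

int main(void)
{
	int levels[N * N];
	int level = 0;
	int curr = dist[0][0];	/* identity distance, excluded from levels */
	int next, i, j;

	/*
	 * Repeatedly pick the smallest distance strictly greater than the
	 * previous one: a deduplicating selection sort, one pass per level.
	 */
	for (;;) {
		next = curr;
		for (i = 0; i < N; i++)
			for (j = 0; j < N; j++)
				if (dist[i][j] > curr &&
				    (dist[i][j] < next || next == curr))
					next = dist[i][j];
		if (next == curr)
			break;		/* nothing larger left: done */
		levels[level++] = next;
		curr = next;
	}

	for (i = 0; i < level; i++)
		printf("level %d: distance %d\n", i, levels[i]);
	/* prints: "level 0: distance 20" and "level 1: distance 30" */
	return 0;
}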
6847 | |||
6848 | static void sched_domains_numa_masks_set(unsigned int cpu) | ||
6849 | { | ||
6850 | int node = cpu_to_node(cpu); | ||
6851 | int i, j; | ||
6852 | |||
6853 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
6854 | for (j = 0; j < nr_node_ids; j++) { | ||
6855 | if (node_distance(j, node) <= sched_domains_numa_distance[i]) | ||
6856 | cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); | ||
6857 | } | ||
6858 | } | ||
6859 | } | ||
6860 | |||
6861 | static void sched_domains_numa_masks_clear(unsigned int cpu) | ||
6862 | { | ||
6863 | int i, j; | ||
6864 | |||
6865 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
6866 | for (j = 0; j < nr_node_ids; j++) | ||
6867 | cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); | ||
6868 | } | ||
6869 | } | ||
6870 | |||
6871 | #else | ||
6872 | static inline void sched_init_numa(void) { } | ||
6873 | static void sched_domains_numa_masks_set(unsigned int cpu) { } | ||
6874 | static void sched_domains_numa_masks_clear(unsigned int cpu) { } | ||
6875 | #endif /* CONFIG_NUMA */ | ||
6876 | |||
6877 | static int __sdt_alloc(const struct cpumask *cpu_map) | ||
6878 | { | ||
6879 | struct sched_domain_topology_level *tl; | ||
6880 | int j; | ||
6881 | |||
6882 | for_each_sd_topology(tl) { | ||
6883 | struct sd_data *sdd = &tl->data; | ||
6884 | |||
6885 | sdd->sd = alloc_percpu(struct sched_domain *); | ||
6886 | if (!sdd->sd) | ||
6887 | return -ENOMEM; | ||
6888 | |||
6889 | sdd->sds = alloc_percpu(struct sched_domain_shared *); | ||
6890 | if (!sdd->sds) | ||
6891 | return -ENOMEM; | ||
6892 | |||
6893 | sdd->sg = alloc_percpu(struct sched_group *); | ||
6894 | if (!sdd->sg) | ||
6895 | return -ENOMEM; | ||
6896 | |||
6897 | sdd->sgc = alloc_percpu(struct sched_group_capacity *); | ||
6898 | if (!sdd->sgc) | ||
6899 | return -ENOMEM; | ||
6900 | |||
6901 | for_each_cpu(j, cpu_map) { | ||
6902 | struct sched_domain *sd; | ||
6903 | struct sched_domain_shared *sds; | ||
6904 | struct sched_group *sg; | ||
6905 | struct sched_group_capacity *sgc; | ||
6906 | |||
6907 | sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), | ||
6908 | GFP_KERNEL, cpu_to_node(j)); | ||
6909 | if (!sd) | ||
6910 | return -ENOMEM; | ||
6911 | |||
6912 | *per_cpu_ptr(sdd->sd, j) = sd; | ||
6913 | |||
6914 | sds = kzalloc_node(sizeof(struct sched_domain_shared), | ||
6915 | GFP_KERNEL, cpu_to_node(j)); | ||
6916 | if (!sds) | ||
6917 | return -ENOMEM; | ||
6918 | |||
6919 | *per_cpu_ptr(sdd->sds, j) = sds; | ||
6920 | |||
6921 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
6922 | GFP_KERNEL, cpu_to_node(j)); | ||
6923 | if (!sg) | ||
6924 | return -ENOMEM; | ||
6925 | |||
6926 | sg->next = sg; | ||
6927 | |||
6928 | *per_cpu_ptr(sdd->sg, j) = sg; | ||
6929 | |||
6930 | sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), | ||
6931 | GFP_KERNEL, cpu_to_node(j)); | ||
6932 | if (!sgc) | ||
6933 | return -ENOMEM; | ||
6934 | |||
6935 | *per_cpu_ptr(sdd->sgc, j) = sgc; | ||
6936 | } | ||
6937 | } | ||
6938 | |||
6939 | return 0; | ||
6940 | } | ||
6941 | |||
6942 | static void __sdt_free(const struct cpumask *cpu_map) | ||
6943 | { | ||
6944 | struct sched_domain_topology_level *tl; | ||
6945 | int j; | ||
6946 | |||
6947 | for_each_sd_topology(tl) { | ||
6948 | struct sd_data *sdd = &tl->data; | ||
6949 | |||
6950 | for_each_cpu(j, cpu_map) { | ||
6951 | struct sched_domain *sd; | ||
6952 | |||
6953 | if (sdd->sd) { | ||
6954 | sd = *per_cpu_ptr(sdd->sd, j); | ||
6955 | if (sd && (sd->flags & SD_OVERLAP)) | ||
6956 | free_sched_groups(sd->groups, 0); | ||
6957 | kfree(*per_cpu_ptr(sdd->sd, j)); | ||
6958 | } | ||
6959 | |||
6960 | if (sdd->sds) | ||
6961 | kfree(*per_cpu_ptr(sdd->sds, j)); | ||
6962 | if (sdd->sg) | ||
6963 | kfree(*per_cpu_ptr(sdd->sg, j)); | ||
6964 | if (sdd->sgc) | ||
6965 | kfree(*per_cpu_ptr(sdd->sgc, j)); | ||
6966 | } | ||
6967 | free_percpu(sdd->sd); | ||
6968 | sdd->sd = NULL; | ||
6969 | free_percpu(sdd->sds); | ||
6970 | sdd->sds = NULL; | ||
6971 | free_percpu(sdd->sg); | ||
6972 | sdd->sg = NULL; | ||
6973 | free_percpu(sdd->sgc); | ||
6974 | sdd->sgc = NULL; | ||
6975 | } | ||
6976 | } | ||
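
__sdt_alloc() and __sdt_free() follow a common pattern: allocate an array of per-CPU pointers first, then allocate each object close to the CPU that will use it, and make the free path tolerate a partially completed allocation. The rough user-space analogue below shows only that shape; struct item, table_alloc() and table_free() are hypothetical names, and plain calloc()/free() stand in for alloc_percpu()/kzalloc_node().

#include <stdlib.h>

struct item {
	int cpu;		/* ... payload ... */
};

struct table {
	struct item **per_cpu;	/* one pointer per CPU, like sdd->sd */
	int nr_cpu;
};

/* Allocate the pointer array, then one object per CPU. */
static int table_alloc(struct table *t, int nr_cpu)
{
	int j;

	t->nr_cpu = nr_cpu;
	t->per_cpu = calloc(nr_cpu, sizeof(*t->per_cpu));
	if (!t->per_cpu)
		return -1;

	for (j = 0; j < nr_cpu; j++) {
		t->per_cpu[j] = calloc(1, sizeof(struct item));
		if (!t->per_cpu[j])
			return -1;	/* caller still calls table_free() */
		t->per_cpu[j]->cpu = j;
	}
	return 0;
}

/* The free path copes with a partially completed table_alloc(). */
static void table_free(struct table *t)
{
	int j;

	if (!t->per_cpu)
		return;
	for (j = 0; j < t->nr_cpu; j++)
		free(t->per_cpu[j]);	/* free(NULL) is a no-op */
	free(t->per_cpu);
	t->per_cpu = NULL;
}

int main(void)
{
	struct table t = { 0 };

	if (table_alloc(&t, 8) == 0)
		t.per_cpu[3]->cpu = 3;	/* use one per-CPU slot */
	table_free(&t);			/* safe even after a partial failure */
	return 0;
}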
6977 | |||
6978 | struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | ||
6979 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
6980 | struct sched_domain *child, int cpu) | ||
6981 | { | ||
6982 | struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); | ||
6983 | |||
6984 | if (child) { | ||
6985 | sd->level = child->level + 1; | ||
6986 | sched_domain_level_max = max(sched_domain_level_max, sd->level); | ||
6987 | child->parent = sd; | ||
6988 | |||
6989 | if (!cpumask_subset(sched_domain_span(child), | ||
6990 | sched_domain_span(sd))) { | ||
6991 | pr_err("BUG: arch topology borken\n"); | ||
6992 | #ifdef CONFIG_SCHED_DEBUG | ||
6993 | pr_err(" the %s domain not a subset of the %s domain\n", | ||
6994 | child->name, sd->name); | ||
6995 | #endif | ||
6996 | /* Fixup, ensure @sd has at least @child cpus. */ | ||
6997 | cpumask_or(sched_domain_span(sd), | ||
6998 | sched_domain_span(sd), | ||
6999 | sched_domain_span(child)); | ||
7000 | } | ||
7001 | |||
7002 | } | ||
7003 | set_domain_attribute(sd, attr); | ||
7004 | |||
7005 | return sd; | ||
7006 | } | ||
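
The "arch topology borken" branch above verifies that a child domain's CPU span is contained in its parent's span, and widens the parent when it is not. With plain bit masks standing in for struct cpumask (the CPU numbers are arbitrary), the cpumask_subset() test and the cpumask_or() fix-up reduce to:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t child  = 0x0f;		/* CPUs 0-3 */
	uint64_t parent = 0x0e;		/* CPUs 1-3: broken, misses CPU 0 */

	/* cpumask_subset(child, parent) equivalent: */
	if (child & ~parent) {
		printf("BUG: child span not a subset of the parent span\n");
		/* cpumask_or() fix-up: the parent must cover the child */
		parent |= child;
	}
	printf("parent span is now 0x%llx\n", (unsigned long long)parent);
	return 0;
}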
7007 | |||
7008 | /* | 5682 | /* |
7009 | * Build sched domains for a given set of cpus and attach the sched domains | 5683 | * used to mark begin/end of suspend/resume: |
7010 | * to the individual cpus | ||
7011 | */ | 5684 | */ |
7012 | static int build_sched_domains(const struct cpumask *cpu_map, | 5685 | static int num_cpus_frozen; |
7013 | struct sched_domain_attr *attr) | ||
7014 | { | ||
7015 | enum s_alloc alloc_state; | ||
7016 | struct sched_domain *sd; | ||
7017 | struct s_data d; | ||
7018 | struct rq *rq = NULL; | ||
7019 | int i, ret = -ENOMEM; | ||
7020 | |||
7021 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); | ||
7022 | if (alloc_state != sa_rootdomain) | ||
7023 | goto error; | ||
7024 | |||
7025 | /* Set up domains for cpus specified by the cpu_map. */ | ||
7026 | for_each_cpu(i, cpu_map) { | ||
7027 | struct sched_domain_topology_level *tl; | ||
7028 | |||
7029 | sd = NULL; | ||
7030 | for_each_sd_topology(tl) { | ||
7031 | sd = build_sched_domain(tl, cpu_map, attr, sd, i); | ||
7032 | if (tl == sched_domain_topology) | ||
7033 | *per_cpu_ptr(d.sd, i) = sd; | ||
7034 | if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) | ||
7035 | sd->flags |= SD_OVERLAP; | ||
7036 | if (cpumask_equal(cpu_map, sched_domain_span(sd))) | ||
7037 | break; | ||
7038 | } | ||
7039 | } | ||
7040 | |||
7041 | /* Build the groups for the domains */ | ||
7042 | for_each_cpu(i, cpu_map) { | ||
7043 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { | ||
7044 | sd->span_weight = cpumask_weight(sched_domain_span(sd)); | ||
7045 | if (sd->flags & SD_OVERLAP) { | ||
7046 | if (build_overlap_sched_groups(sd, i)) | ||
7047 | goto error; | ||
7048 | } else { | ||
7049 | if (build_sched_groups(sd, i)) | ||
7050 | goto error; | ||
7051 | } | ||
7052 | } | ||
7053 | } | ||
7054 | |||
7055 | /* Calculate CPU capacity for physical packages and nodes */ | ||
7056 | for (i = nr_cpumask_bits-1; i >= 0; i--) { | ||
7057 | if (!cpumask_test_cpu(i, cpu_map)) | ||
7058 | continue; | ||
7059 | |||
7060 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { | ||
7061 | claim_allocations(i, sd); | ||
7062 | init_sched_groups_capacity(i, sd); | ||
7063 | } | ||
7064 | } | ||
7065 | |||
7066 | /* Attach the domains */ | ||
7067 | rcu_read_lock(); | ||
7068 | for_each_cpu(i, cpu_map) { | ||
7069 | rq = cpu_rq(i); | ||
7070 | sd = *per_cpu_ptr(d.sd, i); | ||
7071 | |||
7072 | /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ | ||
7073 | if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) | ||
7074 | WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); | ||
7075 | |||
7076 | cpu_attach_domain(sd, d.rd, i); | ||
7077 | } | ||
7078 | rcu_read_unlock(); | ||
7079 | |||
7080 | if (rq && sched_debug_enabled) { | ||
7081 | pr_info("span: %*pbl (max cpu_capacity = %lu)\n", | ||
7082 | cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); | ||
7083 | } | ||
7084 | |||
7085 | ret = 0; | ||
7086 | error: | ||
7087 | __free_domain_allocs(&d, alloc_state, cpu_map); | ||
7088 | return ret; | ||
7089 | } | ||
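
Note how the per-CPU domain stack above is built bottom-up over the topology levels and the inner loop stops as soon as one level's span already covers the whole cpu_map, so redundant wider levels are never instantiated. A toy sketch of just that early-stop condition; the level_span[] values and the 8-CPU map are invented, and spans are clipped to cpu_map before the equality test, as the kernel does.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Hypothetical spans for one CPU at three levels: SMT, MC, NUMA. */
	uint64_t level_span[] = { 0x3, 0xff, 0xffff };
	uint64_t cpu_map = 0xff;	/* only 8 CPUs are being built */
	int i, built = 0;

	for (i = 0; i < 3; i++) {
		uint64_t span = level_span[i] & cpu_map;

		built++;
		if (span == cpu_map)	/* cpumask_equal() analogue */
			break;		/* wider levels add nothing */
	}
	printf("built %d of 3 levels\n", built);	/* prints 2 */
	return 0;
}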
7090 | |||
7091 | static cpumask_var_t *doms_cur; /* current sched domains */ | ||
7092 | static int ndoms_cur; /* number of sched domains in 'doms_cur' */ | ||
7093 | static struct sched_domain_attr *dattr_cur; | ||
7094 | /* attributes of custom domains in 'doms_cur' */ | ||
7095 | |||
7096 | /* | ||
7097 | * Special case: If a kmalloc of a doms_cur partition (array of | ||
7098 | * cpumask) fails, then fall back to a single sched domain, | ||
7099 | * as determined by the single cpumask fallback_doms. | ||
7100 | */ | ||
7101 | static cpumask_var_t fallback_doms; | ||
7102 | |||
7103 | /* | ||
7104 | * arch_update_cpu_topology lets virtualized architectures update the | ||
7105 | * cpu core maps. It is supposed to return 1 if the topology changed | ||
7106 | * or 0 if it stayed the same. | ||
7107 | */ | ||
7108 | int __weak arch_update_cpu_topology(void) | ||
7109 | { | ||
7110 | return 0; | ||
7111 | } | ||
7112 | |||
7113 | cpumask_var_t *alloc_sched_domains(unsigned int ndoms) | ||
7114 | { | ||
7115 | int i; | ||
7116 | cpumask_var_t *doms; | ||
7117 | |||
7118 | doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); | ||
7119 | if (!doms) | ||
7120 | return NULL; | ||
7121 | for (i = 0; i < ndoms; i++) { | ||
7122 | if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { | ||
7123 | free_sched_domains(doms, i); | ||
7124 | return NULL; | ||
7125 | } | ||
7126 | } | ||
7127 | return doms; | ||
7128 | } | ||
7129 | |||
7130 | void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) | ||
7131 | { | ||
7132 | unsigned int i; | ||
7133 | for (i = 0; i < ndoms; i++) | ||
7134 | free_cpumask_var(doms[i]); | ||
7135 | kfree(doms); | ||
7136 | } | ||
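
alloc_sched_domains() unwinds only the cpumasks it managed to allocate before a failure, and free_sched_domains() frees exactly 'ndoms' masks plus the array itself. The user-space analogue below shows that allocate/unwind/free pairing; mask_t, alloc_doms() and free_doms() are stand-ins, not kernel APIs.

#include <stdlib.h>

typedef unsigned long mask_t;	/* stand-in for cpumask_var_t */

/* Analogue of free_sched_domains(): free 'ndoms' masks, then the array. */
static void free_doms(mask_t **doms, unsigned int ndoms)
{
	unsigned int i;

	for (i = 0; i < ndoms; i++)
		free(doms[i]);
	free(doms);
}

/*
 * Analogue of alloc_sched_domains(): if the i-th element cannot be
 * allocated, only the i masks already allocated are freed.
 */
static mask_t **alloc_doms(unsigned int ndoms)
{
	mask_t **doms;
	unsigned int i;

	doms = malloc(sizeof(*doms) * ndoms);
	if (!doms)
		return NULL;
	for (i = 0; i < ndoms; i++) {
		doms[i] = calloc(1, sizeof(mask_t));
		if (!doms[i]) {
			free_doms(doms, i);
			return NULL;
		}
	}
	return doms;
}

int main(void)
{
	mask_t **doms = alloc_doms(2);

	if (!doms)
		return 1;
	*doms[0] = 0x0f;	/* first partition: CPUs 0-3 */
	*doms[1] = 0xf0;	/* second partition: CPUs 4-7 */
	free_doms(doms, 2);
	return 0;
}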
7137 | |||
7138 | /* | ||
7139 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | ||
7140 | * For now this just excludes isolated cpus, but could be used to | ||
7141 | * exclude other special cases in the future. | ||
7142 | */ | ||
7143 | static int init_sched_domains(const struct cpumask *cpu_map) | ||
7144 | { | ||
7145 | int err; | ||
7146 | |||
7147 | arch_update_cpu_topology(); | ||
7148 | ndoms_cur = 1; | ||
7149 | doms_cur = alloc_sched_domains(ndoms_cur); | ||
7150 | if (!doms_cur) | ||
7151 | doms_cur = &fallback_doms; | ||
7152 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | ||
7153 | err = build_sched_domains(doms_cur[0], NULL); | ||
7154 | register_sched_domain_sysctl(); | ||
7155 | |||
7156 | return err; | ||
7157 | } | ||
7158 | |||
7159 | /* | ||
7160 | * Detach sched domains from a group of cpus specified in cpu_map | ||
7161 | * These cpus will now be attached to the NULL domain | ||
7162 | */ | ||
7163 | static void detach_destroy_domains(const struct cpumask *cpu_map) | ||
7164 | { | ||
7165 | int i; | ||
7166 | |||
7167 | rcu_read_lock(); | ||
7168 | for_each_cpu(i, cpu_map) | ||
7169 | cpu_attach_domain(NULL, &def_root_domain, i); | ||
7170 | rcu_read_unlock(); | ||
7171 | } | ||
7172 | |||
7173 | /* handle null as "default" */ | ||
7174 | static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | ||
7175 | struct sched_domain_attr *new, int idx_new) | ||
7176 | { | ||
7177 | struct sched_domain_attr tmp; | ||
7178 | |||
7179 | /* fast path */ | ||
7180 | if (!new && !cur) | ||
7181 | return 1; | ||
7182 | |||
7183 | tmp = SD_ATTR_INIT; | ||
7184 | return !memcmp(cur ? (cur + idx_cur) : &tmp, | ||
7185 | new ? (new + idx_new) : &tmp, | ||
7186 | sizeof(struct sched_domain_attr)); | ||
7187 | } | ||
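
The "handle null as default" comment means a NULL attribute array compares equal to an array filled with SD_ATTR_INIT defaults. The same idiom, reduced to a hypothetical one-field attribute struct, looks like this sketch:

#include <stdio.h>
#include <string.h>

struct attr { int relax_level; };		/* hypothetical attribute */

#define ATTR_INIT (struct attr){ .relax_level = -1 }

/* A NULL array means "every index holds the default attribute". */
static int attrs_equal(struct attr *cur, int i, struct attr *new, int j)
{
	struct attr tmp = ATTR_INIT;

	if (!cur && !new)
		return 1;
	return !memcmp(cur ? &cur[i] : &tmp,
		       new ? &new[j] : &tmp, sizeof(struct attr));
}

int main(void)
{
	struct attr one[] = { { .relax_level = -1 } };

	printf("%d\n", attrs_equal(NULL, 0, one, 0));	/* 1: default == default */
	return 0;
}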
7188 | |||
7189 | /* | ||
7190 | * Partition sched domains as specified by the 'ndoms_new' | ||
7191 | * cpumasks in the array doms_new[] of cpumasks. This compares | ||
7192 | * doms_new[] to the current sched domain partitioning, doms_cur[]. | ||
7193 | * It destroys each deleted domain and builds each new domain. | ||
7194 | * | ||
7195 | * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. | ||
7196 | * The masks don't intersect (don't overlap). We should set up one | ||
7197 | * sched domain for each mask. CPUs not in any of the cpumasks will | ||
7198 | * not be load balanced. If the same cpumask appears both in the | ||
7199 | * current 'doms_cur' domains and in the new 'doms_new', we can leave | ||
7200 | * it as it is. | ||
7201 | * | ||
7202 | * The passed-in 'doms_new' should be allocated using | ||
7203 | * alloc_sched_domains(). This routine takes ownership of it and will | ||
7204 | * free it with free_sched_domains() when done. If the caller failed | ||
7205 | * the alloc call, it can pass in doms_new == NULL && ndoms_new == 1; | ||
7206 | * partition_sched_domains() will then fall back to the single partition | ||
7207 | * 'fallback_doms' and force the domains to be rebuilt. | ||
7208 | * | ||
7209 | * If doms_new == NULL it will be replaced with cpu_online_mask. | ||
7210 | * ndoms_new == 0 is a special case for destroying existing domains, | ||
7211 | * and it will not create the default domain. | ||
7212 | * | ||
7213 | * Call with hotplug lock held | ||
7214 | */ | ||
7215 | void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], | ||
7216 | struct sched_domain_attr *dattr_new) | ||
7217 | { | ||
7218 | int i, j, n; | ||
7219 | int new_topology; | ||
7220 | |||
7221 | mutex_lock(&sched_domains_mutex); | ||
7222 | |||
7223 | /* always unregister in case we don't destroy any domains */ | ||
7224 | unregister_sched_domain_sysctl(); | ||
7225 | |||
7226 | /* Let architecture update cpu core mappings. */ | ||
7227 | new_topology = arch_update_cpu_topology(); | ||
7228 | |||
7229 | n = doms_new ? ndoms_new : 0; | ||
7230 | |||
7231 | /* Destroy deleted domains */ | ||
7232 | for (i = 0; i < ndoms_cur; i++) { | ||
7233 | for (j = 0; j < n && !new_topology; j++) { | ||
7234 | if (cpumask_equal(doms_cur[i], doms_new[j]) | ||
7235 | && dattrs_equal(dattr_cur, i, dattr_new, j)) | ||
7236 | goto match1; | ||
7237 | } | ||
7238 | /* no match - a current sched domain not in new doms_new[] */ | ||
7239 | detach_destroy_domains(doms_cur[i]); | ||
7240 | match1: | ||
7241 | ; | ||
7242 | } | ||
7243 | |||
7244 | n = ndoms_cur; | ||
7245 | if (doms_new == NULL) { | ||
7246 | n = 0; | ||
7247 | doms_new = &fallback_doms; | ||
7248 | cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); | ||
7249 | WARN_ON_ONCE(dattr_new); | ||
7250 | } | ||
7251 | |||
7252 | /* Build new domains */ | ||
7253 | for (i = 0; i < ndoms_new; i++) { | ||
7254 | for (j = 0; j < n && !new_topology; j++) { | ||
7255 | if (cpumask_equal(doms_new[i], doms_cur[j]) | ||
7256 | && dattrs_equal(dattr_new, i, dattr_cur, j)) | ||
7257 | goto match2; | ||
7258 | } | ||
7259 | /* no match - add a new doms_new */ | ||
7260 | build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); | ||
7261 | match2: | ||
7262 | ; | ||
7263 | } | ||
7264 | |||
7265 | /* Remember the new sched domains */ | ||
7266 | if (doms_cur != &fallback_doms) | ||
7267 | free_sched_domains(doms_cur, ndoms_cur); | ||
7268 | kfree(dattr_cur); /* kfree(NULL) is safe */ | ||
7269 | doms_cur = doms_new; | ||
7270 | dattr_cur = dattr_new; | ||
7271 | ndoms_cur = ndoms_new; | ||
7272 | |||
7273 | register_sched_domain_sysctl(); | ||
7274 | |||
7275 | mutex_unlock(&sched_domains_mutex); | ||
7276 | } | ||
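
Stripped of locking, attributes, sysctl registration and the fallback_doms special cases, the core of partition_sched_domains() is a two-pass diff between the old and the new partition sets: destroy what disappeared, build what is new, and leave identical partitions untouched. The self-contained sketch below keeps only that structure; mask_t, detach() and build() are placeholders for the real cpumask and domain operations.

#include <stdio.h>

typedef unsigned long mask_t;

static void detach(mask_t m) { printf("destroy domains for 0x%lx\n", m); }
static void build(mask_t m)  { printf("build domains for 0x%lx\n", m); }

static void repartition(mask_t *cur, int ncur, mask_t *new, int nnew)
{
	int i, j;

	/* Pass 1: destroy deleted partitions. */
	for (i = 0; i < ncur; i++) {
		for (j = 0; j < nnew; j++)
			if (cur[i] == new[j])
				goto match1;
		detach(cur[i]);
match1:		;
	}

	/* Pass 2: build only the partitions that did not exist before. */
	for (i = 0; i < nnew; i++) {
		for (j = 0; j < ncur; j++)
			if (new[i] == cur[j])
				goto match2;
		build(new[i]);
match2:		;
	}
}

int main(void)
{
	mask_t cur[] = { 0x0f, 0xf0 };
	mask_t new[] = { 0x0f, 0xff00 };

	repartition(cur, 2, new, 2);
	/* prints: destroy domains for 0xf0, build domains for 0xff00 */
	return 0;
}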
7277 | |||
7278 | static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ | ||
7279 | 5686 | ||
7280 | /* | 5687 | /* |
7281 | * Update cpusets according to cpu_active mask. If cpusets are | 5688 | * Update cpusets according to cpu_active mask. If cpusets are |
@@ -7352,7 +5759,7 @@ int sched_cpu_activate(unsigned int cpu) | |||
7352 | * Put the rq online, if not already. This happens: | 5759 | * Put the rq online, if not already. This happens: |
7353 | * | 5760 | * |
7354 | * 1) In the early boot process, because we build the real domains | 5761 | * 1) In the early boot process, because we build the real domains |
7355 | * after all cpus have been brought up. | 5762 | * after all CPUs have been brought up. |
7356 | * | 5763 | * |
7357 | * 2) At runtime, if cpuset_cpu_active() fails to rebuild the | 5764 | * 2) At runtime, if cpuset_cpu_active() fails to rebuild the |
7358 | * domains. | 5765 | * domains. |
@@ -7467,7 +5874,7 @@ void __init sched_init_smp(void) | |||
7467 | 5874 | ||
7468 | /* | 5875 | /* |
7469 | * There's no userspace yet to cause hotplug operations; hence all the | 5876 | * There's no userspace yet to cause hotplug operations; hence all the |
7470 | * cpu masks are stable and all blatant races in the below code cannot | 5877 | * CPU masks are stable and all blatant races in the below code cannot |
7471 | * happen. | 5878 | * happen. |
7472 | */ | 5879 | */ |
7473 | mutex_lock(&sched_domains_mutex); | 5880 | mutex_lock(&sched_domains_mutex); |
@@ -7487,6 +5894,7 @@ void __init sched_init_smp(void) | |||
7487 | init_sched_dl_class(); | 5894 | init_sched_dl_class(); |
7488 | 5895 | ||
7489 | sched_init_smt(); | 5896 | sched_init_smt(); |
5897 | sched_clock_init_late(); | ||
7490 | 5898 | ||
7491 | sched_smp_initialized = true; | 5899 | sched_smp_initialized = true; |
7492 | } | 5900 | } |
@@ -7502,6 +5910,7 @@ early_initcall(migration_init); | |||
7502 | void __init sched_init_smp(void) | 5910 | void __init sched_init_smp(void) |
7503 | { | 5911 | { |
7504 | sched_init_granularity(); | 5912 | sched_init_granularity(); |
5913 | sched_clock_init_late(); | ||
7505 | } | 5914 | } |
7506 | #endif /* CONFIG_SMP */ | 5915 | #endif /* CONFIG_SMP */ |
7507 | 5916 | ||
@@ -7545,6 +5954,8 @@ void __init sched_init(void) | |||
7545 | int i, j; | 5954 | int i, j; |
7546 | unsigned long alloc_size = 0, ptr; | 5955 | unsigned long alloc_size = 0, ptr; |
7547 | 5956 | ||
5957 | sched_clock_init(); | ||
5958 | |||
7548 | for (i = 0; i < WAIT_TABLE_SIZE; i++) | 5959 | for (i = 0; i < WAIT_TABLE_SIZE; i++) |
7549 | init_waitqueue_head(bit_wait_table + i); | 5960 | init_waitqueue_head(bit_wait_table + i); |
7550 | 5961 | ||
@@ -7583,10 +5994,8 @@ void __init sched_init(void) | |||
7583 | } | 5994 | } |
7584 | #endif /* CONFIG_CPUMASK_OFFSTACK */ | 5995 | #endif /* CONFIG_CPUMASK_OFFSTACK */ |
7585 | 5996 | ||
7586 | init_rt_bandwidth(&def_rt_bandwidth, | 5997 | init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); |
7587 | global_rt_period(), global_rt_runtime()); | 5998 | init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime()); |
7588 | init_dl_bandwidth(&def_dl_bandwidth, | ||
7589 | global_rt_period(), global_rt_runtime()); | ||
7590 | 5999 | ||
7591 | #ifdef CONFIG_SMP | 6000 | #ifdef CONFIG_SMP |
7592 | init_defrootdomain(); | 6001 | init_defrootdomain(); |
@@ -7622,18 +6031,18 @@ void __init sched_init(void) | |||
7622 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 6031 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
7623 | rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; | 6032 | rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; |
7624 | /* | 6033 | /* |
7625 | * How much cpu bandwidth does root_task_group get? | 6034 | * How much CPU bandwidth does root_task_group get? |
7626 | * | 6035 | * |
7627 | * In case of task-groups formed thr' the cgroup filesystem, it | 6036 | * In case of task-groups formed thr' the cgroup filesystem, it |
7628 | * gets 100% of the cpu resources in the system. This overall | 6037 | * gets 100% of the CPU resources in the system. This overall |
7629 | * system cpu resource is divided among the tasks of | 6038 | * system CPU resource is divided among the tasks of |
7630 | * root_task_group and its child task-groups in a fair manner, | 6039 | * root_task_group and its child task-groups in a fair manner, |
7631 | * based on each entity's (task or task-group's) weight | 6040 | * based on each entity's (task or task-group's) weight |
7632 | * (se->load.weight). | 6041 | * (se->load.weight). |
7633 | * | 6042 | * |
7634 | * In other words, if root_task_group has 10 tasks of weight | 6043 | * In other words, if root_task_group has 10 tasks of weight |
7635 | * 1024) and two child groups A0 and A1 (of weight 1024 each), | 6044 | * 1024) and two child groups A0 and A1 (of weight 1024 each), |
7636 | * then A0's share of the cpu resource is: | 6045 | * then A0's share of the CPU resource is: |
7637 | * | 6046 | * |
7638 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% | 6047 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% |
7639 | * | 6048 | * |
@@ -7742,10 +6151,14 @@ EXPORT_SYMBOL(__might_sleep); | |||
7742 | 6151 | ||
7743 | void ___might_sleep(const char *file, int line, int preempt_offset) | 6152 | void ___might_sleep(const char *file, int line, int preempt_offset) |
7744 | { | 6153 | { |
7745 | static unsigned long prev_jiffy; /* ratelimiting */ | 6154 | /* Ratelimiting timestamp: */ |
6155 | static unsigned long prev_jiffy; | ||
6156 | |||
7746 | unsigned long preempt_disable_ip; | 6157 | unsigned long preempt_disable_ip; |
7747 | 6158 | ||
7748 | rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ | 6159 | /* WARN_ON_ONCE() by default, no rate limit required: */ |
6160 | rcu_sleep_check(); | ||
6161 | |||
7749 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && | 6162 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && |
7750 | !is_idle_task(current)) || | 6163 | !is_idle_task(current)) || |
7751 | system_state != SYSTEM_RUNNING || oops_in_progress) | 6164 | system_state != SYSTEM_RUNNING || oops_in_progress) |
@@ -7754,7 +6167,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset) | |||
7754 | return; | 6167 | return; |
7755 | prev_jiffy = jiffies; | 6168 | prev_jiffy = jiffies; |
7756 | 6169 | ||
7757 | /* Save this before calling printk(), since that will clobber it */ | 6170 | /* Save this before calling printk(), since that will clobber it: */ |
7758 | preempt_disable_ip = get_preempt_disable_ip(current); | 6171 | preempt_disable_ip = get_preempt_disable_ip(current); |
7759 | 6172 | ||
7760 | printk(KERN_ERR | 6173 | printk(KERN_ERR |
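
The "Ratelimiting timestamp" above is the usual once-per-interval warning idiom: remember when the last report went out and silently drop anything that arrives too soon afterwards (the actual comparison against jiffies sits in lines this hunk elides). Below is a user-space sketch of the same shape, using whole seconds instead of jiffies; report_ratelimited() is a local helper invented for the example, not a kernel API.

#include <stdio.h>
#include <time.h>

/* Print at most one report per second. */
static void report_ratelimited(const char *msg)
{
	static time_t prev;		/* ratelimiting timestamp */
	time_t now = time(NULL);

	if (prev && now < prev + 1)
		return;			/* inside the window: drop it */
	prev = now;
	fprintf(stderr, "%s\n", msg);
}

int main(void)
{
	for (int i = 0; i < 1000; i++)
		report_ratelimited("sleeping function called from invalid context");
	return 0;	/* only the first call in each second prints */
}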
@@ -7833,7 +6246,7 @@ void normalize_rt_tasks(void) | |||
7833 | */ | 6246 | */ |
7834 | 6247 | ||
7835 | /** | 6248 | /** |
7836 | * curr_task - return the current task for a given cpu. | 6249 | * curr_task - return the current task for a given CPU. |
7837 | * @cpu: the processor in question. | 6250 | * @cpu: the processor in question. |
7838 | * | 6251 | * |
7839 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 6252 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
@@ -7849,13 +6262,13 @@ struct task_struct *curr_task(int cpu) | |||
7849 | 6262 | ||
7850 | #ifdef CONFIG_IA64 | 6263 | #ifdef CONFIG_IA64 |
7851 | /** | 6264 | /** |
7852 | * set_curr_task - set the current task for a given cpu. | 6265 | * set_curr_task - set the current task for a given CPU. |
7853 | * @cpu: the processor in question. | 6266 | * @cpu: the processor in question. |
7854 | * @p: the task pointer to set. | 6267 | * @p: the task pointer to set. |
7855 | * | 6268 | * |
7856 | * Description: This function must only be used when non-maskable interrupts | 6269 | * Description: This function must only be used when non-maskable interrupts |
7857 | * are serviced on a separate stack. It allows the architecture to switch the | 6270 | * are serviced on a separate stack. It allows the architecture to switch the |
7858 | * notion of the current task on a cpu in a non-blocking manner. This function | 6271 | * notion of the current task on a CPU in a non-blocking manner. This function |
7859 | * must be called with all CPU's synchronized, and interrupts disabled, the | 6272 | * must be called with all CPU's synchronized, and interrupts disabled, the |
7860 | * and caller must save the original value of the current task (see | 6273 | * and caller must save the original value of the current task (see |
7861 | * curr_task() above) and restore that value before reenabling interrupts and | 6274 | * curr_task() above) and restore that value before reenabling interrupts and |
@@ -7911,7 +6324,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) | |||
7911 | spin_lock_irqsave(&task_group_lock, flags); | 6324 | spin_lock_irqsave(&task_group_lock, flags); |
7912 | list_add_rcu(&tg->list, &task_groups); | 6325 | list_add_rcu(&tg->list, &task_groups); |
7913 | 6326 | ||
7914 | WARN_ON(!parent); /* root should already exist */ | 6327 | /* Root should already exist: */ |
6328 | WARN_ON(!parent); | ||
7915 | 6329 | ||
7916 | tg->parent = parent; | 6330 | tg->parent = parent; |
7917 | INIT_LIST_HEAD(&tg->children); | 6331 | INIT_LIST_HEAD(&tg->children); |
@@ -7924,13 +6338,13 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) | |||
7924 | /* rcu callback to free various structures associated with a task group */ | 6338 | /* rcu callback to free various structures associated with a task group */ |
7925 | static void sched_free_group_rcu(struct rcu_head *rhp) | 6339 | static void sched_free_group_rcu(struct rcu_head *rhp) |
7926 | { | 6340 | { |
7927 | /* now it should be safe to free those cfs_rqs */ | 6341 | /* Now it should be safe to free those cfs_rqs: */ |
7928 | sched_free_group(container_of(rhp, struct task_group, rcu)); | 6342 | sched_free_group(container_of(rhp, struct task_group, rcu)); |
7929 | } | 6343 | } |
7930 | 6344 | ||
7931 | void sched_destroy_group(struct task_group *tg) | 6345 | void sched_destroy_group(struct task_group *tg) |
7932 | { | 6346 | { |
7933 | /* wait for possible concurrent references to cfs_rqs complete */ | 6347 | /* Wait for possible concurrent references to cfs_rqs complete: */ |
7934 | call_rcu(&tg->rcu, sched_free_group_rcu); | 6348 | call_rcu(&tg->rcu, sched_free_group_rcu); |
7935 | } | 6349 | } |
7936 | 6350 | ||
@@ -7938,7 +6352,7 @@ void sched_offline_group(struct task_group *tg) | |||
7938 | { | 6352 | { |
7939 | unsigned long flags; | 6353 | unsigned long flags; |
7940 | 6354 | ||
7941 | /* end participation in shares distribution */ | 6355 | /* End participation in shares distribution: */ |
7942 | unregister_fair_sched_group(tg); | 6356 | unregister_fair_sched_group(tg); |
7943 | 6357 | ||
7944 | spin_lock_irqsave(&task_group_lock, flags); | 6358 | spin_lock_irqsave(&task_group_lock, flags); |
@@ -7983,20 +6397,21 @@ void sched_move_task(struct task_struct *tsk) | |||
7983 | struct rq *rq; | 6397 | struct rq *rq; |
7984 | 6398 | ||
7985 | rq = task_rq_lock(tsk, &rf); | 6399 | rq = task_rq_lock(tsk, &rf); |
6400 | update_rq_clock(rq); | ||
7986 | 6401 | ||
7987 | running = task_current(rq, tsk); | 6402 | running = task_current(rq, tsk); |
7988 | queued = task_on_rq_queued(tsk); | 6403 | queued = task_on_rq_queued(tsk); |
7989 | 6404 | ||
7990 | if (queued) | 6405 | if (queued) |
7991 | dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); | 6406 | dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); |
7992 | if (unlikely(running)) | 6407 | if (running) |
7993 | put_prev_task(rq, tsk); | 6408 | put_prev_task(rq, tsk); |
7994 | 6409 | ||
7995 | sched_change_group(tsk, TASK_MOVE_GROUP); | 6410 | sched_change_group(tsk, TASK_MOVE_GROUP); |
7996 | 6411 | ||
7997 | if (queued) | 6412 | if (queued) |
7998 | enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); | 6413 | enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); |
7999 | if (unlikely(running)) | 6414 | if (running) |
8000 | set_curr_task(rq, tsk); | 6415 | set_curr_task(rq, tsk); |
8001 | 6416 | ||
8002 | task_rq_unlock(rq, tsk, &rf); | 6417 | task_rq_unlock(rq, tsk, &rf); |
@@ -8366,11 +6781,14 @@ int sched_rr_handler(struct ctl_table *table, int write, | |||
8366 | 6781 | ||
8367 | mutex_lock(&mutex); | 6782 | mutex_lock(&mutex); |
8368 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | 6783 | ret = proc_dointvec(table, write, buffer, lenp, ppos); |
8369 | /* make sure that internally we keep jiffies */ | 6784 | /* |
8370 | /* also, writing zero resets timeslice to default */ | 6785 | * Make sure that internally we keep jiffies. |
6786 | * Also, writing zero resets the timeslice to default: | ||
6787 | */ | ||
8371 | if (!ret && write) { | 6788 | if (!ret && write) { |
8372 | sched_rr_timeslice = sched_rr_timeslice <= 0 ? | 6789 | sched_rr_timeslice = |
8373 | RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); | 6790 | sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE : |
6791 | msecs_to_jiffies(sysctl_sched_rr_timeslice); | ||
8374 | } | 6792 | } |
8375 | mutex_unlock(&mutex); | 6793 | mutex_unlock(&mutex); |
8376 | return ret; | 6794 | return ret; |
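
sched_rr_handler() keeps the timeslice in jiffies internally while the sysctl is expressed in milliseconds, and a zero or negative write resets it to RR_TIMESLICE. The sketch below shows that arithmetic with HZ assumed to be 250 purely to get concrete numbers; msecs_to_jiffies_approx() is a simplification of the real msecs_to_jiffies() helper.

#include <stdio.h>

#define HZ 250				/* assumed tick rate for this example */
#define RR_TIMESLICE (100 * HZ / 1000)	/* default: 100 ms worth of jiffies */

static long msecs_to_jiffies_approx(long ms)
{
	return ms * HZ / 1000;		/* ignores rounding and overflow */
}

/* Writing <= 0 restores the default; otherwise convert ms to jiffies. */
static long rr_timeslice_from_sysctl(long sysctl_ms)
{
	return sysctl_ms <= 0 ? RR_TIMESLICE
			      : msecs_to_jiffies_approx(sysctl_ms);
}

int main(void)
{
	printf("%ld\n", rr_timeslice_from_sysctl(0));	/* 25 jiffies = 100 ms */
	printf("%ld\n", rr_timeslice_from_sysctl(30));	/* 7 jiffies ~= 30 ms  */
	return 0;
}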
@@ -8431,6 +6849,7 @@ static void cpu_cgroup_fork(struct task_struct *task) | |||
8431 | 6849 | ||
8432 | rq = task_rq_lock(task, &rf); | 6850 | rq = task_rq_lock(task, &rf); |
8433 | 6851 | ||
6852 | update_rq_clock(rq); | ||
8434 | sched_change_group(task, TASK_SET_GROUP); | 6853 | sched_change_group(task, TASK_SET_GROUP); |
8435 | 6854 | ||
8436 | task_rq_unlock(rq, task, &rf); | 6855 | task_rq_unlock(rq, task, &rf); |
@@ -8550,9 +6969,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | |||
8550 | cfs_b->quota = quota; | 6969 | cfs_b->quota = quota; |
8551 | 6970 | ||
8552 | __refill_cfs_bandwidth_runtime(cfs_b); | 6971 | __refill_cfs_bandwidth_runtime(cfs_b); |
8553 | /* restart the period timer (if active) to handle new period expiry */ | 6972 | |
6973 | /* Restart the period timer (if active) to handle new period expiry: */ | ||
8554 | if (runtime_enabled) | 6974 | if (runtime_enabled) |
8555 | start_cfs_bandwidth(cfs_b); | 6975 | start_cfs_bandwidth(cfs_b); |
6976 | |||
8556 | raw_spin_unlock_irq(&cfs_b->lock); | 6977 | raw_spin_unlock_irq(&cfs_b->lock); |
8557 | 6978 | ||
8558 | for_each_online_cpu(i) { | 6979 | for_each_online_cpu(i) { |
@@ -8690,8 +7111,8 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) | |||
8690 | parent_quota = parent_b->hierarchical_quota; | 7111 | parent_quota = parent_b->hierarchical_quota; |
8691 | 7112 | ||
8692 | /* | 7113 | /* |
8693 | * ensure max(child_quota) <= parent_quota, inherit when no | 7114 | * Ensure max(child_quota) <= parent_quota, inherit when no |
8694 | * limit is set | 7115 | * limit is set: |
8695 | */ | 7116 | */ |
8696 | if (quota == RUNTIME_INF) | 7117 | if (quota == RUNTIME_INF) |
8697 | quota = parent_quota; | 7118 | quota = parent_quota; |
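
Concretely, the rule in the comment above means a child group with no quota of its own inherits the parent's hierarchical quota, while an explicit child quota may not exceed a finite parent quota. Here is a tiny sketch with made-up numbers; where the kernel returns -EINVAL this simply returns 0.

#include <stdio.h>

#define RUNTIME_INF (~0ULL)

/* Effective (hierarchical) quota for one child, or 0 if it is invalid. */
static unsigned long long child_quota(unsigned long long parent_quota,
				      unsigned long long quota)
{
	if (quota == RUNTIME_INF)
		return parent_quota;	/* no limit set: inherit */
	if (parent_quota != RUNTIME_INF && quota > parent_quota)
		return 0;		/* would exceed the parent: reject */
	return quota;
}

int main(void)
{
	/* Parent allows 200000 us per period; children ask for more or less. */
	printf("%llu\n", child_quota(200000, RUNTIME_INF));	/* 200000 (inherited) */
	printf("%llu\n", child_quota(200000, 50000));		/* 50000              */
	printf("%llu\n", child_quota(200000, 300000));		/* 0 (rejected)       */
	return 0;
}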
@@ -8800,7 +7221,7 @@ static struct cftype cpu_files[] = { | |||
8800 | .write_u64 = cpu_rt_period_write_uint, | 7221 | .write_u64 = cpu_rt_period_write_uint, |
8801 | }, | 7222 | }, |
8802 | #endif | 7223 | #endif |
8803 | { } /* terminate */ | 7224 | { } /* Terminate */ |
8804 | }; | 7225 | }; |
8805 | 7226 | ||
8806 | struct cgroup_subsys cpu_cgrp_subsys = { | 7227 | struct cgroup_subsys cpu_cgrp_subsys = { |