author     Linus Torvalds <torvalds@linux-foundation.org>   2016-01-11 18:13:38 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2016-01-11 18:13:38 -0500
commit     af345201ea948d0976d775958d8aa22fe5e5ba58
tree       2badae3f02ff9415c86a2188b0b5d565dc257a6c
parent     4bd20db2c027eab7490e3c0466734738bef2dd24
parent     0905f04eb21fc1c2e690bed5d0418a061d56c225
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"The main changes in this cycle were:
- tickless load average calculation enhancements (Byungchul Park)
- vtime handling enhancements (Frederic Weisbecker)
- scalability improvement via properly aligning a key structure field
(Jiri Olsa)
- various stop_machine() fixes (Oleg Nesterov)
- sched/numa enhancement (Rik van Riel)
- various fixes and improvements (Andi Kleen, Dietmar Eggemann,
Geliang Tang, Hiroshi Shimamoto, Joonwoo Park, Peter Zijlstra,
Waiman Long, Wanpeng Li, Yuyang Du)"
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits)
sched/fair: Fix new task's load avg removed from source CPU in wake_up_new_task()
sched/core: Move sched_entity::avg into separate cache line
x86/fpu: Properly align size in CHECK_MEMBER_AT_END_OF() macro
sched/deadline: Fix the earliest_dl.next logic
sched/fair: Disable the task group load_avg update for the root_task_group
sched/fair: Move the cache-hot 'load_avg' variable into its own cacheline
sched/fair: Avoid redundant idle_cpu() call in update_sg_lb_stats()
sched/core: Move the sched_to_prio[] arrays out of line
sched/cputime: Convert vtime_seqlock to seqcount
sched/cputime: Introduce vtime accounting check for readers
sched/cputime: Rename vtime_accounting_enabled() to vtime_accounting_cpu_enabled()
sched/cputime: Correctly handle task guest time on housekeepers
sched/cputime: Clarify vtime symbols and document them
sched/cputime: Remove extra cost in task_cputime()
sched/fair: Make it possible to account fair load avg consistently
sched/fair: Modify the comment about lock assumptions in migrate_task_rq_fair()
stop_machine: Clean up the usage of the preemption counter in cpu_stopper_thread()
stop_machine: Shift the 'done != NULL' check from cpu_stop_signal_done() to callers
stop_machine: Kill cpu_stop_done->executed
stop_machine: Change __stop_cpus() to rely on cpu_stop_queue_work()
...
-rw-r--r--  arch/x86/kernel/fpu/init.c        |  13
-rw-r--r--  include/linux/context_tracking.h  |   4
-rw-r--r--  include/linux/init_task.h         |   2
-rw-r--r--  include/linux/sched.h             |  20
-rw-r--r--  include/linux/stop_machine.h      |   7
-rw-r--r--  include/linux/vtime.h             |  25
-rw-r--r--  include/linux/wait.h              |  30
-rw-r--r--  kernel/fork.c                     |   4
-rw-r--r--  kernel/sched/auto_group.c         |   2
-rw-r--r--  kernel/sched/core.c               |  76
-rw-r--r--  kernel/sched/cputime.c            |  74
-rw-r--r--  kernel/sched/deadline.c           |  59
-rw-r--r--  kernel/sched/fair.c               | 312
-rw-r--r--  kernel/sched/idle_task.c          |   1
-rw-r--r--  kernel/sched/sched.h              |  68
-rw-r--r--  kernel/stop_machine.c             |  84
-rw-r--r--  kernel/time/tick-sched.c          |  10
17 files changed, 485 insertions(+), 306 deletions(-)
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index be39b5fde4b9..8e839e7f5e2f 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -143,9 +143,18 @@ static void __init fpu__init_system_generic(void) | |||
143 | unsigned int xstate_size; | 143 | unsigned int xstate_size; |
144 | EXPORT_SYMBOL_GPL(xstate_size); | 144 | EXPORT_SYMBOL_GPL(xstate_size); |
145 | 145 | ||
146 | /* Enforce that 'MEMBER' is the last field of 'TYPE': */ | 146 | /* Get alignment of the TYPE. */ |
147 | #define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test) | ||
148 | |||
149 | /* | ||
150 | * Enforce that 'MEMBER' is the last field of 'TYPE'. | ||
151 | * | ||
152 | * Align the computed size with alignment of the TYPE, | ||
153 | * because that's how C aligns structs. | ||
154 | */ | ||
147 | #define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \ | 155 | #define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \ |
148 | BUILD_BUG_ON(sizeof(TYPE) != offsetofend(TYPE, MEMBER)) | 156 | BUILD_BUG_ON(sizeof(TYPE) != ALIGN(offsetofend(TYPE, MEMBER), \ |
157 | TYPE_ALIGN(TYPE))) | ||
149 | 158 | ||
150 | /* | 159 | /* |
151 | * We append the 'struct fpu' to the task_struct: | 160 | * We append the 'struct fpu' to the task_struct: |
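The TYPE_ALIGN() trick above works because the compiler must pad the anonymous struct so that 'test' starts on a properly aligned boundary after the leading char, which makes offsetof() of that member equal to the type's alignment; CHECK_MEMBER_AT_END_OF() then rounds offsetofend() up by that alignment to account for tail padding. A minimal user-space sketch of the same idea (offsetofend() and ALIGN() are re-implemented here purely for illustration; in the kernel they come from the common headers):

#include <stdio.h>
#include <stddef.h>

/* Alignment of TYPE, read off the padding inserted after a leading char. */
#define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test)

/* First byte past MEMBER inside TYPE. */
#define offsetofend(TYPE, MEMBER) \
        (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

/* Round x up to the next multiple of a (a is a power of two). */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((size_t)(a) - 1))

struct demo {
        int  a;         /* 4 bytes                          */
        char b;         /* 1 byte + 3 bytes of tail padding */
};

int main(void)
{
        /* sizeof(struct demo) == 8 while offsetofend(b) == 5; rounding up
         * by TYPE_ALIGN(struct demo) == 4 recovers the padded size, which
         * is exactly what CHECK_MEMBER_AT_END_OF() compares against. */
        printf("align=%zu end=%zu size=%zu aligned_end=%zu\n",
               TYPE_ALIGN(struct demo),
               offsetofend(struct demo, b),
               sizeof(struct demo),
               ALIGN(offsetofend(struct demo, b), TYPE_ALIGN(struct demo)));
        return 0;
}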
diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 68b575afe5f5..d259274238db 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -86,7 +86,7 @@ static inline void context_tracking_init(void) { } | |||
86 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | 86 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN |
87 | static inline void guest_enter(void) | 87 | static inline void guest_enter(void) |
88 | { | 88 | { |
89 | if (vtime_accounting_enabled()) | 89 | if (vtime_accounting_cpu_enabled()) |
90 | vtime_guest_enter(current); | 90 | vtime_guest_enter(current); |
91 | else | 91 | else |
92 | current->flags |= PF_VCPU; | 92 | current->flags |= PF_VCPU; |
@@ -100,7 +100,7 @@ static inline void guest_exit(void) | |||
100 | if (context_tracking_is_enabled()) | 100 | if (context_tracking_is_enabled()) |
101 | __context_tracking_exit(CONTEXT_GUEST); | 101 | __context_tracking_exit(CONTEXT_GUEST); |
102 | 102 | ||
103 | if (vtime_accounting_enabled()) | 103 | if (vtime_accounting_cpu_enabled()) |
104 | vtime_guest_exit(current); | 104 | vtime_guest_exit(current); |
105 | else | 105 | else |
106 | current->flags &= ~PF_VCPU; | 106 | current->flags &= ~PF_VCPU; |
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1c1ff7e4faa4..f2cb8d45513d 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -150,7 +150,7 @@ extern struct task_group root_task_group; | |||
150 | 150 | ||
151 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | 151 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN |
152 | # define INIT_VTIME(tsk) \ | 152 | # define INIT_VTIME(tsk) \ |
153 | .vtime_seqlock = __SEQLOCK_UNLOCKED(tsk.vtime_seqlock), \ | 153 | .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount), \ |
154 | .vtime_snap = 0, \ | 154 | .vtime_snap = 0, \ |
155 | .vtime_snap_whence = VTIME_SYS, | 155 | .vtime_snap_whence = VTIME_SYS, |
156 | #else | 156 | #else |
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fa39434e3fdd..0c0e78102850 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -177,9 +177,9 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); | |||
177 | extern void calc_global_load(unsigned long ticks); | 177 | extern void calc_global_load(unsigned long ticks); |
178 | 178 | ||
179 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) | 179 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) |
180 | extern void update_cpu_load_nohz(void); | 180 | extern void update_cpu_load_nohz(int active); |
181 | #else | 181 | #else |
182 | static inline void update_cpu_load_nohz(void) { } | 182 | static inline void update_cpu_load_nohz(int active) { } |
183 | #endif | 183 | #endif |
184 | 184 | ||
185 | extern unsigned long get_parent_ip(unsigned long addr); | 185 | extern unsigned long get_parent_ip(unsigned long addr); |
@@ -1268,8 +1268,13 @@ struct sched_entity { | |||
1268 | #endif | 1268 | #endif |
1269 | 1269 | ||
1270 | #ifdef CONFIG_SMP | 1270 | #ifdef CONFIG_SMP |
1271 | /* Per entity load average tracking */ | 1271 | /* |
1272 | struct sched_avg avg; | 1272 | * Per entity load average tracking. |
1273 | * | ||
1274 | * Put into separate cache line so it does not | ||
1275 | * collide with read-mostly values above. | ||
1276 | */ | ||
1277 | struct sched_avg avg ____cacheline_aligned_in_smp; | ||
1273 | #endif | 1278 | #endif |
1274 | }; | 1279 | }; |
1275 | 1280 | ||
@@ -1520,11 +1525,14 @@ struct task_struct { | |||
1520 | cputime_t gtime; | 1525 | cputime_t gtime; |
1521 | struct prev_cputime prev_cputime; | 1526 | struct prev_cputime prev_cputime; |
1522 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | 1527 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN |
1523 | seqlock_t vtime_seqlock; | 1528 | seqcount_t vtime_seqcount; |
1524 | unsigned long long vtime_snap; | 1529 | unsigned long long vtime_snap; |
1525 | enum { | 1530 | enum { |
1526 | VTIME_SLEEPING = 0, | 1531 | /* Task is sleeping or running in a CPU with VTIME inactive */ |
1532 | VTIME_INACTIVE = 0, | ||
1533 | /* Task runs in userspace in a CPU with VTIME active */ | ||
1527 | VTIME_USER, | 1534 | VTIME_USER, |
1535 | /* Task runs in kernelspace in a CPU with VTIME active */ | ||
1528 | VTIME_SYS, | 1536 | VTIME_SYS, |
1529 | } vtime_snap_whence; | 1537 | } vtime_snap_whence; |
1530 | #endif | 1538 | #endif |
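The ____cacheline_aligned_in_smp annotation above is the point of the "separate cache line" change: sched_entity::avg is written on nearly every load-tracking update, while the fields in front of it are read-mostly, so sharing a cache line causes needless invalidations on other CPUs. A hedged user-space sketch of the same layout idea using C11 alignas (the *_like structures and the 64-byte line size are illustrative assumptions, not the kernel's definitions):

#include <stdio.h>
#include <stddef.h>
#include <stdalign.h>
#include <stdint.h>

#define CACHELINE_SIZE 64       /* assumed line size; the kernel uses L1_CACHE_BYTES */

struct sched_avg_like {         /* frequently written load-tracking state */
        uint64_t      last_update_time;
        unsigned long load_avg;
        unsigned long util_avg;
};

struct sched_entity_like {
        /* read-mostly fields, shared cheaply between CPUs */
        unsigned long load_weight;
        uint64_t      exec_start;
        uint64_t      vruntime;

        /*
         * Hot, frequently written block: force it onto its own cache line
         * so stores here do not invalidate the line holding the fields above.
         */
        alignas(CACHELINE_SIZE) struct sched_avg_like avg;
};

int main(void)
{
        /* avg starts on a 64-byte boundary, away from the read-mostly fields */
        printf("offsetof(avg) = %zu\n", offsetof(struct sched_entity_like, avg));
        return 0;
}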
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 0e1b1540597a..3cc9632dcc2a 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -29,7 +29,7 @@ struct cpu_stop_work { | |||
29 | 29 | ||
30 | int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg); | 30 | int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg); |
31 | int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg); | 31 | int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg); |
32 | void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, | 32 | bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, |
33 | struct cpu_stop_work *work_buf); | 33 | struct cpu_stop_work *work_buf); |
34 | int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); | 34 | int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); |
35 | int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); | 35 | int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); |
@@ -65,7 +65,7 @@ static void stop_one_cpu_nowait_workfn(struct work_struct *work) | |||
65 | preempt_enable(); | 65 | preempt_enable(); |
66 | } | 66 | } |
67 | 67 | ||
68 | static inline void stop_one_cpu_nowait(unsigned int cpu, | 68 | static inline bool stop_one_cpu_nowait(unsigned int cpu, |
69 | cpu_stop_fn_t fn, void *arg, | 69 | cpu_stop_fn_t fn, void *arg, |
70 | struct cpu_stop_work *work_buf) | 70 | struct cpu_stop_work *work_buf) |
71 | { | 71 | { |
@@ -74,7 +74,10 @@ static inline void stop_one_cpu_nowait(unsigned int cpu, | |||
74 | work_buf->fn = fn; | 74 | work_buf->fn = fn; |
75 | work_buf->arg = arg; | 75 | work_buf->arg = arg; |
76 | schedule_work(&work_buf->work); | 76 | schedule_work(&work_buf->work); |
77 | return true; | ||
77 | } | 78 | } |
79 | |||
80 | return false; | ||
78 | } | 81 | } |
79 | 82 | ||
80 | static inline int stop_cpus(const struct cpumask *cpumask, | 83 | static inline int stop_cpus(const struct cpumask *cpumask, |
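The signature change above (void to bool) lets callers learn whether the work was actually handed to the target CPU's stopper; the !SMP inline shows the contract, returning false when nothing was queued. A hedged sketch of a caller reacting to that return value (kernel context; kick_cpu_stopper() and the message strings are made up for illustration):

#include <linux/stop_machine.h>
#include <linux/printk.h>

static struct cpu_stop_work demo_work;          /* illustrative only */

static void kick_cpu_stopper(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
{
        /* Only report the stopper as pending if the work really was queued. */
        if (stop_one_cpu_nowait(cpu, fn, arg, &demo_work))
                pr_debug("stopper work queued on CPU%u\n", cpu);
        else
                pr_debug("CPU%u stopper unavailable, work dropped\n", cpu);
}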
diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index c5165fd256f9..fa2196990f84 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -10,16 +10,27 @@ | |||
10 | struct task_struct; | 10 | struct task_struct; |
11 | 11 | ||
12 | /* | 12 | /* |
13 | * vtime_accounting_enabled() definitions/declarations | 13 | * vtime_accounting_cpu_enabled() definitions/declarations |
14 | */ | 14 | */ |
15 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | 15 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE |
16 | static inline bool vtime_accounting_enabled(void) { return true; } | 16 | static inline bool vtime_accounting_cpu_enabled(void) { return true; } |
17 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ | 17 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ |
18 | 18 | ||
19 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | 19 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN |
20 | /* | ||
21 | * Checks if vtime is enabled on some CPU. Cputime readers want to be careful | ||
22 | * in that case and compute the tickless cputime. | ||
23 | * For now vtime state is tied to context tracking. We might want to decouple | ||
24 | * those later if necessary. | ||
25 | */ | ||
20 | static inline bool vtime_accounting_enabled(void) | 26 | static inline bool vtime_accounting_enabled(void) |
21 | { | 27 | { |
22 | if (context_tracking_is_enabled()) { | 28 | return context_tracking_is_enabled(); |
29 | } | ||
30 | |||
31 | static inline bool vtime_accounting_cpu_enabled(void) | ||
32 | { | ||
33 | if (vtime_accounting_enabled()) { | ||
23 | if (context_tracking_cpu_is_enabled()) | 34 | if (context_tracking_cpu_is_enabled()) |
24 | return true; | 35 | return true; |
25 | } | 36 | } |
@@ -29,7 +40,7 @@ static inline bool vtime_accounting_enabled(void) | |||
29 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ | 40 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ |
30 | 41 | ||
31 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 42 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
32 | static inline bool vtime_accounting_enabled(void) { return false; } | 43 | static inline bool vtime_accounting_cpu_enabled(void) { return false; } |
33 | #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ | 44 | #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ |
34 | 45 | ||
35 | 46 | ||
@@ -44,7 +55,7 @@ extern void vtime_task_switch(struct task_struct *prev); | |||
44 | extern void vtime_common_task_switch(struct task_struct *prev); | 55 | extern void vtime_common_task_switch(struct task_struct *prev); |
45 | static inline void vtime_task_switch(struct task_struct *prev) | 56 | static inline void vtime_task_switch(struct task_struct *prev) |
46 | { | 57 | { |
47 | if (vtime_accounting_enabled()) | 58 | if (vtime_accounting_cpu_enabled()) |
48 | vtime_common_task_switch(prev); | 59 | vtime_common_task_switch(prev); |
49 | } | 60 | } |
50 | #endif /* __ARCH_HAS_VTIME_TASK_SWITCH */ | 61 | #endif /* __ARCH_HAS_VTIME_TASK_SWITCH */ |
@@ -59,7 +70,7 @@ extern void vtime_account_irq_enter(struct task_struct *tsk); | |||
59 | extern void vtime_common_account_irq_enter(struct task_struct *tsk); | 70 | extern void vtime_common_account_irq_enter(struct task_struct *tsk); |
60 | static inline void vtime_account_irq_enter(struct task_struct *tsk) | 71 | static inline void vtime_account_irq_enter(struct task_struct *tsk) |
61 | { | 72 | { |
62 | if (vtime_accounting_enabled()) | 73 | if (vtime_accounting_cpu_enabled()) |
63 | vtime_common_account_irq_enter(tsk); | 74 | vtime_common_account_irq_enter(tsk); |
64 | } | 75 | } |
65 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ | 76 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ |
@@ -78,7 +89,7 @@ extern void vtime_gen_account_irq_exit(struct task_struct *tsk); | |||
78 | 89 | ||
79 | static inline void vtime_account_irq_exit(struct task_struct *tsk) | 90 | static inline void vtime_account_irq_exit(struct task_struct *tsk) |
80 | { | 91 | { |
81 | if (vtime_accounting_enabled()) | 92 | if (vtime_accounting_cpu_enabled()) |
82 | vtime_gen_account_irq_exit(tsk); | 93 | vtime_gen_account_irq_exit(tsk); |
83 | } | 94 | } |
84 | 95 | ||
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 513b36f04dfd..d2f4ec7dba7c 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -102,6 +102,36 @@ init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func) | |||
102 | q->func = func; | 102 | q->func = func; |
103 | } | 103 | } |
104 | 104 | ||
105 | /** | ||
106 | * waitqueue_active -- locklessly test for waiters on the queue | ||
107 | * @q: the waitqueue to test for waiters | ||
108 | * | ||
109 | * returns true if the wait list is not empty | ||
110 | * | ||
111 | * NOTE: this function is lockless and requires care, incorrect usage _will_ | ||
112 | * lead to sporadic and non-obvious failure. | ||
113 | * | ||
114 | * Use either while holding wait_queue_head_t::lock or when used for wakeups | ||
115 | * with an extra smp_mb() like: | ||
116 | * | ||
117 | * CPU0 - waker CPU1 - waiter | ||
118 | * | ||
119 | * for (;;) { | ||
120 | * @cond = true; prepare_to_wait(&wq, &wait, state); | ||
121 | * smp_mb(); // smp_mb() from set_current_state() | ||
122 | * if (waitqueue_active(wq)) if (@cond) | ||
123 | * wake_up(wq); break; | ||
124 | * schedule(); | ||
125 | * } | ||
126 | * finish_wait(&wq, &wait); | ||
127 | * | ||
128 | * Because without the explicit smp_mb() it's possible for the | ||
129 | * waitqueue_active() load to get hoisted over the @cond store such that we'll | ||
130 | * observe an empty wait list while the waiter might not observe @cond. | ||
131 | * | ||
132 | * Also note that this 'optimization' trades a spin_lock() for an smp_mb(), | ||
133 | * which (when the lock is uncontended) are of roughly equal cost. | ||
134 | */ | ||
105 | static inline int waitqueue_active(wait_queue_head_t *q) | 135 | static inline int waitqueue_active(wait_queue_head_t *q) |
106 | { | 136 | { |
107 | return !list_empty(&q->task_list); | 137 | return !list_empty(&q->task_list); |
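The new kerneldoc above is easiest to read next to a concrete pairing. A hedged kernel-context sketch of the pattern it prescribes (demo_wq and demo_cond are illustrative names): the waker publishes the condition and issues the full barrier before the lockless waitqueue_active() check, while the waiter relies on the barrier in set_current_state() that wait_event() performs internally.

#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static bool demo_cond;

/* Waker side: store the condition, then order it against the lockless check. */
static void demo_wake(void)
{
        demo_cond = true;
        smp_mb();       /* pairs with set_current_state() inside wait_event() */
        if (waitqueue_active(&demo_wq))
                wake_up(&demo_wq);
}

/* Waiter side: prepare_to_wait()/set_current_state() provide the barrier here. */
static void demo_wait(void)
{
        wait_event(demo_wq, demo_cond);
}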
diff --git a/kernel/fork.c b/kernel/fork.c
index 1155eac61687..291b08cc817b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1349,9 +1349,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1349 | prev_cputime_init(&p->prev_cputime); | 1349 | prev_cputime_init(&p->prev_cputime); |
1350 | 1350 | ||
1351 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | 1351 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN |
1352 | seqlock_init(&p->vtime_seqlock); | 1352 | seqcount_init(&p->vtime_seqcount); |
1353 | p->vtime_snap = 0; | 1353 | p->vtime_snap = 0; |
1354 | p->vtime_snap_whence = VTIME_SLEEPING; | 1354 | p->vtime_snap_whence = VTIME_INACTIVE; |
1355 | #endif | 1355 | #endif |
1356 | 1356 | ||
1357 | #if defined(SPLIT_RSS_COUNTING) | 1357 | #if defined(SPLIT_RSS_COUNTING) |
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 750ed601ddf7..a5d966cb8891 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -212,7 +212,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) | |||
212 | ag = autogroup_task_get(p); | 212 | ag = autogroup_task_get(p); |
213 | 213 | ||
214 | down_write(&ag->lock); | 214 | down_write(&ag->lock); |
215 | err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]); | 215 | err = sched_group_set_shares(ag->tg, sched_prio_to_weight[nice + 20]); |
216 | if (!err) | 216 | if (!err) |
217 | ag->nice = nice; | 217 | ag->nice = nice; |
218 | up_write(&ag->lock); | 218 | up_write(&ag->lock); |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 34cb9f7fc2d2..77d97a6fc715 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -731,7 +731,7 @@ bool sched_can_stop_tick(void) | |||
731 | if (current->policy == SCHED_RR) { | 731 | if (current->policy == SCHED_RR) { |
732 | struct sched_rt_entity *rt_se = ¤t->rt; | 732 | struct sched_rt_entity *rt_se = ¤t->rt; |
733 | 733 | ||
734 | return rt_se->run_list.prev == rt_se->run_list.next; | 734 | return list_is_singular(&rt_se->run_list); |
735 | } | 735 | } |
736 | 736 | ||
737 | /* | 737 | /* |
@@ -823,8 +823,8 @@ static void set_load_weight(struct task_struct *p) | |||
823 | return; | 823 | return; |
824 | } | 824 | } |
825 | 825 | ||
826 | load->weight = scale_load(prio_to_weight[prio]); | 826 | load->weight = scale_load(sched_prio_to_weight[prio]); |
827 | load->inv_weight = prio_to_wmult[prio]; | 827 | load->inv_weight = sched_prio_to_wmult[prio]; |
828 | } | 828 | } |
829 | 829 | ||
830 | static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | 830 | static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) |
@@ -1071,8 +1071,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new | |||
1071 | { | 1071 | { |
1072 | lockdep_assert_held(&rq->lock); | 1072 | lockdep_assert_held(&rq->lock); |
1073 | 1073 | ||
1074 | dequeue_task(rq, p, 0); | ||
1075 | p->on_rq = TASK_ON_RQ_MIGRATING; | 1074 | p->on_rq = TASK_ON_RQ_MIGRATING; |
1075 | dequeue_task(rq, p, 0); | ||
1076 | set_task_cpu(p, new_cpu); | 1076 | set_task_cpu(p, new_cpu); |
1077 | raw_spin_unlock(&rq->lock); | 1077 | raw_spin_unlock(&rq->lock); |
1078 | 1078 | ||
@@ -1080,8 +1080,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new | |||
1080 | 1080 | ||
1081 | raw_spin_lock(&rq->lock); | 1081 | raw_spin_lock(&rq->lock); |
1082 | BUG_ON(task_cpu(p) != new_cpu); | 1082 | BUG_ON(task_cpu(p) != new_cpu); |
1083 | p->on_rq = TASK_ON_RQ_QUEUED; | ||
1084 | enqueue_task(rq, p, 0); | 1083 | enqueue_task(rq, p, 0); |
1084 | p->on_rq = TASK_ON_RQ_QUEUED; | ||
1085 | check_preempt_curr(rq, p, 0); | 1085 | check_preempt_curr(rq, p, 0); |
1086 | 1086 | ||
1087 | return rq; | 1087 | return rq; |
@@ -1274,6 +1274,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
1274 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && | 1274 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && |
1275 | !p->on_rq); | 1275 | !p->on_rq); |
1276 | 1276 | ||
1277 | /* | ||
1278 | * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING, | ||
1279 | * because schedstat_wait_{start,end} rebase migrating task's wait_start | ||
1280 | * time relying on p->on_rq. | ||
1281 | */ | ||
1282 | WARN_ON_ONCE(p->state == TASK_RUNNING && | ||
1283 | p->sched_class == &fair_sched_class && | ||
1284 | (p->on_rq && !task_on_rq_migrating(p))); | ||
1285 | |||
1277 | #ifdef CONFIG_LOCKDEP | 1286 | #ifdef CONFIG_LOCKDEP |
1278 | /* | 1287 | /* |
1279 | * The caller should hold either p->pi_lock or rq->lock, when changing | 1288 | * The caller should hold either p->pi_lock or rq->lock, when changing |
@@ -1310,9 +1319,11 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) | |||
1310 | src_rq = task_rq(p); | 1319 | src_rq = task_rq(p); |
1311 | dst_rq = cpu_rq(cpu); | 1320 | dst_rq = cpu_rq(cpu); |
1312 | 1321 | ||
1322 | p->on_rq = TASK_ON_RQ_MIGRATING; | ||
1313 | deactivate_task(src_rq, p, 0); | 1323 | deactivate_task(src_rq, p, 0); |
1314 | set_task_cpu(p, cpu); | 1324 | set_task_cpu(p, cpu); |
1315 | activate_task(dst_rq, p, 0); | 1325 | activate_task(dst_rq, p, 0); |
1326 | p->on_rq = TASK_ON_RQ_QUEUED; | ||
1316 | check_preempt_curr(dst_rq, p, 0); | 1327 | check_preempt_curr(dst_rq, p, 0); |
1317 | } else { | 1328 | } else { |
1318 | /* | 1329 | /* |
@@ -2194,6 +2205,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
2194 | p->se.vruntime = 0; | 2205 | p->se.vruntime = 0; |
2195 | INIT_LIST_HEAD(&p->se.group_node); | 2206 | INIT_LIST_HEAD(&p->se.group_node); |
2196 | 2207 | ||
2208 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
2209 | p->se.cfs_rq = NULL; | ||
2210 | #endif | ||
2211 | |||
2197 | #ifdef CONFIG_SCHEDSTATS | 2212 | #ifdef CONFIG_SCHEDSTATS |
2198 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 2213 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
2199 | #endif | 2214 | #endif |
@@ -7442,6 +7457,9 @@ int in_sched_functions(unsigned long addr) | |||
7442 | */ | 7457 | */ |
7443 | struct task_group root_task_group; | 7458 | struct task_group root_task_group; |
7444 | LIST_HEAD(task_groups); | 7459 | LIST_HEAD(task_groups); |
7460 | |||
7461 | /* Cacheline aligned slab cache for task_group */ | ||
7462 | static struct kmem_cache *task_group_cache __read_mostly; | ||
7445 | #endif | 7463 | #endif |
7446 | 7464 | ||
7447 | DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); | 7465 | DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); |
@@ -7499,11 +7517,12 @@ void __init sched_init(void) | |||
7499 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7517 | #endif /* CONFIG_RT_GROUP_SCHED */ |
7500 | 7518 | ||
7501 | #ifdef CONFIG_CGROUP_SCHED | 7519 | #ifdef CONFIG_CGROUP_SCHED |
7520 | task_group_cache = KMEM_CACHE(task_group, 0); | ||
7521 | |||
7502 | list_add(&root_task_group.list, &task_groups); | 7522 | list_add(&root_task_group.list, &task_groups); |
7503 | INIT_LIST_HEAD(&root_task_group.children); | 7523 | INIT_LIST_HEAD(&root_task_group.children); |
7504 | INIT_LIST_HEAD(&root_task_group.siblings); | 7524 | INIT_LIST_HEAD(&root_task_group.siblings); |
7505 | autogroup_init(&init_task); | 7525 | autogroup_init(&init_task); |
7506 | |||
7507 | #endif /* CONFIG_CGROUP_SCHED */ | 7526 | #endif /* CONFIG_CGROUP_SCHED */ |
7508 | 7527 | ||
7509 | for_each_possible_cpu(i) { | 7528 | for_each_possible_cpu(i) { |
@@ -7784,7 +7803,7 @@ static void free_sched_group(struct task_group *tg) | |||
7784 | free_fair_sched_group(tg); | 7803 | free_fair_sched_group(tg); |
7785 | free_rt_sched_group(tg); | 7804 | free_rt_sched_group(tg); |
7786 | autogroup_free(tg); | 7805 | autogroup_free(tg); |
7787 | kfree(tg); | 7806 | kmem_cache_free(task_group_cache, tg); |
7788 | } | 7807 | } |
7789 | 7808 | ||
7790 | /* allocate runqueue etc for a new task group */ | 7809 | /* allocate runqueue etc for a new task group */ |
@@ -7792,7 +7811,7 @@ struct task_group *sched_create_group(struct task_group *parent) | |||
7792 | { | 7811 | { |
7793 | struct task_group *tg; | 7812 | struct task_group *tg; |
7794 | 7813 | ||
7795 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); | 7814 | tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); |
7796 | if (!tg) | 7815 | if (!tg) |
7797 | return ERR_PTR(-ENOMEM); | 7816 | return ERR_PTR(-ENOMEM); |
7798 | 7817 | ||
@@ -8697,3 +8716,44 @@ void dump_cpu_task(int cpu) | |||
8697 | pr_info("Task dump for CPU %d:\n", cpu); | 8716 | pr_info("Task dump for CPU %d:\n", cpu); |
8698 | sched_show_task(cpu_curr(cpu)); | 8717 | sched_show_task(cpu_curr(cpu)); |
8699 | } | 8718 | } |
8719 | |||
8720 | /* | ||
8721 | * Nice levels are multiplicative, with a gentle 10% change for every | ||
8722 | * nice level changed. I.e. when a CPU-bound task goes from nice 0 to | ||
8723 | * nice 1, it will get ~10% less CPU time than another CPU-bound task | ||
8724 | * that remained on nice 0. | ||
8725 | * | ||
8726 | * The "10% effect" is relative and cumulative: from _any_ nice level, | ||
8727 | * if you go up 1 level, it's -10% CPU usage, if you go down 1 level | ||
8728 | * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. | ||
8729 | * If a task goes up by ~10% and another task goes down by ~10% then | ||
8730 | * the relative distance between them is ~25%.) | ||
8731 | */ | ||
8732 | const int sched_prio_to_weight[40] = { | ||
8733 | /* -20 */ 88761, 71755, 56483, 46273, 36291, | ||
8734 | /* -15 */ 29154, 23254, 18705, 14949, 11916, | ||
8735 | /* -10 */ 9548, 7620, 6100, 4904, 3906, | ||
8736 | /* -5 */ 3121, 2501, 1991, 1586, 1277, | ||
8737 | /* 0 */ 1024, 820, 655, 526, 423, | ||
8738 | /* 5 */ 335, 272, 215, 172, 137, | ||
8739 | /* 10 */ 110, 87, 70, 56, 45, | ||
8740 | /* 15 */ 36, 29, 23, 18, 15, | ||
8741 | }; | ||
8742 | |||
8743 | /* | ||
8744 | * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated. | ||
8745 | * | ||
8746 | * In cases where the weight does not change often, we can use the | ||
8747 | * precalculated inverse to speed up arithmetics by turning divisions | ||
8748 | * into multiplications: | ||
8749 | */ | ||
8750 | const u32 sched_prio_to_wmult[40] = { | ||
8751 | /* -20 */ 48388, 59856, 76040, 92818, 118348, | ||
8752 | /* -15 */ 147320, 184698, 229616, 287308, 360437, | ||
8753 | /* -10 */ 449829, 563644, 704093, 875809, 1099582, | ||
8754 | /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, | ||
8755 | /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, | ||
8756 | /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, | ||
8757 | /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, | ||
8758 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | ||
8759 | }; | ||
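Two properties of the tables just added are worth making explicit. Adjacent weights differ by a factor of roughly 1.25 (1024, 820, 655, ...), which is where the "~10% CPU either way" rule comes from, and each sched_prio_to_wmult[] entry is approximately 2^32 / weight, so dividing by a weight can be done as a multiply followed by a 32-bit shift. A hedged user-space check of both claims (this mirrors the arithmetic only, not the kernel's own delta-scaling code; the 6ms delta is a made-up input):

#include <stdio.h>
#include <stdint.h>

/* Excerpt of the tables above, around nice 0 (values copied from the patch). */
static const int      weight[] = { 1277, 1024, 820, 655 };      /* nice -1 .. +2 */
static const uint32_t wmult[]  = { 3363326, 4194304, 5237765, 6557202 };

int main(void)
{
        uint64_t delta = 6000000;       /* e.g. 6ms of runtime, in ns */

        for (int i = 0; i < 4; i++) {
                /* ratio between adjacent weights is ~1.25 */
                if (i)
                        printf("weight ratio %d/%d = %.3f\n",
                               weight[i - 1], weight[i],
                               (double)weight[i - 1] / weight[i]);

                /* division vs. multiply-and-shift give (almost) the same result */
                uint64_t by_div   = delta * 1024 / weight[i];
                uint64_t by_wmult = (delta * 1024 * (uint64_t)wmult[i]) >> 32;

                printf("nice %+d: div=%llu wmult=%llu\n", i - 1,
                       (unsigned long long)by_div,
                       (unsigned long long)by_wmult);
        }
        return 0;
}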
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 05de80b48586..d5ff5c6bf829 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -466,7 +466,7 @@ void account_process_tick(struct task_struct *p, int user_tick) | |||
466 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | 466 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); |
467 | struct rq *rq = this_rq(); | 467 | struct rq *rq = this_rq(); |
468 | 468 | ||
469 | if (vtime_accounting_enabled()) | 469 | if (vtime_accounting_cpu_enabled()) |
470 | return; | 470 | return; |
471 | 471 | ||
472 | if (sched_clock_irqtime) { | 472 | if (sched_clock_irqtime) { |
@@ -680,7 +680,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk) | |||
680 | { | 680 | { |
681 | unsigned long long delta = vtime_delta(tsk); | 681 | unsigned long long delta = vtime_delta(tsk); |
682 | 682 | ||
683 | WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING); | 683 | WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); |
684 | tsk->vtime_snap += delta; | 684 | tsk->vtime_snap += delta; |
685 | 685 | ||
686 | /* CHECKME: always safe to convert nsecs to cputime? */ | 686 | /* CHECKME: always safe to convert nsecs to cputime? */ |
@@ -696,37 +696,37 @@ static void __vtime_account_system(struct task_struct *tsk) | |||
696 | 696 | ||
697 | void vtime_account_system(struct task_struct *tsk) | 697 | void vtime_account_system(struct task_struct *tsk) |
698 | { | 698 | { |
699 | write_seqlock(&tsk->vtime_seqlock); | 699 | write_seqcount_begin(&tsk->vtime_seqcount); |
700 | __vtime_account_system(tsk); | 700 | __vtime_account_system(tsk); |
701 | write_sequnlock(&tsk->vtime_seqlock); | 701 | write_seqcount_end(&tsk->vtime_seqcount); |
702 | } | 702 | } |
703 | 703 | ||
704 | void vtime_gen_account_irq_exit(struct task_struct *tsk) | 704 | void vtime_gen_account_irq_exit(struct task_struct *tsk) |
705 | { | 705 | { |
706 | write_seqlock(&tsk->vtime_seqlock); | 706 | write_seqcount_begin(&tsk->vtime_seqcount); |
707 | __vtime_account_system(tsk); | 707 | __vtime_account_system(tsk); |
708 | if (context_tracking_in_user()) | 708 | if (context_tracking_in_user()) |
709 | tsk->vtime_snap_whence = VTIME_USER; | 709 | tsk->vtime_snap_whence = VTIME_USER; |
710 | write_sequnlock(&tsk->vtime_seqlock); | 710 | write_seqcount_end(&tsk->vtime_seqcount); |
711 | } | 711 | } |
712 | 712 | ||
713 | void vtime_account_user(struct task_struct *tsk) | 713 | void vtime_account_user(struct task_struct *tsk) |
714 | { | 714 | { |
715 | cputime_t delta_cpu; | 715 | cputime_t delta_cpu; |
716 | 716 | ||
717 | write_seqlock(&tsk->vtime_seqlock); | 717 | write_seqcount_begin(&tsk->vtime_seqcount); |
718 | delta_cpu = get_vtime_delta(tsk); | 718 | delta_cpu = get_vtime_delta(tsk); |
719 | tsk->vtime_snap_whence = VTIME_SYS; | 719 | tsk->vtime_snap_whence = VTIME_SYS; |
720 | account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); | 720 | account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); |
721 | write_sequnlock(&tsk->vtime_seqlock); | 721 | write_seqcount_end(&tsk->vtime_seqcount); |
722 | } | 722 | } |
723 | 723 | ||
724 | void vtime_user_enter(struct task_struct *tsk) | 724 | void vtime_user_enter(struct task_struct *tsk) |
725 | { | 725 | { |
726 | write_seqlock(&tsk->vtime_seqlock); | 726 | write_seqcount_begin(&tsk->vtime_seqcount); |
727 | __vtime_account_system(tsk); | 727 | __vtime_account_system(tsk); |
728 | tsk->vtime_snap_whence = VTIME_USER; | 728 | tsk->vtime_snap_whence = VTIME_USER; |
729 | write_sequnlock(&tsk->vtime_seqlock); | 729 | write_seqcount_end(&tsk->vtime_seqcount); |
730 | } | 730 | } |
731 | 731 | ||
732 | void vtime_guest_enter(struct task_struct *tsk) | 732 | void vtime_guest_enter(struct task_struct *tsk) |
@@ -738,19 +738,19 @@ void vtime_guest_enter(struct task_struct *tsk) | |||
738 | * synchronization against the reader (task_gtime()) | 738 | * synchronization against the reader (task_gtime()) |
739 | * that can thus safely catch up with a tickless delta. | 739 | * that can thus safely catch up with a tickless delta. |
740 | */ | 740 | */ |
741 | write_seqlock(&tsk->vtime_seqlock); | 741 | write_seqcount_begin(&tsk->vtime_seqcount); |
742 | __vtime_account_system(tsk); | 742 | __vtime_account_system(tsk); |
743 | current->flags |= PF_VCPU; | 743 | current->flags |= PF_VCPU; |
744 | write_sequnlock(&tsk->vtime_seqlock); | 744 | write_seqcount_end(&tsk->vtime_seqcount); |
745 | } | 745 | } |
746 | EXPORT_SYMBOL_GPL(vtime_guest_enter); | 746 | EXPORT_SYMBOL_GPL(vtime_guest_enter); |
747 | 747 | ||
748 | void vtime_guest_exit(struct task_struct *tsk) | 748 | void vtime_guest_exit(struct task_struct *tsk) |
749 | { | 749 | { |
750 | write_seqlock(&tsk->vtime_seqlock); | 750 | write_seqcount_begin(&tsk->vtime_seqcount); |
751 | __vtime_account_system(tsk); | 751 | __vtime_account_system(tsk); |
752 | current->flags &= ~PF_VCPU; | 752 | current->flags &= ~PF_VCPU; |
753 | write_sequnlock(&tsk->vtime_seqlock); | 753 | write_seqcount_end(&tsk->vtime_seqcount); |
754 | } | 754 | } |
755 | EXPORT_SYMBOL_GPL(vtime_guest_exit); | 755 | EXPORT_SYMBOL_GPL(vtime_guest_exit); |
756 | 756 | ||
@@ -763,24 +763,26 @@ void vtime_account_idle(struct task_struct *tsk) | |||
763 | 763 | ||
764 | void arch_vtime_task_switch(struct task_struct *prev) | 764 | void arch_vtime_task_switch(struct task_struct *prev) |
765 | { | 765 | { |
766 | write_seqlock(&prev->vtime_seqlock); | 766 | write_seqcount_begin(&prev->vtime_seqcount); |
767 | prev->vtime_snap_whence = VTIME_SLEEPING; | 767 | prev->vtime_snap_whence = VTIME_INACTIVE; |
768 | write_sequnlock(&prev->vtime_seqlock); | 768 | write_seqcount_end(&prev->vtime_seqcount); |
769 | 769 | ||
770 | write_seqlock(¤t->vtime_seqlock); | 770 | write_seqcount_begin(¤t->vtime_seqcount); |
771 | current->vtime_snap_whence = VTIME_SYS; | 771 | current->vtime_snap_whence = VTIME_SYS; |
772 | current->vtime_snap = sched_clock_cpu(smp_processor_id()); | 772 | current->vtime_snap = sched_clock_cpu(smp_processor_id()); |
773 | write_sequnlock(¤t->vtime_seqlock); | 773 | write_seqcount_end(¤t->vtime_seqcount); |
774 | } | 774 | } |
775 | 775 | ||
776 | void vtime_init_idle(struct task_struct *t, int cpu) | 776 | void vtime_init_idle(struct task_struct *t, int cpu) |
777 | { | 777 | { |
778 | unsigned long flags; | 778 | unsigned long flags; |
779 | 779 | ||
780 | write_seqlock_irqsave(&t->vtime_seqlock, flags); | 780 | local_irq_save(flags); |
781 | write_seqcount_begin(&t->vtime_seqcount); | ||
781 | t->vtime_snap_whence = VTIME_SYS; | 782 | t->vtime_snap_whence = VTIME_SYS; |
782 | t->vtime_snap = sched_clock_cpu(cpu); | 783 | t->vtime_snap = sched_clock_cpu(cpu); |
783 | write_sequnlock_irqrestore(&t->vtime_seqlock, flags); | 784 | write_seqcount_end(&t->vtime_seqcount); |
785 | local_irq_restore(flags); | ||
784 | } | 786 | } |
785 | 787 | ||
786 | cputime_t task_gtime(struct task_struct *t) | 788 | cputime_t task_gtime(struct task_struct *t) |
@@ -788,17 +790,17 @@ cputime_t task_gtime(struct task_struct *t) | |||
788 | unsigned int seq; | 790 | unsigned int seq; |
789 | cputime_t gtime; | 791 | cputime_t gtime; |
790 | 792 | ||
791 | if (!context_tracking_is_enabled()) | 793 | if (!vtime_accounting_enabled()) |
792 | return t->gtime; | 794 | return t->gtime; |
793 | 795 | ||
794 | do { | 796 | do { |
795 | seq = read_seqbegin(&t->vtime_seqlock); | 797 | seq = read_seqcount_begin(&t->vtime_seqcount); |
796 | 798 | ||
797 | gtime = t->gtime; | 799 | gtime = t->gtime; |
798 | if (t->flags & PF_VCPU) | 800 | if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU) |
799 | gtime += vtime_delta(t); | 801 | gtime += vtime_delta(t); |
800 | 802 | ||
801 | } while (read_seqretry(&t->vtime_seqlock, seq)); | 803 | } while (read_seqcount_retry(&t->vtime_seqcount, seq)); |
802 | 804 | ||
803 | return gtime; | 805 | return gtime; |
804 | } | 806 | } |
@@ -821,7 +823,7 @@ fetch_task_cputime(struct task_struct *t, | |||
821 | *udelta = 0; | 823 | *udelta = 0; |
822 | *sdelta = 0; | 824 | *sdelta = 0; |
823 | 825 | ||
824 | seq = read_seqbegin(&t->vtime_seqlock); | 826 | seq = read_seqcount_begin(&t->vtime_seqcount); |
825 | 827 | ||
826 | if (u_dst) | 828 | if (u_dst) |
827 | *u_dst = *u_src; | 829 | *u_dst = *u_src; |
@@ -829,7 +831,7 @@ fetch_task_cputime(struct task_struct *t, | |||
829 | *s_dst = *s_src; | 831 | *s_dst = *s_src; |
830 | 832 | ||
831 | /* Task is sleeping, nothing to add */ | 833 | /* Task is sleeping, nothing to add */ |
832 | if (t->vtime_snap_whence == VTIME_SLEEPING || | 834 | if (t->vtime_snap_whence == VTIME_INACTIVE || |
833 | is_idle_task(t)) | 835 | is_idle_task(t)) |
834 | continue; | 836 | continue; |
835 | 837 | ||
@@ -845,7 +847,7 @@ fetch_task_cputime(struct task_struct *t, | |||
845 | if (t->vtime_snap_whence == VTIME_SYS) | 847 | if (t->vtime_snap_whence == VTIME_SYS) |
846 | *sdelta = delta; | 848 | *sdelta = delta; |
847 | } | 849 | } |
848 | } while (read_seqretry(&t->vtime_seqlock, seq)); | 850 | } while (read_seqcount_retry(&t->vtime_seqcount, seq)); |
849 | } | 851 | } |
850 | 852 | ||
851 | 853 | ||
@@ -853,6 +855,14 @@ void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) | |||
853 | { | 855 | { |
854 | cputime_t udelta, sdelta; | 856 | cputime_t udelta, sdelta; |
855 | 857 | ||
858 | if (!vtime_accounting_enabled()) { | ||
859 | if (utime) | ||
860 | *utime = t->utime; | ||
861 | if (stime) | ||
862 | *stime = t->stime; | ||
863 | return; | ||
864 | } | ||
865 | |||
856 | fetch_task_cputime(t, utime, stime, &t->utime, | 866 | fetch_task_cputime(t, utime, stime, &t->utime, |
857 | &t->stime, &udelta, &sdelta); | 867 | &t->stime, &udelta, &sdelta); |
858 | if (utime) | 868 | if (utime) |
@@ -866,6 +876,14 @@ void task_cputime_scaled(struct task_struct *t, | |||
866 | { | 876 | { |
867 | cputime_t udelta, sdelta; | 877 | cputime_t udelta, sdelta; |
868 | 878 | ||
879 | if (!vtime_accounting_enabled()) { | ||
880 | if (utimescaled) | ||
881 | *utimescaled = t->utimescaled; | ||
882 | if (stimescaled) | ||
883 | *stimescaled = t->stimescaled; | ||
884 | return; | ||
885 | } | ||
886 | |||
869 | fetch_task_cputime(t, utimescaled, stimescaled, | 887 | fetch_task_cputime(t, utimescaled, stimescaled, |
870 | &t->utimescaled, &t->stimescaled, &udelta, &sdelta); | 888 | &t->utimescaled, &t->stimescaled, &udelta, &sdelta); |
871 | if (utimescaled) | 889 | if (utimescaled) |
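The seqlock-to-seqcount conversion throughout kernel/sched/cputime.c works because there is a single writer per task: the task itself updates its own vtime with interrupts disabled, so the spinlock half of the seqlock was pure overhead and only the reader retry loop is needed. A hedged kernel-context sketch of that shape (struct demo_acct and its fields are illustrative, not the task_struct layout):

#include <linux/seqlock.h>

struct demo_acct {
        seqcount_t seq;
        u64 snap;       /* timestamp of the last accounting point */
        u64 total;      /* accumulated time */
};

/* Writer side: only the owning context updates this, with interrupts off,
 * so a bare seqcount is sufficient -- no lock is taken. */
static void demo_account(struct demo_acct *a, u64 now)
{
        write_seqcount_begin(&a->seq);
        a->total += now - a->snap;
        a->snap = now;
        write_seqcount_end(&a->seq);
}

/* Reader side: retry until a consistent snapshot is observed, in the same
 * way task_gtime() and fetch_task_cputime() do in the patch above. */
static u64 demo_read(struct demo_acct *a, u64 now)
{
        unsigned int seq;
        u64 val;

        do {
                seq = read_seqcount_begin(&a->seq);
                val = a->total + (now - a->snap);
        } while (read_seqcount_retry(&a->seq, seq));

        return val;
}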
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 8b0a15e285f9..cd64c979d0e1 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -176,8 +176,10 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) | |||
176 | } | 176 | } |
177 | } | 177 | } |
178 | 178 | ||
179 | if (leftmost) | 179 | if (leftmost) { |
180 | dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks; | 180 | dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks; |
181 | dl_rq->earliest_dl.next = p->dl.deadline; | ||
182 | } | ||
181 | 183 | ||
182 | rb_link_node(&p->pushable_dl_tasks, parent, link); | 184 | rb_link_node(&p->pushable_dl_tasks, parent, link); |
183 | rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); | 185 | rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); |
@@ -195,6 +197,10 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) | |||
195 | 197 | ||
196 | next_node = rb_next(&p->pushable_dl_tasks); | 198 | next_node = rb_next(&p->pushable_dl_tasks); |
197 | dl_rq->pushable_dl_tasks_leftmost = next_node; | 199 | dl_rq->pushable_dl_tasks_leftmost = next_node; |
200 | if (next_node) { | ||
201 | dl_rq->earliest_dl.next = rb_entry(next_node, | ||
202 | struct task_struct, pushable_dl_tasks)->dl.deadline; | ||
203 | } | ||
198 | } | 204 | } |
199 | 205 | ||
200 | rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); | 206 | rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); |
@@ -782,42 +788,14 @@ static void update_curr_dl(struct rq *rq) | |||
782 | 788 | ||
783 | #ifdef CONFIG_SMP | 789 | #ifdef CONFIG_SMP |
784 | 790 | ||
785 | static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu); | ||
786 | |||
787 | static inline u64 next_deadline(struct rq *rq) | ||
788 | { | ||
789 | struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu); | ||
790 | |||
791 | if (next && dl_prio(next->prio)) | ||
792 | return next->dl.deadline; | ||
793 | else | ||
794 | return 0; | ||
795 | } | ||
796 | |||
797 | static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) | 791 | static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) |
798 | { | 792 | { |
799 | struct rq *rq = rq_of_dl_rq(dl_rq); | 793 | struct rq *rq = rq_of_dl_rq(dl_rq); |
800 | 794 | ||
801 | if (dl_rq->earliest_dl.curr == 0 || | 795 | if (dl_rq->earliest_dl.curr == 0 || |
802 | dl_time_before(deadline, dl_rq->earliest_dl.curr)) { | 796 | dl_time_before(deadline, dl_rq->earliest_dl.curr)) { |
803 | /* | ||
804 | * If the dl_rq had no -deadline tasks, or if the new task | ||
805 | * has shorter deadline than the current one on dl_rq, we | ||
806 | * know that the previous earliest becomes our next earliest, | ||
807 | * as the new task becomes the earliest itself. | ||
808 | */ | ||
809 | dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr; | ||
810 | dl_rq->earliest_dl.curr = deadline; | 797 | dl_rq->earliest_dl.curr = deadline; |
811 | cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); | 798 | cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); |
812 | } else if (dl_rq->earliest_dl.next == 0 || | ||
813 | dl_time_before(deadline, dl_rq->earliest_dl.next)) { | ||
814 | /* | ||
815 | * On the other hand, if the new -deadline task has a | ||
816 | * a later deadline than the earliest one on dl_rq, but | ||
817 | * it is earlier than the next (if any), we must | ||
818 | * recompute the next-earliest. | ||
819 | */ | ||
820 | dl_rq->earliest_dl.next = next_deadline(rq); | ||
821 | } | 799 | } |
822 | } | 800 | } |
823 | 801 | ||
@@ -839,7 +817,6 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) | |||
839 | 817 | ||
840 | entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); | 818 | entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); |
841 | dl_rq->earliest_dl.curr = entry->deadline; | 819 | dl_rq->earliest_dl.curr = entry->deadline; |
842 | dl_rq->earliest_dl.next = next_deadline(rq); | ||
843 | cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); | 820 | cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); |
844 | } | 821 | } |
845 | } | 822 | } |
@@ -1274,28 +1251,6 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) | |||
1274 | return 0; | 1251 | return 0; |
1275 | } | 1252 | } |
1276 | 1253 | ||
1277 | /* Returns the second earliest -deadline task, NULL otherwise */ | ||
1278 | static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu) | ||
1279 | { | ||
1280 | struct rb_node *next_node = rq->dl.rb_leftmost; | ||
1281 | struct sched_dl_entity *dl_se; | ||
1282 | struct task_struct *p = NULL; | ||
1283 | |||
1284 | next_node: | ||
1285 | next_node = rb_next(next_node); | ||
1286 | if (next_node) { | ||
1287 | dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node); | ||
1288 | p = dl_task_of(dl_se); | ||
1289 | |||
1290 | if (pick_dl_task(rq, p, cpu)) | ||
1291 | return p; | ||
1292 | |||
1293 | goto next_node; | ||
1294 | } | ||
1295 | |||
1296 | return NULL; | ||
1297 | } | ||
1298 | |||
1299 | /* | 1254 | /* |
1300 | * Return the earliest pushable rq's task, which is suitable to be executed | 1255 | * Return the earliest pushable rq's task, which is suitable to be executed |
1301 | * on the CPU, NULL otherwise: | 1256 | * on the CPU, NULL otherwise: |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cfdc0e61066c..1926606ece80 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -738,12 +738,56 @@ static void update_curr_fair(struct rq *rq) | |||
738 | update_curr(cfs_rq_of(&rq->curr->se)); | 738 | update_curr(cfs_rq_of(&rq->curr->se)); |
739 | } | 739 | } |
740 | 740 | ||
741 | #ifdef CONFIG_SCHEDSTATS | ||
742 | static inline void | ||
743 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
744 | { | ||
745 | u64 wait_start = rq_clock(rq_of(cfs_rq)); | ||
746 | |||
747 | if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && | ||
748 | likely(wait_start > se->statistics.wait_start)) | ||
749 | wait_start -= se->statistics.wait_start; | ||
750 | |||
751 | se->statistics.wait_start = wait_start; | ||
752 | } | ||
753 | |||
754 | static void | ||
755 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
756 | { | ||
757 | struct task_struct *p; | ||
758 | u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; | ||
759 | |||
760 | if (entity_is_task(se)) { | ||
761 | p = task_of(se); | ||
762 | if (task_on_rq_migrating(p)) { | ||
763 | /* | ||
764 | * Preserve migrating task's wait time so wait_start | ||
765 | * time stamp can be adjusted to accumulate wait time | ||
766 | * prior to migration. | ||
767 | */ | ||
768 | se->statistics.wait_start = delta; | ||
769 | return; | ||
770 | } | ||
771 | trace_sched_stat_wait(p, delta); | ||
772 | } | ||
773 | |||
774 | se->statistics.wait_max = max(se->statistics.wait_max, delta); | ||
775 | se->statistics.wait_count++; | ||
776 | se->statistics.wait_sum += delta; | ||
777 | se->statistics.wait_start = 0; | ||
778 | } | ||
779 | #else | ||
741 | static inline void | 780 | static inline void |
742 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | 781 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) |
743 | { | 782 | { |
744 | schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq))); | ||
745 | } | 783 | } |
746 | 784 | ||
785 | static inline void | ||
786 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
787 | { | ||
788 | } | ||
789 | #endif | ||
790 | |||
747 | /* | 791 | /* |
748 | * Task is being enqueued - update stats: | 792 | * Task is being enqueued - update stats: |
749 | */ | 793 | */ |
@@ -757,23 +801,6 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
757 | update_stats_wait_start(cfs_rq, se); | 801 | update_stats_wait_start(cfs_rq, se); |
758 | } | 802 | } |
759 | 803 | ||
760 | static void | ||
761 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
762 | { | ||
763 | schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, | ||
764 | rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start)); | ||
765 | schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); | ||
766 | schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + | ||
767 | rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); | ||
768 | #ifdef CONFIG_SCHEDSTATS | ||
769 | if (entity_is_task(se)) { | ||
770 | trace_sched_stat_wait(task_of(se), | ||
771 | rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); | ||
772 | } | ||
773 | #endif | ||
774 | schedstat_set(se->statistics.wait_start, 0); | ||
775 | } | ||
776 | |||
777 | static inline void | 804 | static inline void |
778 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 805 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
779 | { | 806 | { |
@@ -2155,6 +2182,7 @@ void task_numa_work(struct callback_head *work) | |||
2155 | unsigned long migrate, next_scan, now = jiffies; | 2182 | unsigned long migrate, next_scan, now = jiffies; |
2156 | struct task_struct *p = current; | 2183 | struct task_struct *p = current; |
2157 | struct mm_struct *mm = p->mm; | 2184 | struct mm_struct *mm = p->mm; |
2185 | u64 runtime = p->se.sum_exec_runtime; | ||
2158 | struct vm_area_struct *vma; | 2186 | struct vm_area_struct *vma; |
2159 | unsigned long start, end; | 2187 | unsigned long start, end; |
2160 | unsigned long nr_pte_updates = 0; | 2188 | unsigned long nr_pte_updates = 0; |
@@ -2277,6 +2305,17 @@ out: | |||
2277 | else | 2305 | else |
2278 | reset_ptenuma_scan(p); | 2306 | reset_ptenuma_scan(p); |
2279 | up_read(&mm->mmap_sem); | 2307 | up_read(&mm->mmap_sem); |
2308 | |||
2309 | /* | ||
2310 | * Make sure tasks use at least 32x as much time to run other code | ||
2311 | * than they used here, to limit NUMA PTE scanning overhead to 3% max. | ||
2312 | * Usually update_task_scan_period slows down scanning enough; on an | ||
2313 | * overloaded system we need to limit overhead on a per task basis. | ||
2314 | */ | ||
2315 | if (unlikely(p->se.sum_exec_runtime != runtime)) { | ||
2316 | u64 diff = p->se.sum_exec_runtime - runtime; | ||
2317 | p->node_stamp += 32 * diff; | ||
2318 | } | ||
2280 | } | 2319 | } |
2281 | 2320 | ||
2282 | /* | 2321 | /* |
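The "32x" rule added at the end of task_numa_work() above is plain arithmetic: if the scan itself consumed diff nanoseconds of the task's runtime, advancing node_stamp by 32 * diff delays the next scan until the task has spent roughly 32 times that long running other code, which bounds the scanning overhead at 1/(1+32), i.e. about 3%. A toy check with made-up numbers:

#include <stdio.h>

int main(void)
{
        unsigned long long scan_ns = 250000;            /* 0.25ms spent scanning (hypothetical) */
        unsigned long long penalty = 32 * scan_ns;      /* node_stamp advance */

        /* Worst case: scan, then run exactly 'penalty' ns before the next scan. */
        double overhead = (double)scan_ns / (scan_ns + penalty);
        printf("NUMA scan overhead bound: %.2f%%\n", overhead * 100.0);  /* ~3.03% */
        return 0;
}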
@@ -2670,12 +2709,64 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) | |||
2670 | { | 2709 | { |
2671 | long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; | 2710 | long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; |
2672 | 2711 | ||
2712 | /* | ||
2713 | * No need to update load_avg for root_task_group as it is not used. | ||
2714 | */ | ||
2715 | if (cfs_rq->tg == &root_task_group) | ||
2716 | return; | ||
2717 | |||
2673 | if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { | 2718 | if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { |
2674 | atomic_long_add(delta, &cfs_rq->tg->load_avg); | 2719 | atomic_long_add(delta, &cfs_rq->tg->load_avg); |
2675 | cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; | 2720 | cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; |
2676 | } | 2721 | } |
2677 | } | 2722 | } |
2678 | 2723 | ||
2724 | /* | ||
2725 | * Called within set_task_rq() right before setting a task's cpu. The | ||
2726 | * caller only guarantees p->pi_lock is held; no other assumptions, | ||
2727 | * including the state of rq->lock, should be made. | ||
2728 | */ | ||
2729 | void set_task_rq_fair(struct sched_entity *se, | ||
2730 | struct cfs_rq *prev, struct cfs_rq *next) | ||
2731 | { | ||
2732 | if (!sched_feat(ATTACH_AGE_LOAD)) | ||
2733 | return; | ||
2734 | |||
2735 | /* | ||
2736 | * We are supposed to update the task to "current" time, then it's up to | ||
2737 | * date and ready to go to new CPU/cfs_rq. But we have difficulty in | ||
2738 | * getting what current time is, so simply throw away the out-of-date | ||
2739 | * time. This will result in the wakee task being less decayed, but giving | ||
2740 | * the wakee more load sounds not bad. | ||
2741 | */ | ||
2742 | if (se->avg.last_update_time && prev) { | ||
2743 | u64 p_last_update_time; | ||
2744 | u64 n_last_update_time; | ||
2745 | |||
2746 | #ifndef CONFIG_64BIT | ||
2747 | u64 p_last_update_time_copy; | ||
2748 | u64 n_last_update_time_copy; | ||
2749 | |||
2750 | do { | ||
2751 | p_last_update_time_copy = prev->load_last_update_time_copy; | ||
2752 | n_last_update_time_copy = next->load_last_update_time_copy; | ||
2753 | |||
2754 | smp_rmb(); | ||
2755 | |||
2756 | p_last_update_time = prev->avg.last_update_time; | ||
2757 | n_last_update_time = next->avg.last_update_time; | ||
2758 | |||
2759 | } while (p_last_update_time != p_last_update_time_copy || | ||
2760 | n_last_update_time != n_last_update_time_copy); | ||
2761 | #else | ||
2762 | p_last_update_time = prev->avg.last_update_time; | ||
2763 | n_last_update_time = next->avg.last_update_time; | ||
2764 | #endif | ||
2765 | __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)), | ||
2766 | &se->avg, 0, 0, NULL); | ||
2767 | se->avg.last_update_time = n_last_update_time; | ||
2768 | } | ||
2769 | } | ||
2679 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 2770 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
2680 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} | 2771 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} |
2681 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 2772 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
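The #ifndef CONFIG_64BIT branch of set_task_rq_fair() above uses the "shadow copy" idiom: on 32-bit machines a u64 load can tear, so the writer stores the value and then a copy (with a write barrier in between), and lockless readers spin until both reads agree. A hedged kernel-context sketch of the idiom in isolation (struct demo_clock is illustrative, not a kernel type):

#include <linux/types.h>
#include <asm/barrier.h>

struct demo_clock {
        u64 time;
        u64 time_copy;
};

/* Writer (the scheduler updates these under rq->lock): value, barrier, copy. */
static void demo_clock_update(struct demo_clock *c, u64 now)
{
        c->time = now;
        smp_wmb();                      /* order the two stores */
        c->time_copy = now;
}

/* Lockless reader: retry until the copy matches, as set_task_rq_fair() and
 * cfs_rq_last_update_time() do for avg.last_update_time on 32-bit. */
static u64 demo_clock_read(struct demo_clock *c)
{
        u64 t, copy;

        do {
                copy = c->time_copy;
                smp_rmb();              /* pairs with the smp_wmb() above */
                t = c->time;
        } while (t != copy);

        return t;
}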
@@ -2809,48 +2900,48 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
2809 | max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); | 2900 | max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); |
2810 | } | 2901 | } |
2811 | 2902 | ||
2812 | /* | ||
2813 | * Task first catches up with cfs_rq, and then subtract | ||
2814 | * itself from the cfs_rq (task must be off the queue now). | ||
2815 | */ | ||
2816 | void remove_entity_load_avg(struct sched_entity *se) | ||
2817 | { | ||
2818 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
2819 | u64 last_update_time; | ||
2820 | |||
2821 | #ifndef CONFIG_64BIT | 2903 | #ifndef CONFIG_64BIT |
2904 | static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) | ||
2905 | { | ||
2822 | u64 last_update_time_copy; | 2906 | u64 last_update_time_copy; |
2907 | u64 last_update_time; | ||
2823 | 2908 | ||
2824 | do { | 2909 | do { |
2825 | last_update_time_copy = cfs_rq->load_last_update_time_copy; | 2910 | last_update_time_copy = cfs_rq->load_last_update_time_copy; |
2826 | smp_rmb(); | 2911 | smp_rmb(); |
2827 | last_update_time = cfs_rq->avg.last_update_time; | 2912 | last_update_time = cfs_rq->avg.last_update_time; |
2828 | } while (last_update_time != last_update_time_copy); | 2913 | } while (last_update_time != last_update_time_copy); |
2829 | #else | ||
2830 | last_update_time = cfs_rq->avg.last_update_time; | ||
2831 | #endif | ||
2832 | 2914 | ||
2833 | __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); | 2915 | return last_update_time; |
2834 | atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); | ||
2835 | atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); | ||
2836 | } | 2916 | } |
2837 | 2917 | #else | |
2838 | /* | 2918 | static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) |
2839 | * Update the rq's load with the elapsed running time before entering | ||
2840 | * idle. if the last scheduled task is not a CFS task, idle_enter will | ||
2841 | * be the only way to update the runnable statistic. | ||
2842 | */ | ||
2843 | void idle_enter_fair(struct rq *this_rq) | ||
2844 | { | 2919 | { |
2920 | return cfs_rq->avg.last_update_time; | ||
2845 | } | 2921 | } |
2922 | #endif | ||
2846 | 2923 | ||
2847 | /* | 2924 | /* |
2848 | * Update the rq's load with the elapsed idle time before a task is | 2925 | * Task first catches up with cfs_rq, and then subtract |
2849 | * scheduled. if the newly scheduled task is not a CFS task, idle_exit will | 2926 | * itself from the cfs_rq (task must be off the queue now). |
2850 | * be the only way to update the runnable statistic. | ||
2851 | */ | 2927 | */ |
2852 | void idle_exit_fair(struct rq *this_rq) | 2928 | void remove_entity_load_avg(struct sched_entity *se) |
2853 | { | 2929 | { |
2930 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
2931 | u64 last_update_time; | ||
2932 | |||
2933 | /* | ||
2934 | * Newly created task or never used group entity should not be removed | ||
2935 | * from its (source) cfs_rq | ||
2936 | */ | ||
2937 | if (se->avg.last_update_time == 0) | ||
2938 | return; | ||
2939 | |||
2940 | last_update_time = cfs_rq_last_update_time(cfs_rq); | ||
2941 | |||
2942 | __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); | ||
2943 | atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); | ||
2944 | atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); | ||
2854 | } | 2945 | } |
2855 | 2946 | ||
2856 | static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) | 2947 | static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) |
@@ -4240,42 +4331,37 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
4240 | */ | 4331 | */ |
4241 | 4332 | ||
4242 | /* | 4333 | /* |
4243 | * The exact cpuload at various idx values, calculated at every tick would be | 4334 | * The exact cpuload calculated at every tick would be: |
4244 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load | 4335 | * |
4336 | * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load | ||
4245 | * | 4337 | * |
4246 | * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called | 4338 | * If a cpu misses updates for n ticks (as it was idle) and update gets |
4247 | * on nth tick when cpu may be busy, then we have: | 4339 | * called on the n+1-th tick when cpu may be busy, then we have: |
4248 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | 4340 | * |
4249 | * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load | 4341 | * load_n = (1 - 1/2^i)^n * load_0 |
4342 | * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load | ||
4250 | * | 4343 | * |
4251 | * decay_load_missed() below does efficient calculation of | 4344 | * decay_load_missed() below does efficient calculation of |
4252 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | 4345 | * |
4253 | * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load | 4346 | * load' = (1 - 1/2^i)^n * load |
4347 | * | ||
4348 | * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors. | ||
4349 | * This allows us to precompute the above in said factors, thereby allowing the | ||
4350 | * reduction of an arbitrary n in O(log_2 n) steps. (See also | ||
4351 | * fixed_power_int()) | ||
4254 | * | 4352 | * |
4255 | * The calculation is approximated on a 128 point scale. | 4353 | * The calculation is approximated on a 128 point scale. |
4256 | * degrade_zero_ticks is the number of ticks after which load at any | ||
4257 | * particular idx is approximated to be zero. | ||
4258 | * degrade_factor is a precomputed table, a row for each load idx. | ||
4259 | * Each column corresponds to degradation factor for a power of two ticks, | ||
4260 | * based on 128 point scale. | ||
4261 | * Example: | ||
4262 | * row 2, col 3 (=12) says that the degradation at load idx 2 after | ||
4263 | * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). | ||
4264 | * | ||
4265 | * With this power of 2 load factors, we can degrade the load n times | ||
4266 | * by looking at 1 bits in n and doing as many mult/shift instead of | ||
4267 | * n mult/shifts needed by the exact degradation. | ||
4268 | */ | 4354 | */ |
4269 | #define DEGRADE_SHIFT 7 | 4355 | #define DEGRADE_SHIFT 7 |
4270 | static const unsigned char | 4356 | |
4271 | degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; | 4357 | static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; |
4272 | static const unsigned char | 4358 | static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { |
4273 | degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { | 4359 | { 0, 0, 0, 0, 0, 0, 0, 0 }, |
4274 | {0, 0, 0, 0, 0, 0, 0, 0}, | 4360 | { 64, 32, 8, 0, 0, 0, 0, 0 }, |
4275 | {64, 32, 8, 0, 0, 0, 0, 0}, | 4361 | { 96, 72, 40, 12, 1, 0, 0, 0 }, |
4276 | {96, 72, 40, 12, 1, 0, 0}, | 4362 | { 112, 98, 75, 43, 15, 1, 0, 0 }, |
4277 | {112, 98, 75, 43, 15, 1, 0}, | 4363 | { 120, 112, 98, 76, 45, 16, 2, 0 } |
4278 | {120, 112, 98, 76, 45, 16, 2} }; | 4364 | }; |
4279 | 4365 | ||
4280 | /* | 4366 | /* |
4281 | * Update cpu_load for any missed ticks, due to tickless idle. The backlog | 4367 | * Update cpu_load for any missed ticks, due to tickless idle. The backlog |
@@ -4306,14 +4392,46 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | |||
4306 | return load; | 4392 | return load; |
4307 | } | 4393 | } |
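The degrade_factor rows above encode ((2^i - 1)/2^i)^(2^col) on a 128-point scale, so decaying over an arbitrary number of missed ticks costs one multiply/shift per set bit of that count. The hunk only shows the tail of decay_load_missed(), so the stand-alone sketch below is reconstructed from the comment and the table; treat it as an approximation rather than a verbatim copy:

#include <stdio.h>

#define DEGRADE_SHIFT   7       /* 128-point scale */
#define IDX_MAX         5

static const unsigned char degrade_zero_ticks[IDX_MAX] = { 0, 8, 32, 64, 128 };
static const unsigned char degrade_factor[IDX_MAX][DEGRADE_SHIFT + 1] = {
        {   0,   0,  0,  0,  0,  0, 0, 0 },
        {  64,  32,  8,  0,  0,  0, 0, 0 },
        {  96,  72, 40, 12,  1,  0, 0, 0 },
        { 112,  98, 75, 43, 15,  1, 0, 0 },
        { 120, 112, 98, 76, 45, 16, 2, 0 },
};

/* load * ((2^idx - 1) / 2^idx)^missed, decomposed into power-of-2 factors */
static unsigned long decay(unsigned long load, unsigned long missed, int idx)
{
        int j = 0;

        if (!missed)
                return load;
        if (missed >= degrade_zero_ticks[idx])
                return 0;
        if (idx == 1)                   /* factor 1/2: a plain shift suffices */
                return load >> missed;

        while (missed) {
                if (missed & 1)
                        load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
                missed >>= 1;
                j++;
        }
        return load;
}

int main(void)
{
        /* idx 2, 8 missed ticks: 1024 * 12/128 = 96, vs exact (3/4)^8 * 1024 ~ 102 */
        printf("%lu\n", decay(1024, 8, 2));
        return 0;
}

Running it prints 96: a load of 1024 decayed over 8 idle ticks at index 2, against the exact value of roughly 102.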
4308 | 4394 | ||
4309 | /* | 4395 | /** |
4396 | * __update_cpu_load - update the rq->cpu_load[] statistics | ||
4397 | * @this_rq: The rq to update statistics for | ||
4398 | * @this_load: The current load | ||
4399 | * @pending_updates: The number of missed updates | ||
4400 | * @active: !0 for NOHZ_FULL | ||
4401 | * | ||
4310 | * Update rq->cpu_load[] statistics. This function is usually called every | 4402 | * Update rq->cpu_load[] statistics. This function is usually called every |
4311 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called | 4403 | * scheduler tick (TICK_NSEC). |
4312 | * every tick. We fix it up based on jiffies. | 4404 | * |
4405 | * This function computes a decaying average: | ||
4406 | * | ||
4407 | * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load | ||
4408 | * | ||
4409 | * Because of NOHZ it might not get called on every tick, hence the need for | ||
4410 | * the @pending_updates argument. | ||
4411 | * | ||
4412 | * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1 | ||
4413 | * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load | ||
4414 | * = A * (A * load[i]_n-2 + B) + B | ||
4415 | * = A * (A * (A * load[i]_n-3 + B) + B) + B | ||
4416 | * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B | ||
4417 | * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B | ||
4418 | * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B | ||
4419 | * = (1 - 1/2^i)^n * (load[i]_0 - load) + load | ||
4420 | * | ||
4421 | * In the above we've assumed load_n := load, which is true for NOHZ_FULL as | ||
4422 | * any change in load would have resulted in the tick being turned back on. | ||
4423 | * | ||
4424 | * For regular NOHZ, this reduces to: | ||
4425 | * | ||
4426 | * load[i]_n = (1 - 1/2^i)^n * load[i]_0 | ||
4427 | * | ||
4428 | * see decay_load_missed(). For NOHZ_FULL we get to subtract and add the extra | ||
4429 | * term. See the @active parameter. | ||
4313 | */ | 4430 | */ |
4314 | static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, | 4431 | static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, |
4315 | unsigned long pending_updates) | 4432 | unsigned long pending_updates, int active) |
4316 | { | 4433 | { |
4434 | unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0; | ||
4317 | int i, scale; | 4435 | int i, scale; |
4318 | 4436 | ||
4319 | this_rq->nr_load_updates++; | 4437 | this_rq->nr_load_updates++; |
@@ -4325,8 +4443,9 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, | |||
4325 | 4443 | ||
4326 | /* scale is effectively 1 << i now, and >> i divides by scale */ | 4444 | /* scale is effectively 1 << i now, and >> i divides by scale */ |
4327 | 4445 | ||
4328 | old_load = this_rq->cpu_load[i]; | 4446 | old_load = this_rq->cpu_load[i] - tickless_load; |
4329 | old_load = decay_load_missed(old_load, pending_updates - 1, i); | 4447 | old_load = decay_load_missed(old_load, pending_updates - 1, i); |
4448 | old_load += tickless_load; | ||
4330 | new_load = this_load; | 4449 | new_load = this_load; |
4331 | /* | 4450 | /* |
4332 | * Round up the averaging division if load is increasing. This | 4451 | * Round up the averaging division if load is increasing. This |
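As a numeric check of the closed form in the comment above (the numbers are illustrative): take i = 2, load[2]_0 = 1024, a constant tickless load of 256 and n = 8 missed ticks, then

        load[2]_8 = (1 - 1/2^2)^8 * (load[2]_0 - 256) + 256
                  = (3/4)^8 * 768 + 256
                  ~= 77 + 256 ~= 333

which is what the new code computes by subtracting tickless_load before decay_load_missed() and adding it back afterwards; with the 128-point table the decayed term is 768 * 12/128 = 72, giving 328.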
@@ -4381,16 +4500,17 @@ static void update_idle_cpu_load(struct rq *this_rq) | |||
4381 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | 4500 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; |
4382 | this_rq->last_load_update_tick = curr_jiffies; | 4501 | this_rq->last_load_update_tick = curr_jiffies; |
4383 | 4502 | ||
4384 | __update_cpu_load(this_rq, load, pending_updates); | 4503 | __update_cpu_load(this_rq, load, pending_updates, 0); |
4385 | } | 4504 | } |
4386 | 4505 | ||
4387 | /* | 4506 | /* |
4388 | * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. | 4507 | * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. |
4389 | */ | 4508 | */ |
4390 | void update_cpu_load_nohz(void) | 4509 | void update_cpu_load_nohz(int active) |
4391 | { | 4510 | { |
4392 | struct rq *this_rq = this_rq(); | 4511 | struct rq *this_rq = this_rq(); |
4393 | unsigned long curr_jiffies = READ_ONCE(jiffies); | 4512 | unsigned long curr_jiffies = READ_ONCE(jiffies); |
4513 | unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0; | ||
4394 | unsigned long pending_updates; | 4514 | unsigned long pending_updates; |
4395 | 4515 | ||
4396 | if (curr_jiffies == this_rq->last_load_update_tick) | 4516 | if (curr_jiffies == this_rq->last_load_update_tick) |
@@ -4401,10 +4521,11 @@ void update_cpu_load_nohz(void) | |||
4401 | if (pending_updates) { | 4521 | if (pending_updates) { |
4402 | this_rq->last_load_update_tick = curr_jiffies; | 4522 | this_rq->last_load_update_tick = curr_jiffies; |
4403 | /* | 4523 | /* |
4404 | * We were idle, this means load 0, the current load might be | 4524 | * In the regular NOHZ case, we were idle, this means load 0. |
4405 | * !0 due to remote wakeups and the sort. | 4525 | * In the NOHZ_FULL case, we were non-idle, so the CPU's |
4526 | * weighted load should be taken into account. ||
4406 | */ | 4527 | */ |
4407 | __update_cpu_load(this_rq, 0, pending_updates); | 4528 | __update_cpu_load(this_rq, load, pending_updates, active); |
4408 | } | 4529 | } |
4409 | raw_spin_unlock(&this_rq->lock); | 4530 | raw_spin_unlock(&this_rq->lock); |
4410 | } | 4531 | } |
@@ -4420,7 +4541,7 @@ void update_cpu_load_active(struct rq *this_rq) | |||
4420 | * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). | 4541 | * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). |
4421 | */ | 4542 | */ |
4422 | this_rq->last_load_update_tick = jiffies; | 4543 | this_rq->last_load_update_tick = jiffies; |
4423 | __update_cpu_load(this_rq, load, 1); | 4544 | __update_cpu_load(this_rq, load, 1, 1); |
4424 | } | 4545 | } |
4425 | 4546 | ||
4426 | /* | 4547 | /* |
@@ -5007,8 +5128,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
5007 | /* | 5128 | /* |
5008 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and | 5129 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and |
5009 | * cfs_rq_of(p) references at time of call are still valid and identify the | 5130 | * cfs_rq_of(p) references at time of call are still valid and identify the |
5010 | * previous cpu. However, the caller only guarantees p->pi_lock is held; no | 5131 | * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held. |
5011 | * other assumptions, including the state of rq->lock, should be made. | ||
5012 | */ | 5132 | */ |
5013 | static void migrate_task_rq_fair(struct task_struct *p) | 5133 | static void migrate_task_rq_fair(struct task_struct *p) |
5014 | { | 5134 | { |
@@ -5721,8 +5841,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env) | |||
5721 | { | 5841 | { |
5722 | lockdep_assert_held(&env->src_rq->lock); | 5842 | lockdep_assert_held(&env->src_rq->lock); |
5723 | 5843 | ||
5724 | deactivate_task(env->src_rq, p, 0); | ||
5725 | p->on_rq = TASK_ON_RQ_MIGRATING; | 5844 | p->on_rq = TASK_ON_RQ_MIGRATING; |
5845 | deactivate_task(env->src_rq, p, 0); | ||
5726 | set_task_cpu(p, env->dst_cpu); | 5846 | set_task_cpu(p, env->dst_cpu); |
5727 | } | 5847 | } |
5728 | 5848 | ||
@@ -5855,8 +5975,8 @@ static void attach_task(struct rq *rq, struct task_struct *p) | |||
5855 | lockdep_assert_held(&rq->lock); | 5975 | lockdep_assert_held(&rq->lock); |
5856 | 5976 | ||
5857 | BUG_ON(task_rq(p) != rq); | 5977 | BUG_ON(task_rq(p) != rq); |
5858 | p->on_rq = TASK_ON_RQ_QUEUED; | ||
5859 | activate_task(rq, p, 0); | 5978 | activate_task(rq, p, 0); |
5979 | p->on_rq = TASK_ON_RQ_QUEUED; | ||
5860 | check_preempt_curr(rq, p, 0); | 5980 | check_preempt_curr(rq, p, 0); |
5861 | } | 5981 | } |
5862 | 5982 | ||
@@ -6302,7 +6422,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
6302 | bool *overload) | 6422 | bool *overload) |
6303 | { | 6423 | { |
6304 | unsigned long load; | 6424 | unsigned long load; |
6305 | int i; | 6425 | int i, nr_running; |
6306 | 6426 | ||
6307 | memset(sgs, 0, sizeof(*sgs)); | 6427 | memset(sgs, 0, sizeof(*sgs)); |
6308 | 6428 | ||
@@ -6319,7 +6439,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
6319 | sgs->group_util += cpu_util(i); | 6439 | sgs->group_util += cpu_util(i); |
6320 | sgs->sum_nr_running += rq->cfs.h_nr_running; | 6440 | sgs->sum_nr_running += rq->cfs.h_nr_running; |
6321 | 6441 | ||
6322 | if (rq->nr_running > 1) | 6442 | nr_running = rq->nr_running; |
6443 | if (nr_running > 1) | ||
6323 | *overload = true; | 6444 | *overload = true; |
6324 | 6445 | ||
6325 | #ifdef CONFIG_NUMA_BALANCING | 6446 | #ifdef CONFIG_NUMA_BALANCING |
@@ -6327,7 +6448,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
6327 | sgs->nr_preferred_running += rq->nr_preferred_running; | 6448 | sgs->nr_preferred_running += rq->nr_preferred_running; |
6328 | #endif | 6449 | #endif |
6329 | sgs->sum_weighted_load += weighted_cpuload(i); | 6450 | sgs->sum_weighted_load += weighted_cpuload(i); |
6330 | if (idle_cpu(i)) | 6451 | /* |
6452 | * No need to call idle_cpu() if nr_running is not 0 | ||
6453 | */ | ||
6454 | if (!nr_running && idle_cpu(i)) | ||
6331 | sgs->idle_cpus++; | 6455 | sgs->idle_cpus++; |
6332 | } | 6456 | } |
6333 | 6457 | ||
@@ -7248,8 +7372,6 @@ static int idle_balance(struct rq *this_rq) | |||
7248 | int pulled_task = 0; | 7372 | int pulled_task = 0; |
7249 | u64 curr_cost = 0; | 7373 | u64 curr_cost = 0; |
7250 | 7374 | ||
7251 | idle_enter_fair(this_rq); | ||
7252 | |||
7253 | /* | 7375 | /* |
7254 | * We must set idle_stamp _before_ calling idle_balance(), such that we | 7376 | * We must set idle_stamp _before_ calling idle_balance(), such that we |
7255 | * measure the duration of idle_balance() as idle time. | 7377 | * measure the duration of idle_balance() as idle time. |
@@ -7330,10 +7452,8 @@ out: | |||
7330 | if (this_rq->nr_running != this_rq->cfs.h_nr_running) | 7452 | if (this_rq->nr_running != this_rq->cfs.h_nr_running) |
7331 | pulled_task = -1; | 7453 | pulled_task = -1; |
7332 | 7454 | ||
7333 | if (pulled_task) { | 7455 | if (pulled_task) |
7334 | idle_exit_fair(this_rq); | ||
7335 | this_rq->idle_stamp = 0; | 7456 | this_rq->idle_stamp = 0; |
7336 | } | ||
7337 | 7457 | ||
7338 | return pulled_task; | 7458 | return pulled_task; |
7339 | } | 7459 | } |
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index c4ae0f1fdf9b..47ce94931f1b 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
@@ -47,7 +47,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) | |||
47 | 47 | ||
48 | static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) | 48 | static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) |
49 | { | 49 | { |
50 | idle_exit_fair(rq); | ||
51 | rq_last_tick_reset(rq); | 50 | rq_last_tick_reset(rq); |
52 | } | 51 | } |
53 | 52 | ||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1e0bb4afe3fd..10f16374df7f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -248,7 +248,12 @@ struct task_group { | |||
248 | unsigned long shares; | 248 | unsigned long shares; |
249 | 249 | ||
250 | #ifdef CONFIG_SMP | 250 | #ifdef CONFIG_SMP |
251 | atomic_long_t load_avg; | 251 | /* |
252 | * load_avg can be heavily contended at clock tick time, so put | ||
253 | * it in its own cacheline separated from the fields above which | ||
254 | * will also be accessed at each tick. | ||
255 | */ | ||
256 | atomic_long_t load_avg ____cacheline_aligned; | ||
252 | #endif | 257 | #endif |
253 | #endif | 258 | #endif |
254 | 259 | ||
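The comment being added here is about false sharing: tg->load_avg is written from every CPU at tick time, so keeping it on the same cache line as fields that are merely read each tick would bounce that line across CPUs. A rough user-space illustration of the same idea, assuming a 64-byte cache line (the kernel's ____cacheline_aligned resolves to the architecture's real line size):

#include <stdalign.h>
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

/* Illustrative only: push the cross-CPU hot counter onto its own line. */
struct group_stats {
        unsigned long shares;                   /* read-mostly at tick time */
        alignas(64) atomic_long load_avg;       /* written by many CPUs */
};

int main(void)
{
        /* 64 rather than 8: load_avg no longer shares a line with 'shares' */
        printf("load_avg offset = %zu\n", offsetof(struct group_stats, load_avg));
        return 0;
}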
@@ -335,7 +340,15 @@ extern void sched_move_task(struct task_struct *tsk); | |||
335 | 340 | ||
336 | #ifdef CONFIG_FAIR_GROUP_SCHED | 341 | #ifdef CONFIG_FAIR_GROUP_SCHED |
337 | extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); | 342 | extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); |
338 | #endif | 343 | |
344 | #ifdef CONFIG_SMP | ||
345 | extern void set_task_rq_fair(struct sched_entity *se, | ||
346 | struct cfs_rq *prev, struct cfs_rq *next); | ||
347 | #else /* !CONFIG_SMP */ | ||
348 | static inline void set_task_rq_fair(struct sched_entity *se, | ||
349 | struct cfs_rq *prev, struct cfs_rq *next) { } | ||
350 | #endif /* CONFIG_SMP */ | ||
351 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
339 | 352 | ||
340 | #else /* CONFIG_CGROUP_SCHED */ | 353 | #else /* CONFIG_CGROUP_SCHED */ |
341 | 354 | ||
@@ -933,6 +946,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) | |||
933 | #endif | 946 | #endif |
934 | 947 | ||
935 | #ifdef CONFIG_FAIR_GROUP_SCHED | 948 | #ifdef CONFIG_FAIR_GROUP_SCHED |
949 | set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); | ||
936 | p->se.cfs_rq = tg->cfs_rq[cpu]; | 950 | p->se.cfs_rq = tg->cfs_rq[cpu]; |
937 | p->se.parent = tg->se[cpu]; | 951 | p->se.parent = tg->se[cpu]; |
938 | #endif | 952 | #endif |
@@ -1113,46 +1127,8 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
1113 | #define WEIGHT_IDLEPRIO 3 | 1127 | #define WEIGHT_IDLEPRIO 3 |
1114 | #define WMULT_IDLEPRIO 1431655765 | 1128 | #define WMULT_IDLEPRIO 1431655765 |
1115 | 1129 | ||
1116 | /* | 1130 | extern const int sched_prio_to_weight[40]; |
1117 | * Nice levels are multiplicative, with a gentle 10% change for every | 1131 | extern const u32 sched_prio_to_wmult[40]; |
1118 | * nice level changed. I.e. when a CPU-bound task goes from nice 0 to | ||
1119 | * nice 1, it will get ~10% less CPU time than another CPU-bound task | ||
1120 | * that remained on nice 0. | ||
1121 | * | ||
1122 | * The "10% effect" is relative and cumulative: from _any_ nice level, | ||
1123 | * if you go up 1 level, it's -10% CPU usage, if you go down 1 level | ||
1124 | * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. | ||
1125 | * If a task goes up by ~10% and another task goes down by ~10% then | ||
1126 | * the relative distance between them is ~25%.) | ||
1127 | */ | ||
1128 | static const int prio_to_weight[40] = { | ||
1129 | /* -20 */ 88761, 71755, 56483, 46273, 36291, | ||
1130 | /* -15 */ 29154, 23254, 18705, 14949, 11916, | ||
1131 | /* -10 */ 9548, 7620, 6100, 4904, 3906, | ||
1132 | /* -5 */ 3121, 2501, 1991, 1586, 1277, | ||
1133 | /* 0 */ 1024, 820, 655, 526, 423, | ||
1134 | /* 5 */ 335, 272, 215, 172, 137, | ||
1135 | /* 10 */ 110, 87, 70, 56, 45, | ||
1136 | /* 15 */ 36, 29, 23, 18, 15, | ||
1137 | }; | ||
1138 | |||
1139 | /* | ||
1140 | * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. | ||
1141 | * | ||
1142 | * In cases where the weight does not change often, we can use the | ||
1143 | * precalculated inverse to speed up arithmetics by turning divisions | ||
1144 | * into multiplications: | ||
1145 | */ | ||
1146 | static const u32 prio_to_wmult[40] = { | ||
1147 | /* -20 */ 48388, 59856, 76040, 92818, 118348, | ||
1148 | /* -15 */ 147320, 184698, 229616, 287308, 360437, | ||
1149 | /* -10 */ 449829, 563644, 704093, 875809, 1099582, | ||
1150 | /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, | ||
1151 | /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, | ||
1152 | /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, | ||
1153 | /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, | ||
1154 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | ||
1155 | }; | ||
1156 | 1132 | ||
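The nice-level tables being moved out of line here are easy to sanity-check: adjacent weights differ by roughly the 1.25 multiplier described in the removed comment, which is where the "10% effect" comes from. A small illustrative program (only three of the 40 prio_to_weight[] entries are reproduced):

#include <stdio.h>

/* nice -1, 0 and +1 from the prio_to_weight[] table above (nice 0 == 1024) */
static const int weight[] = { 1277, 1024, 820 };

int main(void)
{
        double w0 = weight[1], w1 = weight[2];  /* a nice-0 and a nice-+1 task */

        /* CFS distributes CPU time proportionally to weight */
        printf("nice 0: %.1f%%  nice +1: %.1f%%\n",
               100.0 * w0 / (w0 + w1), 100.0 * w1 / (w0 + w1));
        /* adjacent entries step by ~1.25, the source of the "10% effect" */
        printf("step ratios: %.2f %.2f\n",
               (double)weight[0] / weight[1], (double)weight[1] / weight[2]);
        return 0;
}

It prints roughly 55.5% versus 44.5% for the two tasks, and step ratios of about 1.25.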
1157 | #define ENQUEUE_WAKEUP 0x01 | 1133 | #define ENQUEUE_WAKEUP 0x01 |
1158 | #define ENQUEUE_HEAD 0x02 | 1134 | #define ENQUEUE_HEAD 0x02 |
@@ -1252,16 +1228,8 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu); | |||
1252 | 1228 | ||
1253 | extern void trigger_load_balance(struct rq *rq); | 1229 | extern void trigger_load_balance(struct rq *rq); |
1254 | 1230 | ||
1255 | extern void idle_enter_fair(struct rq *this_rq); | ||
1256 | extern void idle_exit_fair(struct rq *this_rq); | ||
1257 | |||
1258 | extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); | 1231 | extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); |
1259 | 1232 | ||
1260 | #else | ||
1261 | |||
1262 | static inline void idle_enter_fair(struct rq *rq) { } | ||
1263 | static inline void idle_exit_fair(struct rq *rq) { } | ||
1264 | |||
1265 | #endif | 1233 | #endif |
1266 | 1234 | ||
1267 | #ifdef CONFIG_CPU_IDLE | 1235 | #ifdef CONFIG_CPU_IDLE |
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index a3bbaee77c58..edb6de4f5908 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -28,7 +28,6 @@ | |||
28 | */ | 28 | */ |
29 | struct cpu_stop_done { | 29 | struct cpu_stop_done { |
30 | atomic_t nr_todo; /* nr left to execute */ | 30 | atomic_t nr_todo; /* nr left to execute */ |
31 | bool executed; /* actually executed? */ | ||
32 | int ret; /* collected return value */ | 31 | int ret; /* collected return value */ |
33 | struct completion completion; /* fired if nr_todo reaches 0 */ | 32 | struct completion completion; /* fired if nr_todo reaches 0 */ |
34 | }; | 33 | }; |
@@ -63,14 +62,10 @@ static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) | |||
63 | } | 62 | } |
64 | 63 | ||
65 | /* signal completion unless @done is NULL */ | 64 | /* signal completion unless @done is NULL */ |
66 | static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) | 65 | static void cpu_stop_signal_done(struct cpu_stop_done *done) |
67 | { | 66 | { |
68 | if (done) { | 67 | if (atomic_dec_and_test(&done->nr_todo)) |
69 | if (executed) | 68 | complete(&done->completion); |
70 | done->executed = true; | ||
71 | if (atomic_dec_and_test(&done->nr_todo)) | ||
72 | complete(&done->completion); | ||
73 | } | ||
74 | } | 69 | } |
75 | 70 | ||
76 | static void __cpu_stop_queue_work(struct cpu_stopper *stopper, | 71 | static void __cpu_stop_queue_work(struct cpu_stopper *stopper, |
@@ -81,17 +76,21 @@ static void __cpu_stop_queue_work(struct cpu_stopper *stopper, | |||
81 | } | 76 | } |
82 | 77 | ||
83 | /* queue @work to @stopper. if offline, @work is completed immediately */ | 78 | /* queue @work to @stopper. if offline, @work is completed immediately */ |
84 | static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) | 79 | static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) |
85 | { | 80 | { |
86 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); | 81 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); |
87 | unsigned long flags; | 82 | unsigned long flags; |
83 | bool enabled; | ||
88 | 84 | ||
89 | spin_lock_irqsave(&stopper->lock, flags); | 85 | spin_lock_irqsave(&stopper->lock, flags); |
90 | if (stopper->enabled) | 86 | enabled = stopper->enabled; |
87 | if (enabled) | ||
91 | __cpu_stop_queue_work(stopper, work); | 88 | __cpu_stop_queue_work(stopper, work); |
92 | else | 89 | else if (work->done) |
93 | cpu_stop_signal_done(work->done, false); | 90 | cpu_stop_signal_done(work->done); |
94 | spin_unlock_irqrestore(&stopper->lock, flags); | 91 | spin_unlock_irqrestore(&stopper->lock, flags); |
92 | |||
93 | return enabled; | ||
95 | } | 94 | } |
96 | 95 | ||
97 | /** | 96 | /** |
@@ -124,9 +123,10 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) | |||
124 | struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; | 123 | struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; |
125 | 124 | ||
126 | cpu_stop_init_done(&done, 1); | 125 | cpu_stop_init_done(&done, 1); |
127 | cpu_stop_queue_work(cpu, &work); | 126 | if (!cpu_stop_queue_work(cpu, &work)) |
127 | return -ENOENT; | ||
128 | wait_for_completion(&done.completion); | 128 | wait_for_completion(&done.completion); |
129 | return done.executed ? done.ret : -ENOENT; | 129 | return done.ret; |
130 | } | 130 | } |
131 | 131 | ||
132 | /* This controls the threads on each CPU. */ | 132 | /* This controls the threads on each CPU. */ |
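The shape of the stop_machine changes in this pull is visible in the hunk above: instead of a separate done->executed flag, the queueing step itself reports whether the work was accepted, and callers turn a refusal into -ENOENT directly. A condensed, synchronous user-space model of that contract (the real code queues the work and completes 'done' asynchronously from cpu_stopper_thread()):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct done { int ret; };

static bool stopper_enabled;    /* stands in for per-CPU cpu_stopper.enabled */

/* cf. cpu_stop_queue_work(): reports whether the work was accepted */
static bool queue_work(struct done *done, int (*fn)(void *), void *arg)
{
        if (!stopper_enabled)
                return false;
        done->ret = fn(arg);    /* kernel: runs later in the stopper thread */
        return true;
}

/* cf. stop_one_cpu(): no 'executed' bookkeeping needed any more */
static int stop_one(int (*fn)(void *), void *arg)
{
        struct done done = { 0 };

        if (!queue_work(&done, fn, arg))
                return -ENOENT;
        /* kernel: wait_for_completion(&done.completion) here */
        return done.ret;
}

static int work_fn(void *arg)
{
        (void)arg;
        return 0;
}

int main(void)
{
        printf("disabled: %d\n", stop_one(work_fn, NULL));      /* -ENOENT */
        stopper_enabled = true;
        printf("enabled:  %d\n", stop_one(work_fn, NULL));      /* 0 */
        return 0;
}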
@@ -258,7 +258,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * | |||
258 | struct cpu_stop_work work1, work2; | 258 | struct cpu_stop_work work1, work2; |
259 | struct multi_stop_data msdata; | 259 | struct multi_stop_data msdata; |
260 | 260 | ||
261 | preempt_disable(); | ||
262 | msdata = (struct multi_stop_data){ | 261 | msdata = (struct multi_stop_data){ |
263 | .fn = fn, | 262 | .fn = fn, |
264 | .data = arg, | 263 | .data = arg, |
@@ -277,16 +276,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * | |||
277 | 276 | ||
278 | if (cpu1 > cpu2) | 277 | if (cpu1 > cpu2) |
279 | swap(cpu1, cpu2); | 278 | swap(cpu1, cpu2); |
280 | if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) { | 279 | if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) |
281 | preempt_enable(); | ||
282 | return -ENOENT; | 280 | return -ENOENT; |
283 | } | ||
284 | |||
285 | preempt_enable(); | ||
286 | 281 | ||
287 | wait_for_completion(&done.completion); | 282 | wait_for_completion(&done.completion); |
288 | 283 | return done.ret; | |
289 | return done.executed ? done.ret : -ENOENT; | ||
290 | } | 284 | } |
291 | 285 | ||
292 | /** | 286 | /** |
@@ -302,23 +296,28 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * | |||
302 | * | 296 | * |
303 | * CONTEXT: | 297 | * CONTEXT: |
304 | * Don't care. | 298 | * Don't care. |
299 | * | ||
300 | * RETURNS: | ||
301 | * true if cpu_stop_work was queued successfully and @fn will be called, | ||
302 | * false otherwise. | ||
305 | */ | 303 | */ |
306 | void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, | 304 | bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, |
307 | struct cpu_stop_work *work_buf) | 305 | struct cpu_stop_work *work_buf) |
308 | { | 306 | { |
309 | *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; | 307 | *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; |
310 | cpu_stop_queue_work(cpu, work_buf); | 308 | return cpu_stop_queue_work(cpu, work_buf); |
311 | } | 309 | } |
312 | 310 | ||
313 | /* static data for stop_cpus */ | 311 | /* static data for stop_cpus */ |
314 | static DEFINE_MUTEX(stop_cpus_mutex); | 312 | static DEFINE_MUTEX(stop_cpus_mutex); |
315 | 313 | ||
316 | static void queue_stop_cpus_work(const struct cpumask *cpumask, | 314 | static bool queue_stop_cpus_work(const struct cpumask *cpumask, |
317 | cpu_stop_fn_t fn, void *arg, | 315 | cpu_stop_fn_t fn, void *arg, |
318 | struct cpu_stop_done *done) | 316 | struct cpu_stop_done *done) |
319 | { | 317 | { |
320 | struct cpu_stop_work *work; | 318 | struct cpu_stop_work *work; |
321 | unsigned int cpu; | 319 | unsigned int cpu; |
320 | bool queued = false; | ||
322 | 321 | ||
323 | /* | 322 | /* |
324 | * Disable preemption while queueing to avoid getting | 323 | * Disable preemption while queueing to avoid getting |
@@ -331,9 +330,12 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, | |||
331 | work->fn = fn; | 330 | work->fn = fn; |
332 | work->arg = arg; | 331 | work->arg = arg; |
333 | work->done = done; | 332 | work->done = done; |
334 | cpu_stop_queue_work(cpu, work); | 333 | if (cpu_stop_queue_work(cpu, work)) |
334 | queued = true; | ||
335 | } | 335 | } |
336 | lg_global_unlock(&stop_cpus_lock); | 336 | lg_global_unlock(&stop_cpus_lock); |
337 | |||
338 | return queued; | ||
337 | } | 339 | } |
338 | 340 | ||
339 | static int __stop_cpus(const struct cpumask *cpumask, | 341 | static int __stop_cpus(const struct cpumask *cpumask, |
@@ -342,9 +344,10 @@ static int __stop_cpus(const struct cpumask *cpumask, | |||
342 | struct cpu_stop_done done; | 344 | struct cpu_stop_done done; |
343 | 345 | ||
344 | cpu_stop_init_done(&done, cpumask_weight(cpumask)); | 346 | cpu_stop_init_done(&done, cpumask_weight(cpumask)); |
345 | queue_stop_cpus_work(cpumask, fn, arg, &done); | 347 | if (!queue_stop_cpus_work(cpumask, fn, arg, &done)) |
348 | return -ENOENT; | ||
346 | wait_for_completion(&done.completion); | 349 | wait_for_completion(&done.completion); |
347 | return done.executed ? done.ret : -ENOENT; | 350 | return done.ret; |
348 | } | 351 | } |
349 | 352 | ||
350 | /** | 353 | /** |
@@ -432,7 +435,6 @@ static void cpu_stopper_thread(unsigned int cpu) | |||
432 | { | 435 | { |
433 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); | 436 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); |
434 | struct cpu_stop_work *work; | 437 | struct cpu_stop_work *work; |
435 | int ret; | ||
436 | 438 | ||
437 | repeat: | 439 | repeat: |
438 | work = NULL; | 440 | work = NULL; |
@@ -448,23 +450,19 @@ repeat: | |||
448 | cpu_stop_fn_t fn = work->fn; | 450 | cpu_stop_fn_t fn = work->fn; |
449 | void *arg = work->arg; | 451 | void *arg = work->arg; |
450 | struct cpu_stop_done *done = work->done; | 452 | struct cpu_stop_done *done = work->done; |
451 | char ksym_buf[KSYM_NAME_LEN] __maybe_unused; | 453 | int ret; |
452 | |||
453 | /* cpu stop callbacks are not allowed to sleep */ | ||
454 | preempt_disable(); | ||
455 | 454 | ||
455 | /* cpu stop callbacks must not sleep, make in_atomic() == T */ | ||
456 | preempt_count_inc(); | ||
456 | ret = fn(arg); | 457 | ret = fn(arg); |
457 | if (ret) | 458 | if (done) { |
458 | done->ret = ret; | 459 | if (ret) |
459 | 460 | done->ret = ret; | |
460 | /* restore preemption and check it's still balanced */ | 461 | cpu_stop_signal_done(done); |
461 | preempt_enable(); | 462 | } |
463 | preempt_count_dec(); | ||
462 | WARN_ONCE(preempt_count(), | 464 | WARN_ONCE(preempt_count(), |
463 | "cpu_stop: %s(%p) leaked preempt count\n", | 465 | "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg); |
464 | kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL, | ||
465 | ksym_buf), arg); | ||
466 | |||
467 | cpu_stop_signal_done(done, true); | ||
468 | goto repeat; | 466 | goto repeat; |
469 | } | 467 | } |
470 | } | 468 | } |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7c7ec4515983..11ce59916c1a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -694,11 +694,11 @@ out: | |||
694 | return tick; | 694 | return tick; |
695 | } | 695 | } |
696 | 696 | ||
697 | static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) | 697 | static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int active) |
698 | { | 698 | { |
699 | /* Update jiffies first */ | 699 | /* Update jiffies first */ |
700 | tick_do_update_jiffies64(now); | 700 | tick_do_update_jiffies64(now); |
701 | update_cpu_load_nohz(); | 701 | update_cpu_load_nohz(active); |
702 | 702 | ||
703 | calc_load_exit_idle(); | 703 | calc_load_exit_idle(); |
704 | touch_softlockup_watchdog(); | 704 | touch_softlockup_watchdog(); |
@@ -725,7 +725,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) | |||
725 | if (can_stop_full_tick()) | 725 | if (can_stop_full_tick()) |
726 | tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); | 726 | tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); |
727 | else if (ts->tick_stopped) | 727 | else if (ts->tick_stopped) |
728 | tick_nohz_restart_sched_tick(ts, ktime_get()); | 728 | tick_nohz_restart_sched_tick(ts, ktime_get(), 1); |
729 | #endif | 729 | #endif |
730 | } | 730 | } |
731 | 731 | ||
@@ -875,7 +875,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) | |||
875 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | 875 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE |
876 | unsigned long ticks; | 876 | unsigned long ticks; |
877 | 877 | ||
878 | if (vtime_accounting_enabled()) | 878 | if (vtime_accounting_cpu_enabled()) |
879 | return; | 879 | return; |
880 | /* | 880 | /* |
881 | * We stopped the tick in idle. Update process times would miss the | 881 | * We stopped the tick in idle. Update process times would miss the |
@@ -916,7 +916,7 @@ void tick_nohz_idle_exit(void) | |||
916 | tick_nohz_stop_idle(ts, now); | 916 | tick_nohz_stop_idle(ts, now); |
917 | 917 | ||
918 | if (ts->tick_stopped) { | 918 | if (ts->tick_stopped) { |
919 | tick_nohz_restart_sched_tick(ts, now); | 919 | tick_nohz_restart_sched_tick(ts, now, 0); |
920 | tick_nohz_account_idle_ticks(ts); | 920 | tick_nohz_account_idle_ticks(ts); |
921 | } | 921 | } |
922 | 922 | ||