author    Linus Torvalds <torvalds@linux-foundation.org>  2016-01-11 18:13:38 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2016-01-11 18:13:38 -0500
commit    af345201ea948d0976d775958d8aa22fe5e5ba58 (patch)
tree      2badae3f02ff9415c86a2188b0b5d565dc257a6c
parent    4bd20db2c027eab7490e3c0466734738bef2dd24 (diff)
parent    0905f04eb21fc1c2e690bed5d0418a061d56c225 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
 "The main changes in this cycle were:

   - tickless load average calculation enhancements (Byungchul Park)

   - vtime handling enhancements (Frederic Weisbecker)

   - scalability improvement via properly aligning a key structure
     field (Jiri Olsa)

   - various stop_machine() fixes (Oleg Nesterov)

   - sched/numa enhancement (Rik van Riel)

   - various fixes and improvements (Andi Kleen, Dietmar Eggemann,
     Geliang Tang, Hiroshi Shimamoto, Joonwoo Park, Peter Zijlstra,
     Waiman Long, Wanpeng Li, Yuyang Du)"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits)
  sched/fair: Fix new task's load avg removed from source CPU in wake_up_new_task()
  sched/core: Move sched_entity::avg into separate cache line
  x86/fpu: Properly align size in CHECK_MEMBER_AT_END_OF() macro
  sched/deadline: Fix the earliest_dl.next logic
  sched/fair: Disable the task group load_avg update for the root_task_group
  sched/fair: Move the cache-hot 'load_avg' variable into its own cacheline
  sched/fair: Avoid redundant idle_cpu() call in update_sg_lb_stats()
  sched/core: Move the sched_to_prio[] arrays out of line
  sched/cputime: Convert vtime_seqlock to seqcount
  sched/cputime: Introduce vtime accounting check for readers
  sched/cputime: Rename vtime_accounting_enabled() to vtime_accounting_cpu_enabled()
  sched/cputime: Correctly handle task guest time on housekeepers
  sched/cputime: Clarify vtime symbols and document them
  sched/cputime: Remove extra cost in task_cputime()
  sched/fair: Make it possible to account fair load avg consistently
  sched/fair: Modify the comment about lock assumptions in migrate_task_rq_fair()
  stop_machine: Clean up the usage of the preemption counter in cpu_stopper_thread()
  stop_machine: Shift the 'done != NULL' check from cpu_stop_signal_done() to callers
  stop_machine: Kill cpu_stop_done->executed
  stop_machine: Change __stop_cpus() to rely on cpu_stop_queue_work()
  ...
-rw-r--r--  arch/x86/kernel/fpu/init.c        |  13
-rw-r--r--  include/linux/context_tracking.h  |   4
-rw-r--r--  include/linux/init_task.h         |   2
-rw-r--r--  include/linux/sched.h             |  20
-rw-r--r--  include/linux/stop_machine.h      |   7
-rw-r--r--  include/linux/vtime.h             |  25
-rw-r--r--  include/linux/wait.h              |  30
-rw-r--r--  kernel/fork.c                     |   4
-rw-r--r--  kernel/sched/auto_group.c         |   2
-rw-r--r--  kernel/sched/core.c               |  76
-rw-r--r--  kernel/sched/cputime.c            |  74
-rw-r--r--  kernel/sched/deadline.c           |  59
-rw-r--r--  kernel/sched/fair.c               | 312
-rw-r--r--  kernel/sched/idle_task.c          |   1
-rw-r--r--  kernel/sched/sched.h              |  68
-rw-r--r--  kernel/stop_machine.c             |  84
-rw-r--r--  kernel/time/tick-sched.c          |  10
17 files changed, 485 insertions, 306 deletions
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index be39b5fde4b9..8e839e7f5e2f 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -143,9 +143,18 @@ static void __init fpu__init_system_generic(void)
143unsigned int xstate_size; 143unsigned int xstate_size;
144EXPORT_SYMBOL_GPL(xstate_size); 144EXPORT_SYMBOL_GPL(xstate_size);
145 145
146/* Enforce that 'MEMBER' is the last field of 'TYPE': */ 146/* Get alignment of the TYPE. */
147#define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test)
148
149/*
150 * Enforce that 'MEMBER' is the last field of 'TYPE'.
151 *
152 * Align the computed size with alignment of the TYPE,
153 * because that's how C aligns structs.
154 */
147#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \ 155#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \
148 BUILD_BUG_ON(sizeof(TYPE) != offsetofend(TYPE, MEMBER)) 156 BUILD_BUG_ON(sizeof(TYPE) != ALIGN(offsetofend(TYPE, MEMBER), \
157 TYPE_ALIGN(TYPE)))
149 158
150/* 159/*
151 * We append the 'struct fpu' to the task_struct: 160 * We append the 'struct fpu' to the task_struct:
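
The x86/fpu hunk above adds TYPE_ALIGN() and rounds offsetofend() up to the type's alignment, because sizeof() counts tail padding while offsetofend() of the last member does not. A minimal standalone sketch of that effect (demo code, not part of the patch; 'struct demo', ALIGN_UP() and the LP64 layout are illustrative assumptions):

/* Standalone illustration: why the alignment-aware check is needed. */
#include <stddef.h>
#include <stdio.h>

#define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test)
#define ALIGN_UP(x, a)   (((x) + (a) - 1) & ~((size_t)(a) - 1))
#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

struct demo {
	long a;		/* 8 bytes on LP64			*/
	int  b;		/* last member, ends at offset 12	*/
};			/* sizeof(struct demo) == 16: tail padding */

int main(void)
{
	size_t end   = offsetofend(struct demo, b);	/* 12 */
	size_t align = TYPE_ALIGN(struct demo);		/* 8  */

	/* Old check: sizeof != offsetofend -> 16 != 12, false positive.
	 * New check: rounding the end of the last member up to the type's
	 * alignment re-adds the tail padding that sizeof() counts, so
	 * 'b' is correctly accepted as the last member. */
	printf("sizeof=%zu end=%zu aligned_end=%zu\n",
	       sizeof(struct demo), end, ALIGN_UP(end, align));
	return 0;
}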
diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 68b575afe5f5..d259274238db 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -86,7 +86,7 @@ static inline void context_tracking_init(void) { }
86#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 86#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
87static inline void guest_enter(void) 87static inline void guest_enter(void)
88{ 88{
89 if (vtime_accounting_enabled()) 89 if (vtime_accounting_cpu_enabled())
90 vtime_guest_enter(current); 90 vtime_guest_enter(current);
91 else 91 else
92 current->flags |= PF_VCPU; 92 current->flags |= PF_VCPU;
@@ -100,7 +100,7 @@ static inline void guest_exit(void)
100 if (context_tracking_is_enabled()) 100 if (context_tracking_is_enabled())
101 __context_tracking_exit(CONTEXT_GUEST); 101 __context_tracking_exit(CONTEXT_GUEST);
102 102
103 if (vtime_accounting_enabled()) 103 if (vtime_accounting_cpu_enabled())
104 vtime_guest_exit(current); 104 vtime_guest_exit(current);
105 else 105 else
106 current->flags &= ~PF_VCPU; 106 current->flags &= ~PF_VCPU;
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1c1ff7e4faa4..f2cb8d45513d 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -150,7 +150,7 @@ extern struct task_group root_task_group;
150 150
151#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 151#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
152# define INIT_VTIME(tsk) \ 152# define INIT_VTIME(tsk) \
153 .vtime_seqlock = __SEQLOCK_UNLOCKED(tsk.vtime_seqlock), \ 153 .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount), \
154 .vtime_snap = 0, \ 154 .vtime_snap = 0, \
155 .vtime_snap_whence = VTIME_SYS, 155 .vtime_snap_whence = VTIME_SYS,
156#else 156#else
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fa39434e3fdd..0c0e78102850 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -177,9 +177,9 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
177extern void calc_global_load(unsigned long ticks); 177extern void calc_global_load(unsigned long ticks);
178 178
179#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) 179#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
180extern void update_cpu_load_nohz(void); 180extern void update_cpu_load_nohz(int active);
181#else 181#else
182static inline void update_cpu_load_nohz(void) { } 182static inline void update_cpu_load_nohz(int active) { }
183#endif 183#endif
184 184
185extern unsigned long get_parent_ip(unsigned long addr); 185extern unsigned long get_parent_ip(unsigned long addr);
@@ -1268,8 +1268,13 @@ struct sched_entity {
1268#endif 1268#endif
1269 1269
1270#ifdef CONFIG_SMP 1270#ifdef CONFIG_SMP
1271 /* Per entity load average tracking */ 1271 /*
1272 struct sched_avg avg; 1272 * Per entity load average tracking.
1273 *
1274 * Put into separate cache line so it does not
1275 * collide with read-mostly values above.
1276 */
1277 struct sched_avg avg ____cacheline_aligned_in_smp;
1273#endif 1278#endif
1274}; 1279};
1275 1280
@@ -1520,11 +1525,14 @@ struct task_struct {
1520 cputime_t gtime; 1525 cputime_t gtime;
1521 struct prev_cputime prev_cputime; 1526 struct prev_cputime prev_cputime;
1522#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 1527#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
1523 seqlock_t vtime_seqlock; 1528 seqcount_t vtime_seqcount;
1524 unsigned long long vtime_snap; 1529 unsigned long long vtime_snap;
1525 enum { 1530 enum {
1526 VTIME_SLEEPING = 0, 1531 /* Task is sleeping or running in a CPU with VTIME inactive */
1532 VTIME_INACTIVE = 0,
1533 /* Task runs in userspace in a CPU with VTIME active */
1527 VTIME_USER, 1534 VTIME_USER,
1535 /* Task runs in kernelspace in a CPU with VTIME active */
1528 VTIME_SYS, 1536 VTIME_SYS,
1529 } vtime_snap_whence; 1537 } vtime_snap_whence;
1530#endif 1538#endif
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 0e1b1540597a..3cc9632dcc2a 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -29,7 +29,7 @@ struct cpu_stop_work {
29 29
30int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg); 30int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg);
31int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg); 31int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg);
32void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, 32bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
33 struct cpu_stop_work *work_buf); 33 struct cpu_stop_work *work_buf);
34int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); 34int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
35int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); 35int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
@@ -65,7 +65,7 @@ static void stop_one_cpu_nowait_workfn(struct work_struct *work)
65 preempt_enable(); 65 preempt_enable();
66} 66}
67 67
68static inline void stop_one_cpu_nowait(unsigned int cpu, 68static inline bool stop_one_cpu_nowait(unsigned int cpu,
69 cpu_stop_fn_t fn, void *arg, 69 cpu_stop_fn_t fn, void *arg,
70 struct cpu_stop_work *work_buf) 70 struct cpu_stop_work *work_buf)
71{ 71{
@@ -74,7 +74,10 @@ static inline void stop_one_cpu_nowait(unsigned int cpu,
74 work_buf->fn = fn; 74 work_buf->fn = fn;
75 work_buf->arg = arg; 75 work_buf->arg = arg;
76 schedule_work(&work_buf->work); 76 schedule_work(&work_buf->work);
77 return true;
77 } 78 }
79
80 return false;
78} 81}
79 82
80static inline int stop_cpus(const struct cpumask *cpumask, 83static inline int stop_cpus(const struct cpumask *cpumask,
diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index c5165fd256f9..fa2196990f84 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -10,16 +10,27 @@
10struct task_struct; 10struct task_struct;
11 11
12/* 12/*
13 * vtime_accounting_enabled() definitions/declarations 13 * vtime_accounting_cpu_enabled() definitions/declarations
14 */ 14 */
15#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 15#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
16static inline bool vtime_accounting_enabled(void) { return true; } 16static inline bool vtime_accounting_cpu_enabled(void) { return true; }
17#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ 17#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
18 18
19#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 19#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
20/*
21 * Checks if vtime is enabled on some CPU. Cputime readers want to be careful
22 * in that case and compute the tickless cputime.
23 * For now vtime state is tied to context tracking. We might want to decouple
24 * those later if necessary.
25 */
20static inline bool vtime_accounting_enabled(void) 26static inline bool vtime_accounting_enabled(void)
21{ 27{
22 if (context_tracking_is_enabled()) { 28 return context_tracking_is_enabled();
29}
30
31static inline bool vtime_accounting_cpu_enabled(void)
32{
33 if (vtime_accounting_enabled()) {
23 if (context_tracking_cpu_is_enabled()) 34 if (context_tracking_cpu_is_enabled())
24 return true; 35 return true;
25 } 36 }
@@ -29,7 +40,7 @@ static inline bool vtime_accounting_enabled(void)
29#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ 40#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
30 41
31#ifndef CONFIG_VIRT_CPU_ACCOUNTING 42#ifndef CONFIG_VIRT_CPU_ACCOUNTING
32static inline bool vtime_accounting_enabled(void) { return false; } 43static inline bool vtime_accounting_cpu_enabled(void) { return false; }
33#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ 44#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
34 45
35 46
@@ -44,7 +55,7 @@ extern void vtime_task_switch(struct task_struct *prev);
44extern void vtime_common_task_switch(struct task_struct *prev); 55extern void vtime_common_task_switch(struct task_struct *prev);
45static inline void vtime_task_switch(struct task_struct *prev) 56static inline void vtime_task_switch(struct task_struct *prev)
46{ 57{
47 if (vtime_accounting_enabled()) 58 if (vtime_accounting_cpu_enabled())
48 vtime_common_task_switch(prev); 59 vtime_common_task_switch(prev);
49} 60}
50#endif /* __ARCH_HAS_VTIME_TASK_SWITCH */ 61#endif /* __ARCH_HAS_VTIME_TASK_SWITCH */
@@ -59,7 +70,7 @@ extern void vtime_account_irq_enter(struct task_struct *tsk);
59extern void vtime_common_account_irq_enter(struct task_struct *tsk); 70extern void vtime_common_account_irq_enter(struct task_struct *tsk);
60static inline void vtime_account_irq_enter(struct task_struct *tsk) 71static inline void vtime_account_irq_enter(struct task_struct *tsk)
61{ 72{
62 if (vtime_accounting_enabled()) 73 if (vtime_accounting_cpu_enabled())
63 vtime_common_account_irq_enter(tsk); 74 vtime_common_account_irq_enter(tsk);
64} 75}
65#endif /* __ARCH_HAS_VTIME_ACCOUNT */ 76#endif /* __ARCH_HAS_VTIME_ACCOUNT */
@@ -78,7 +89,7 @@ extern void vtime_gen_account_irq_exit(struct task_struct *tsk);
78 89
79static inline void vtime_account_irq_exit(struct task_struct *tsk) 90static inline void vtime_account_irq_exit(struct task_struct *tsk)
80{ 91{
81 if (vtime_accounting_enabled()) 92 if (vtime_accounting_cpu_enabled())
82 vtime_gen_account_irq_exit(tsk); 93 vtime_gen_account_irq_exit(tsk);
83} 94}
84 95
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 513b36f04dfd..d2f4ec7dba7c 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -102,6 +102,36 @@ init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func)
102 q->func = func; 102 q->func = func;
103} 103}
104 104
105/**
106 * waitqueue_active -- locklessly test for waiters on the queue
107 * @q: the waitqueue to test for waiters
108 *
109 * returns true if the wait list is not empty
110 *
111 * NOTE: this function is lockless and requires care, incorrect usage _will_
112 * lead to sporadic and non-obvious failure.
113 *
114 * Use either while holding wait_queue_head_t::lock or when used for wakeups
115 * with an extra smp_mb() like:
116 *
117 * CPU0 - waker CPU1 - waiter
118 *
119 * for (;;) {
120 * @cond = true; prepare_to_wait(&wq, &wait, state);
121 * smp_mb(); // smp_mb() from set_current_state()
122 * if (waitqueue_active(wq)) if (@cond)
123 * wake_up(wq); break;
124 * schedule();
125 * }
126 * finish_wait(&wq, &wait);
127 *
128 * Because without the explicit smp_mb() it's possible for the
129 * waitqueue_active() load to get hoisted over the @cond store such that we'll
130 * observe an empty wait list while the waiter might not observe @cond.
131 *
132 * Also note that this 'optimization' trades a spin_lock() for an smp_mb(),
133 * which (when the lock is uncontended) are of roughly equal cost.
134 */
105static inline int waitqueue_active(wait_queue_head_t *q) 135static inline int waitqueue_active(wait_queue_head_t *q)
106{ 136{
107 return !list_empty(&q->task_list); 137 return !list_empty(&q->task_list);
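
The new waitqueue_active() kerneldoc above documents the required smp_mb() pairing with the barrier implied by set_current_state() inside prepare_to_wait(). A minimal sketch of that waker/waiter pattern in kernel style (my_wq, my_cond and the two function names are invented for illustration; only the standard <linux/wait.h> and <linux/sched.h> APIs are assumed):

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(my_wq);
static bool my_cond;

/* waker */
static void my_signal(void)
{
	my_cond = true;
	/*
	 * Pairs with the barrier in set_current_state() done by
	 * prepare_to_wait(): without it the waitqueue_active() load could
	 * be hoisted over the my_cond store and the wakeup be missed.
	 */
	smp_mb();
	if (waitqueue_active(&my_wq))
		wake_up(&my_wq);
}

/* waiter */
static void my_wait(void)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&my_wq, &wait, TASK_UNINTERRUPTIBLE);
		if (my_cond)
			break;
		schedule();
	}
	finish_wait(&my_wq, &wait);
}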
diff --git a/kernel/fork.c b/kernel/fork.c
index 1155eac61687..291b08cc817b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1349,9 +1349,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1349 prev_cputime_init(&p->prev_cputime); 1349 prev_cputime_init(&p->prev_cputime);
1350 1350
1351#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 1351#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
1352 seqlock_init(&p->vtime_seqlock); 1352 seqcount_init(&p->vtime_seqcount);
1353 p->vtime_snap = 0; 1353 p->vtime_snap = 0;
1354 p->vtime_snap_whence = VTIME_SLEEPING; 1354 p->vtime_snap_whence = VTIME_INACTIVE;
1355#endif 1355#endif
1356 1356
1357#if defined(SPLIT_RSS_COUNTING) 1357#if defined(SPLIT_RSS_COUNTING)
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 750ed601ddf7..a5d966cb8891 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -212,7 +212,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
212 ag = autogroup_task_get(p); 212 ag = autogroup_task_get(p);
213 213
214 down_write(&ag->lock); 214 down_write(&ag->lock);
215 err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]); 215 err = sched_group_set_shares(ag->tg, sched_prio_to_weight[nice + 20]);
216 if (!err) 216 if (!err)
217 ag->nice = nice; 217 ag->nice = nice;
218 up_write(&ag->lock); 218 up_write(&ag->lock);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 34cb9f7fc2d2..77d97a6fc715 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -731,7 +731,7 @@ bool sched_can_stop_tick(void)
731 if (current->policy == SCHED_RR) { 731 if (current->policy == SCHED_RR) {
732 struct sched_rt_entity *rt_se = &current->rt; 732 struct sched_rt_entity *rt_se = &current->rt;
733 733
734 return rt_se->run_list.prev == rt_se->run_list.next; 734 return list_is_singular(&rt_se->run_list);
735 } 735 }
736 736
737 /* 737 /*
@@ -823,8 +823,8 @@ static void set_load_weight(struct task_struct *p)
823 return; 823 return;
824 } 824 }
825 825
826 load->weight = scale_load(prio_to_weight[prio]); 826 load->weight = scale_load(sched_prio_to_weight[prio]);
827 load->inv_weight = prio_to_wmult[prio]; 827 load->inv_weight = sched_prio_to_wmult[prio];
828} 828}
829 829
830static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 830static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1071,8 +1071,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
1071{ 1071{
1072 lockdep_assert_held(&rq->lock); 1072 lockdep_assert_held(&rq->lock);
1073 1073
1074 dequeue_task(rq, p, 0);
1075 p->on_rq = TASK_ON_RQ_MIGRATING; 1074 p->on_rq = TASK_ON_RQ_MIGRATING;
1075 dequeue_task(rq, p, 0);
1076 set_task_cpu(p, new_cpu); 1076 set_task_cpu(p, new_cpu);
1077 raw_spin_unlock(&rq->lock); 1077 raw_spin_unlock(&rq->lock);
1078 1078
@@ -1080,8 +1080,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
1080 1080
1081 raw_spin_lock(&rq->lock); 1081 raw_spin_lock(&rq->lock);
1082 BUG_ON(task_cpu(p) != new_cpu); 1082 BUG_ON(task_cpu(p) != new_cpu);
1083 p->on_rq = TASK_ON_RQ_QUEUED;
1084 enqueue_task(rq, p, 0); 1083 enqueue_task(rq, p, 0);
1084 p->on_rq = TASK_ON_RQ_QUEUED;
1085 check_preempt_curr(rq, p, 0); 1085 check_preempt_curr(rq, p, 0);
1086 1086
1087 return rq; 1087 return rq;
@@ -1274,6 +1274,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1274 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 1274 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
1275 !p->on_rq); 1275 !p->on_rq);
1276 1276
1277 /*
1278 * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
1279 * because schedstat_wait_{start,end} rebase migrating task's wait_start
1280 * time relying on p->on_rq.
1281 */
1282 WARN_ON_ONCE(p->state == TASK_RUNNING &&
1283 p->sched_class == &fair_sched_class &&
1284 (p->on_rq && !task_on_rq_migrating(p)));
1285
1277#ifdef CONFIG_LOCKDEP 1286#ifdef CONFIG_LOCKDEP
1278 /* 1287 /*
1279 * The caller should hold either p->pi_lock or rq->lock, when changing 1288 * The caller should hold either p->pi_lock or rq->lock, when changing
@@ -1310,9 +1319,11 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
1310 src_rq = task_rq(p); 1319 src_rq = task_rq(p);
1311 dst_rq = cpu_rq(cpu); 1320 dst_rq = cpu_rq(cpu);
1312 1321
1322 p->on_rq = TASK_ON_RQ_MIGRATING;
1313 deactivate_task(src_rq, p, 0); 1323 deactivate_task(src_rq, p, 0);
1314 set_task_cpu(p, cpu); 1324 set_task_cpu(p, cpu);
1315 activate_task(dst_rq, p, 0); 1325 activate_task(dst_rq, p, 0);
1326 p->on_rq = TASK_ON_RQ_QUEUED;
1316 check_preempt_curr(dst_rq, p, 0); 1327 check_preempt_curr(dst_rq, p, 0);
1317 } else { 1328 } else {
1318 /* 1329 /*
@@ -2194,6 +2205,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
2194 p->se.vruntime = 0; 2205 p->se.vruntime = 0;
2195 INIT_LIST_HEAD(&p->se.group_node); 2206 INIT_LIST_HEAD(&p->se.group_node);
2196 2207
2208#ifdef CONFIG_FAIR_GROUP_SCHED
2209 p->se.cfs_rq = NULL;
2210#endif
2211
2197#ifdef CONFIG_SCHEDSTATS 2212#ifdef CONFIG_SCHEDSTATS
2198 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2213 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2199#endif 2214#endif
@@ -7442,6 +7457,9 @@ int in_sched_functions(unsigned long addr)
7442 */ 7457 */
7443struct task_group root_task_group; 7458struct task_group root_task_group;
7444LIST_HEAD(task_groups); 7459LIST_HEAD(task_groups);
7460
7461/* Cacheline aligned slab cache for task_group */
7462static struct kmem_cache *task_group_cache __read_mostly;
7445#endif 7463#endif
7446 7464
7447DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); 7465DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
@@ -7499,11 +7517,12 @@ void __init sched_init(void)
7499#endif /* CONFIG_RT_GROUP_SCHED */ 7517#endif /* CONFIG_RT_GROUP_SCHED */
7500 7518
7501#ifdef CONFIG_CGROUP_SCHED 7519#ifdef CONFIG_CGROUP_SCHED
7520 task_group_cache = KMEM_CACHE(task_group, 0);
7521
7502 list_add(&root_task_group.list, &task_groups); 7522 list_add(&root_task_group.list, &task_groups);
7503 INIT_LIST_HEAD(&root_task_group.children); 7523 INIT_LIST_HEAD(&root_task_group.children);
7504 INIT_LIST_HEAD(&root_task_group.siblings); 7524 INIT_LIST_HEAD(&root_task_group.siblings);
7505 autogroup_init(&init_task); 7525 autogroup_init(&init_task);
7506
7507#endif /* CONFIG_CGROUP_SCHED */ 7526#endif /* CONFIG_CGROUP_SCHED */
7508 7527
7509 for_each_possible_cpu(i) { 7528 for_each_possible_cpu(i) {
@@ -7784,7 +7803,7 @@ static void free_sched_group(struct task_group *tg)
7784 free_fair_sched_group(tg); 7803 free_fair_sched_group(tg);
7785 free_rt_sched_group(tg); 7804 free_rt_sched_group(tg);
7786 autogroup_free(tg); 7805 autogroup_free(tg);
7787 kfree(tg); 7806 kmem_cache_free(task_group_cache, tg);
7788} 7807}
7789 7808
7790/* allocate runqueue etc for a new task group */ 7809/* allocate runqueue etc for a new task group */
@@ -7792,7 +7811,7 @@ struct task_group *sched_create_group(struct task_group *parent)
7792{ 7811{
7793 struct task_group *tg; 7812 struct task_group *tg;
7794 7813
7795 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 7814 tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
7796 if (!tg) 7815 if (!tg)
7797 return ERR_PTR(-ENOMEM); 7816 return ERR_PTR(-ENOMEM);
7798 7817
@@ -8697,3 +8716,44 @@ void dump_cpu_task(int cpu)
8697 pr_info("Task dump for CPU %d:\n", cpu); 8716 pr_info("Task dump for CPU %d:\n", cpu);
8698 sched_show_task(cpu_curr(cpu)); 8717 sched_show_task(cpu_curr(cpu));
8699} 8718}
8719
8720/*
8721 * Nice levels are multiplicative, with a gentle 10% change for every
8722 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
8723 * nice 1, it will get ~10% less CPU time than another CPU-bound task
8724 * that remained on nice 0.
8725 *
8726 * The "10% effect" is relative and cumulative: from _any_ nice level,
8727 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
8728 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
8729 * If a task goes up by ~10% and another task goes down by ~10% then
8730 * the relative distance between them is ~25%.)
8731 */
8732const int sched_prio_to_weight[40] = {
8733 /* -20 */ 88761, 71755, 56483, 46273, 36291,
8734 /* -15 */ 29154, 23254, 18705, 14949, 11916,
8735 /* -10 */ 9548, 7620, 6100, 4904, 3906,
8736 /* -5 */ 3121, 2501, 1991, 1586, 1277,
8737 /* 0 */ 1024, 820, 655, 526, 423,
8738 /* 5 */ 335, 272, 215, 172, 137,
8739 /* 10 */ 110, 87, 70, 56, 45,
8740 /* 15 */ 36, 29, 23, 18, 15,
8741};
8742
8743/*
8744 * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
8745 *
8746 * In cases where the weight does not change often, we can use the
8747 * precalculated inverse to speed up arithmetics by turning divisions
8748 * into multiplications:
8749 */
8750const u32 sched_prio_to_wmult[40] = {
8751 /* -20 */ 48388, 59856, 76040, 92818, 118348,
8752 /* -15 */ 147320, 184698, 229616, 287308, 360437,
8753 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
8754 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
8755 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
8756 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
8757 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
8758 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
8759};
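
The sched_prio_to_weight[]/sched_prio_to_wmult[] tables moved out of line above encode the ~1.25x-per-nice-level policy described in the comment. A standalone arithmetic check of that ratio and of wmult being roughly 2^32/weight (userspace demo code, not part of the patch; the kernel table is rounded, so the last digit of the computed inverse can differ):

#include <stdio.h>

static const int weight[5] = { 1024, 820, 655, 526, 423 };	/* nice 0..4 */

int main(void)
{
	/* Two CPU-bound tasks at nice 0 and nice 1 split a CPU roughly
	 * 1024/(1024+820) = 55.5% vs 44.5%, i.e. ~10% apart, and each
	 * nice step keeps roughly the 1.25x ratio (1024/820 = 1.249). */
	for (int i = 0; i < 4; i++)
		printf("nice %d -> weight %d, ratio to next %.3f, wmult ~ %u\n",
		       i, weight[i], (double)weight[i] / weight[i + 1],
		       (unsigned)((1ULL << 32) / weight[i]));
	return 0;
}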
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 05de80b48586..d5ff5c6bf829 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -466,7 +466,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
466 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 466 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
467 struct rq *rq = this_rq(); 467 struct rq *rq = this_rq();
468 468
469 if (vtime_accounting_enabled()) 469 if (vtime_accounting_cpu_enabled())
470 return; 470 return;
471 471
472 if (sched_clock_irqtime) { 472 if (sched_clock_irqtime) {
@@ -680,7 +680,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
680{ 680{
681 unsigned long long delta = vtime_delta(tsk); 681 unsigned long long delta = vtime_delta(tsk);
682 682
683 WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING); 683 WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
684 tsk->vtime_snap += delta; 684 tsk->vtime_snap += delta;
685 685
686 /* CHECKME: always safe to convert nsecs to cputime? */ 686 /* CHECKME: always safe to convert nsecs to cputime? */
@@ -696,37 +696,37 @@ static void __vtime_account_system(struct task_struct *tsk)
696 696
697void vtime_account_system(struct task_struct *tsk) 697void vtime_account_system(struct task_struct *tsk)
698{ 698{
699 write_seqlock(&tsk->vtime_seqlock); 699 write_seqcount_begin(&tsk->vtime_seqcount);
700 __vtime_account_system(tsk); 700 __vtime_account_system(tsk);
701 write_sequnlock(&tsk->vtime_seqlock); 701 write_seqcount_end(&tsk->vtime_seqcount);
702} 702}
703 703
704void vtime_gen_account_irq_exit(struct task_struct *tsk) 704void vtime_gen_account_irq_exit(struct task_struct *tsk)
705{ 705{
706 write_seqlock(&tsk->vtime_seqlock); 706 write_seqcount_begin(&tsk->vtime_seqcount);
707 __vtime_account_system(tsk); 707 __vtime_account_system(tsk);
708 if (context_tracking_in_user()) 708 if (context_tracking_in_user())
709 tsk->vtime_snap_whence = VTIME_USER; 709 tsk->vtime_snap_whence = VTIME_USER;
710 write_sequnlock(&tsk->vtime_seqlock); 710 write_seqcount_end(&tsk->vtime_seqcount);
711} 711}
712 712
713void vtime_account_user(struct task_struct *tsk) 713void vtime_account_user(struct task_struct *tsk)
714{ 714{
715 cputime_t delta_cpu; 715 cputime_t delta_cpu;
716 716
717 write_seqlock(&tsk->vtime_seqlock); 717 write_seqcount_begin(&tsk->vtime_seqcount);
718 delta_cpu = get_vtime_delta(tsk); 718 delta_cpu = get_vtime_delta(tsk);
719 tsk->vtime_snap_whence = VTIME_SYS; 719 tsk->vtime_snap_whence = VTIME_SYS;
720 account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); 720 account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
721 write_sequnlock(&tsk->vtime_seqlock); 721 write_seqcount_end(&tsk->vtime_seqcount);
722} 722}
723 723
724void vtime_user_enter(struct task_struct *tsk) 724void vtime_user_enter(struct task_struct *tsk)
725{ 725{
726 write_seqlock(&tsk->vtime_seqlock); 726 write_seqcount_begin(&tsk->vtime_seqcount);
727 __vtime_account_system(tsk); 727 __vtime_account_system(tsk);
728 tsk->vtime_snap_whence = VTIME_USER; 728 tsk->vtime_snap_whence = VTIME_USER;
729 write_sequnlock(&tsk->vtime_seqlock); 729 write_seqcount_end(&tsk->vtime_seqcount);
730} 730}
731 731
732void vtime_guest_enter(struct task_struct *tsk) 732void vtime_guest_enter(struct task_struct *tsk)
@@ -738,19 +738,19 @@ void vtime_guest_enter(struct task_struct *tsk)
738 * synchronization against the reader (task_gtime()) 738 * synchronization against the reader (task_gtime())
739 * that can thus safely catch up with a tickless delta. 739 * that can thus safely catch up with a tickless delta.
740 */ 740 */
741 write_seqlock(&tsk->vtime_seqlock); 741 write_seqcount_begin(&tsk->vtime_seqcount);
742 __vtime_account_system(tsk); 742 __vtime_account_system(tsk);
743 current->flags |= PF_VCPU; 743 current->flags |= PF_VCPU;
744 write_sequnlock(&tsk->vtime_seqlock); 744 write_seqcount_end(&tsk->vtime_seqcount);
745} 745}
746EXPORT_SYMBOL_GPL(vtime_guest_enter); 746EXPORT_SYMBOL_GPL(vtime_guest_enter);
747 747
748void vtime_guest_exit(struct task_struct *tsk) 748void vtime_guest_exit(struct task_struct *tsk)
749{ 749{
750 write_seqlock(&tsk->vtime_seqlock); 750 write_seqcount_begin(&tsk->vtime_seqcount);
751 __vtime_account_system(tsk); 751 __vtime_account_system(tsk);
752 current->flags &= ~PF_VCPU; 752 current->flags &= ~PF_VCPU;
753 write_sequnlock(&tsk->vtime_seqlock); 753 write_seqcount_end(&tsk->vtime_seqcount);
754} 754}
755EXPORT_SYMBOL_GPL(vtime_guest_exit); 755EXPORT_SYMBOL_GPL(vtime_guest_exit);
756 756
@@ -763,24 +763,26 @@ void vtime_account_idle(struct task_struct *tsk)
763 763
764void arch_vtime_task_switch(struct task_struct *prev) 764void arch_vtime_task_switch(struct task_struct *prev)
765{ 765{
766 write_seqlock(&prev->vtime_seqlock); 766 write_seqcount_begin(&prev->vtime_seqcount);
767 prev->vtime_snap_whence = VTIME_SLEEPING; 767 prev->vtime_snap_whence = VTIME_INACTIVE;
768 write_sequnlock(&prev->vtime_seqlock); 768 write_seqcount_end(&prev->vtime_seqcount);
769 769
770 write_seqlock(&current->vtime_seqlock); 770 write_seqcount_begin(&current->vtime_seqcount);
771 current->vtime_snap_whence = VTIME_SYS; 771 current->vtime_snap_whence = VTIME_SYS;
772 current->vtime_snap = sched_clock_cpu(smp_processor_id()); 772 current->vtime_snap = sched_clock_cpu(smp_processor_id());
773 write_sequnlock(&current->vtime_seqlock); 773 write_seqcount_end(&current->vtime_seqcount);
774} 774}
775 775
776void vtime_init_idle(struct task_struct *t, int cpu) 776void vtime_init_idle(struct task_struct *t, int cpu)
777{ 777{
778 unsigned long flags; 778 unsigned long flags;
779 779
780 write_seqlock_irqsave(&t->vtime_seqlock, flags); 780 local_irq_save(flags);
781 write_seqcount_begin(&t->vtime_seqcount);
781 t->vtime_snap_whence = VTIME_SYS; 782 t->vtime_snap_whence = VTIME_SYS;
782 t->vtime_snap = sched_clock_cpu(cpu); 783 t->vtime_snap = sched_clock_cpu(cpu);
783 write_sequnlock_irqrestore(&t->vtime_seqlock, flags); 784 write_seqcount_end(&t->vtime_seqcount);
785 local_irq_restore(flags);
784} 786}
785 787
786cputime_t task_gtime(struct task_struct *t) 788cputime_t task_gtime(struct task_struct *t)
@@ -788,17 +790,17 @@ cputime_t task_gtime(struct task_struct *t)
788 unsigned int seq; 790 unsigned int seq;
789 cputime_t gtime; 791 cputime_t gtime;
790 792
791 if (!context_tracking_is_enabled()) 793 if (!vtime_accounting_enabled())
792 return t->gtime; 794 return t->gtime;
793 795
794 do { 796 do {
795 seq = read_seqbegin(&t->vtime_seqlock); 797 seq = read_seqcount_begin(&t->vtime_seqcount);
796 798
797 gtime = t->gtime; 799 gtime = t->gtime;
798 if (t->flags & PF_VCPU) 800 if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU)
799 gtime += vtime_delta(t); 801 gtime += vtime_delta(t);
800 802
801 } while (read_seqretry(&t->vtime_seqlock, seq)); 803 } while (read_seqcount_retry(&t->vtime_seqcount, seq));
802 804
803 return gtime; 805 return gtime;
804} 806}
@@ -821,7 +823,7 @@ fetch_task_cputime(struct task_struct *t,
821 *udelta = 0; 823 *udelta = 0;
822 *sdelta = 0; 824 *sdelta = 0;
823 825
824 seq = read_seqbegin(&t->vtime_seqlock); 826 seq = read_seqcount_begin(&t->vtime_seqcount);
825 827
826 if (u_dst) 828 if (u_dst)
827 *u_dst = *u_src; 829 *u_dst = *u_src;
@@ -829,7 +831,7 @@ fetch_task_cputime(struct task_struct *t,
829 *s_dst = *s_src; 831 *s_dst = *s_src;
830 832
831 /* Task is sleeping, nothing to add */ 833 /* Task is sleeping, nothing to add */
832 if (t->vtime_snap_whence == VTIME_SLEEPING || 834 if (t->vtime_snap_whence == VTIME_INACTIVE ||
833 is_idle_task(t)) 835 is_idle_task(t))
834 continue; 836 continue;
835 837
@@ -845,7 +847,7 @@ fetch_task_cputime(struct task_struct *t,
845 if (t->vtime_snap_whence == VTIME_SYS) 847 if (t->vtime_snap_whence == VTIME_SYS)
846 *sdelta = delta; 848 *sdelta = delta;
847 } 849 }
848 } while (read_seqretry(&t->vtime_seqlock, seq)); 850 } while (read_seqcount_retry(&t->vtime_seqcount, seq));
849} 851}
850 852
851 853
@@ -853,6 +855,14 @@ void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
853{ 855{
854 cputime_t udelta, sdelta; 856 cputime_t udelta, sdelta;
855 857
858 if (!vtime_accounting_enabled()) {
859 if (utime)
860 *utime = t->utime;
861 if (stime)
862 *stime = t->stime;
863 return;
864 }
865
856 fetch_task_cputime(t, utime, stime, &t->utime, 866 fetch_task_cputime(t, utime, stime, &t->utime,
857 &t->stime, &udelta, &sdelta); 867 &t->stime, &udelta, &sdelta);
858 if (utime) 868 if (utime)
@@ -866,6 +876,14 @@ void task_cputime_scaled(struct task_struct *t,
866{ 876{
867 cputime_t udelta, sdelta; 877 cputime_t udelta, sdelta;
868 878
879 if (!vtime_accounting_enabled()) {
880 if (utimescaled)
881 *utimescaled = t->utimescaled;
882 if (stimescaled)
883 *stimescaled = t->stimescaled;
884 return;
885 }
886
869 fetch_task_cputime(t, utimescaled, stimescaled, 887 fetch_task_cputime(t, utimescaled, stimescaled,
870 &t->utimescaled, &t->stimescaled, &udelta, &sdelta); 888 &t->utimescaled, &t->stimescaled, &udelta, &sdelta);
871 if (utimescaled) 889 if (utimescaled)
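
The cputime.c hunks above replace the per-task vtime seqlock with a plain seqcount: the writer is already serialized by context (only the owning task/CPU updates its own vtime, and vtime_init_idle() now disables IRQs itself), so readers only need the retry loop. A generic sketch of that pattern (my_stats and its fields are invented for illustration, not code from the patch):

#include <linux/seqlock.h>

struct my_stats {
	seqcount_t seq;		/* initialized with seqcount_init()	*/
	u64 a, b;
};

/* Writer: already serialized by context, so unlike seqlock_t no
 * spinlock is taken around the update. */
static void my_stats_update(struct my_stats *s, u64 a, u64 b)
{
	write_seqcount_begin(&s->seq);
	s->a = a;
	s->b = b;
	write_seqcount_end(&s->seq);
}

/* Reader: retries if it raced with a writer, mirroring task_gtime(). */
static u64 my_stats_sum(struct my_stats *s)
{
	unsigned int seq;
	u64 sum;

	do {
		seq = read_seqcount_begin(&s->seq);
		sum = s->a + s->b;
	} while (read_seqcount_retry(&s->seq, seq));

	return sum;
}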
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 8b0a15e285f9..cd64c979d0e1 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -176,8 +176,10 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
176 } 176 }
177 } 177 }
178 178
179 if (leftmost) 179 if (leftmost) {
180 dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks; 180 dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks;
181 dl_rq->earliest_dl.next = p->dl.deadline;
182 }
181 183
182 rb_link_node(&p->pushable_dl_tasks, parent, link); 184 rb_link_node(&p->pushable_dl_tasks, parent, link);
183 rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); 185 rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
@@ -195,6 +197,10 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
195 197
196 next_node = rb_next(&p->pushable_dl_tasks); 198 next_node = rb_next(&p->pushable_dl_tasks);
197 dl_rq->pushable_dl_tasks_leftmost = next_node; 199 dl_rq->pushable_dl_tasks_leftmost = next_node;
200 if (next_node) {
201 dl_rq->earliest_dl.next = rb_entry(next_node,
202 struct task_struct, pushable_dl_tasks)->dl.deadline;
203 }
198 } 204 }
199 205
200 rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); 206 rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
@@ -782,42 +788,14 @@ static void update_curr_dl(struct rq *rq)
782 788
783#ifdef CONFIG_SMP 789#ifdef CONFIG_SMP
784 790
785static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu);
786
787static inline u64 next_deadline(struct rq *rq)
788{
789 struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu);
790
791 if (next && dl_prio(next->prio))
792 return next->dl.deadline;
793 else
794 return 0;
795}
796
797static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) 791static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
798{ 792{
799 struct rq *rq = rq_of_dl_rq(dl_rq); 793 struct rq *rq = rq_of_dl_rq(dl_rq);
800 794
801 if (dl_rq->earliest_dl.curr == 0 || 795 if (dl_rq->earliest_dl.curr == 0 ||
802 dl_time_before(deadline, dl_rq->earliest_dl.curr)) { 796 dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
803 /*
804 * If the dl_rq had no -deadline tasks, or if the new task
805 * has shorter deadline than the current one on dl_rq, we
806 * know that the previous earliest becomes our next earliest,
807 * as the new task becomes the earliest itself.
808 */
809 dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr;
810 dl_rq->earliest_dl.curr = deadline; 797 dl_rq->earliest_dl.curr = deadline;
811 cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); 798 cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
812 } else if (dl_rq->earliest_dl.next == 0 ||
813 dl_time_before(deadline, dl_rq->earliest_dl.next)) {
814 /*
815 * On the other hand, if the new -deadline task has a
816 * a later deadline than the earliest one on dl_rq, but
817 * it is earlier than the next (if any), we must
818 * recompute the next-earliest.
819 */
820 dl_rq->earliest_dl.next = next_deadline(rq);
821 } 799 }
822} 800}
823 801
@@ -839,7 +817,6 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
839 817
840 entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); 818 entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
841 dl_rq->earliest_dl.curr = entry->deadline; 819 dl_rq->earliest_dl.curr = entry->deadline;
842 dl_rq->earliest_dl.next = next_deadline(rq);
843 cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); 820 cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1);
844 } 821 }
845} 822}
@@ -1274,28 +1251,6 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
1274 return 0; 1251 return 0;
1275} 1252}
1276 1253
1277/* Returns the second earliest -deadline task, NULL otherwise */
1278static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu)
1279{
1280 struct rb_node *next_node = rq->dl.rb_leftmost;
1281 struct sched_dl_entity *dl_se;
1282 struct task_struct *p = NULL;
1283
1284next_node:
1285 next_node = rb_next(next_node);
1286 if (next_node) {
1287 dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node);
1288 p = dl_task_of(dl_se);
1289
1290 if (pick_dl_task(rq, p, cpu))
1291 return p;
1292
1293 goto next_node;
1294 }
1295
1296 return NULL;
1297}
1298
1299/* 1254/*
1300 * Return the earliest pushable rq's task, which is suitable to be executed 1255 * Return the earliest pushable rq's task, which is suitable to be executed
1301 * on the CPU, NULL otherwise: 1256 * on the CPU, NULL otherwise:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cfdc0e61066c..1926606ece80 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -738,12 +738,56 @@ static void update_curr_fair(struct rq *rq)
738 update_curr(cfs_rq_of(&rq->curr->se)); 738 update_curr(cfs_rq_of(&rq->curr->se));
739} 739}
740 740
741#ifdef CONFIG_SCHEDSTATS
742static inline void
743update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
744{
745 u64 wait_start = rq_clock(rq_of(cfs_rq));
746
747 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
748 likely(wait_start > se->statistics.wait_start))
749 wait_start -= se->statistics.wait_start;
750
751 se->statistics.wait_start = wait_start;
752}
753
754static void
755update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
756{
757 struct task_struct *p;
758 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
759
760 if (entity_is_task(se)) {
761 p = task_of(se);
762 if (task_on_rq_migrating(p)) {
763 /*
764 * Preserve migrating task's wait time so wait_start
765 * time stamp can be adjusted to accumulate wait time
766 * prior to migration.
767 */
768 se->statistics.wait_start = delta;
769 return;
770 }
771 trace_sched_stat_wait(p, delta);
772 }
773
774 se->statistics.wait_max = max(se->statistics.wait_max, delta);
775 se->statistics.wait_count++;
776 se->statistics.wait_sum += delta;
777 se->statistics.wait_start = 0;
778}
779#else
741static inline void 780static inline void
742update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 781update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
743{ 782{
744 schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
745} 783}
746 784
785static inline void
786update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
787{
788}
789#endif
790
747/* 791/*
748 * Task is being enqueued - update stats: 792 * Task is being enqueued - update stats:
749 */ 793 */
@@ -757,23 +801,6 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
757 update_stats_wait_start(cfs_rq, se); 801 update_stats_wait_start(cfs_rq, se);
758} 802}
759 803
760static void
761update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
762{
763 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
764 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
765 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
766 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
767 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
768#ifdef CONFIG_SCHEDSTATS
769 if (entity_is_task(se)) {
770 trace_sched_stat_wait(task_of(se),
771 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
772 }
773#endif
774 schedstat_set(se->statistics.wait_start, 0);
775}
776
777static inline void 804static inline void
778update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) 805update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
779{ 806{
@@ -2155,6 +2182,7 @@ void task_numa_work(struct callback_head *work)
2155 unsigned long migrate, next_scan, now = jiffies; 2182 unsigned long migrate, next_scan, now = jiffies;
2156 struct task_struct *p = current; 2183 struct task_struct *p = current;
2157 struct mm_struct *mm = p->mm; 2184 struct mm_struct *mm = p->mm;
2185 u64 runtime = p->se.sum_exec_runtime;
2158 struct vm_area_struct *vma; 2186 struct vm_area_struct *vma;
2159 unsigned long start, end; 2187 unsigned long start, end;
2160 unsigned long nr_pte_updates = 0; 2188 unsigned long nr_pte_updates = 0;
@@ -2277,6 +2305,17 @@ out:
2277 else 2305 else
2278 reset_ptenuma_scan(p); 2306 reset_ptenuma_scan(p);
2279 up_read(&mm->mmap_sem); 2307 up_read(&mm->mmap_sem);
2308
2309 /*
2310 * Make sure tasks use at least 32x as much time to run other code
2311 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
2312 * Usually update_task_scan_period slows down scanning enough; on an
2313 * overloaded system we need to limit overhead on a per task basis.
2314 */
2315 if (unlikely(p->se.sum_exec_runtime != runtime)) {
2316 u64 diff = p->se.sum_exec_runtime - runtime;
2317 p->node_stamp += 32 * diff;
2318 }
2280} 2319}
2281 2320
2282/* 2321/*
@@ -2670,12 +2709,64 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
2670{ 2709{
2671 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; 2710 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
2672 2711
2712 /*
2713 * No need to update load_avg for root_task_group as it is not used.
2714 */
2715 if (cfs_rq->tg == &root_task_group)
2716 return;
2717
2673 if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { 2718 if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
2674 atomic_long_add(delta, &cfs_rq->tg->load_avg); 2719 atomic_long_add(delta, &cfs_rq->tg->load_avg);
2675 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; 2720 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
2676 } 2721 }
2677} 2722}
2678 2723
2724/*
2725 * Called within set_task_rq() right before setting a task's cpu. The
2726 * caller only guarantees p->pi_lock is held; no other assumptions,
2727 * including the state of rq->lock, should be made.
2728 */
2729void set_task_rq_fair(struct sched_entity *se,
2730 struct cfs_rq *prev, struct cfs_rq *next)
2731{
2732 if (!sched_feat(ATTACH_AGE_LOAD))
2733 return;
2734
2735 /*
2736 * We are supposed to update the task to "current" time, then its up to
2737 * date and ready to go to new CPU/cfs_rq. But we have difficulty in
2738 * getting what current time is, so simply throw away the out-of-date
2739 * time. This will result in the wakee task is less decayed, but giving
2740 * the wakee more load sounds not bad.
2741 */
2742 if (se->avg.last_update_time && prev) {
2743 u64 p_last_update_time;
2744 u64 n_last_update_time;
2745
2746#ifndef CONFIG_64BIT
2747 u64 p_last_update_time_copy;
2748 u64 n_last_update_time_copy;
2749
2750 do {
2751 p_last_update_time_copy = prev->load_last_update_time_copy;
2752 n_last_update_time_copy = next->load_last_update_time_copy;
2753
2754 smp_rmb();
2755
2756 p_last_update_time = prev->avg.last_update_time;
2757 n_last_update_time = next->avg.last_update_time;
2758
2759 } while (p_last_update_time != p_last_update_time_copy ||
2760 n_last_update_time != n_last_update_time_copy);
2761#else
2762 p_last_update_time = prev->avg.last_update_time;
2763 n_last_update_time = next->avg.last_update_time;
2764#endif
2765 __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
2766 &se->avg, 0, 0, NULL);
2767 se->avg.last_update_time = n_last_update_time;
2768 }
2769}
2679#else /* CONFIG_FAIR_GROUP_SCHED */ 2770#else /* CONFIG_FAIR_GROUP_SCHED */
2680static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} 2771static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
2681#endif /* CONFIG_FAIR_GROUP_SCHED */ 2772#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -2809,48 +2900,48 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2809 max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); 2900 max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
2810} 2901}
2811 2902
2812/*
2813 * Task first catches up with cfs_rq, and then subtract
2814 * itself from the cfs_rq (task must be off the queue now).
2815 */
2816void remove_entity_load_avg(struct sched_entity *se)
2817{
2818 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2819 u64 last_update_time;
2820
2821#ifndef CONFIG_64BIT 2903#ifndef CONFIG_64BIT
2904static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
2905{
2822 u64 last_update_time_copy; 2906 u64 last_update_time_copy;
2907 u64 last_update_time;
2823 2908
2824 do { 2909 do {
2825 last_update_time_copy = cfs_rq->load_last_update_time_copy; 2910 last_update_time_copy = cfs_rq->load_last_update_time_copy;
2826 smp_rmb(); 2911 smp_rmb();
2827 last_update_time = cfs_rq->avg.last_update_time; 2912 last_update_time = cfs_rq->avg.last_update_time;
2828 } while (last_update_time != last_update_time_copy); 2913 } while (last_update_time != last_update_time_copy);
2829#else
2830 last_update_time = cfs_rq->avg.last_update_time;
2831#endif
2832 2914
2833 __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); 2915 return last_update_time;
2834 atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
2835 atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
2836} 2916}
2837 2917#else
2838/* 2918static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
2839 * Update the rq's load with the elapsed running time before entering
2840 * idle. if the last scheduled task is not a CFS task, idle_enter will
2841 * be the only way to update the runnable statistic.
2842 */
2843void idle_enter_fair(struct rq *this_rq)
2844{ 2919{
2920 return cfs_rq->avg.last_update_time;
2845} 2921}
2922#endif
2846 2923
2847/* 2924/*
2848 * Update the rq's load with the elapsed idle time before a task is 2925 * Task first catches up with cfs_rq, and then subtract
2849 * scheduled. if the newly scheduled task is not a CFS task, idle_exit will 2926 * itself from the cfs_rq (task must be off the queue now).
2850 * be the only way to update the runnable statistic.
2851 */ 2927 */
2852void idle_exit_fair(struct rq *this_rq) 2928void remove_entity_load_avg(struct sched_entity *se)
2853{ 2929{
2930 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2931 u64 last_update_time;
2932
2933 /*
2934 * Newly created task or never used group entity should not be removed
2935 * from its (source) cfs_rq
2936 */
2937 if (se->avg.last_update_time == 0)
2938 return;
2939
2940 last_update_time = cfs_rq_last_update_time(cfs_rq);
2941
2942 __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
2943 atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
2944 atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
2854} 2945}
2855 2946
2856static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) 2947static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
@@ -4240,42 +4331,37 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4240 */ 4331 */
4241 4332
4242/* 4333/*
4243 * The exact cpuload at various idx values, calculated at every tick would be 4334 * The exact cpuload calculated at every tick would be:
4244 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load 4335 *
4336 * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
4245 * 4337 *
4246 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called 4338 * If a cpu misses updates for n ticks (as it was idle) and update gets
4247 * on nth tick when cpu may be busy, then we have: 4339 * called on the n+1-th tick when cpu may be busy, then we have:
4248 * load = ((2^idx - 1) / 2^idx)^(n-1) * load 4340 *
4249 * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load 4341 * load_n = (1 - 1/2^i)^n * load_0
4342 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
4250 * 4343 *
4251 * decay_load_missed() below does efficient calculation of 4344 * decay_load_missed() below does efficient calculation of
4252 * load = ((2^idx - 1) / 2^idx)^(n-1) * load 4345 *
4253 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load 4346 * load' = (1 - 1/2^i)^n * load
4347 *
4348 * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
4349 * This allows us to precompute the above in said factors, thereby allowing the
4350 * reduction of an arbitrary n in O(log_2 n) steps. (See also
4351 * fixed_power_int())
4254 * 4352 *
4255 * The calculation is approximated on a 128 point scale. 4353 * The calculation is approximated on a 128 point scale.
4256 * degrade_zero_ticks is the number of ticks after which load at any
4257 * particular idx is approximated to be zero.
4258 * degrade_factor is a precomputed table, a row for each load idx.
4259 * Each column corresponds to degradation factor for a power of two ticks,
4260 * based on 128 point scale.
4261 * Example:
4262 * row 2, col 3 (=12) says that the degradation at load idx 2 after
4263 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
4264 *
4265 * With this power of 2 load factors, we can degrade the load n times
4266 * by looking at 1 bits in n and doing as many mult/shift instead of
4267 * n mult/shifts needed by the exact degradation.
4268 */ 4354 */
4269#define DEGRADE_SHIFT 7 4355#define DEGRADE_SHIFT 7
4270static const unsigned char 4356
4271 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; 4357static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
4272static const unsigned char 4358static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
4273 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { 4359 { 0, 0, 0, 0, 0, 0, 0, 0 },
4274 {0, 0, 0, 0, 0, 0, 0, 0}, 4360 { 64, 32, 8, 0, 0, 0, 0, 0 },
4275 {64, 32, 8, 0, 0, 0, 0, 0}, 4361 { 96, 72, 40, 12, 1, 0, 0, 0 },
4276 {96, 72, 40, 12, 1, 0, 0}, 4362 { 112, 98, 75, 43, 15, 1, 0, 0 },
4277 {112, 98, 75, 43, 15, 1, 0}, 4363 { 120, 112, 98, 76, 45, 16, 2, 0 }
4278 {120, 112, 98, 76, 45, 16, 2} }; 4364};
4279 4365
4280/* 4366/*
4281 * Update cpu_load for any missed ticks, due to tickless idle. The backlog 4367 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
@@ -4306,14 +4392,46 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
4306 return load; 4392 return load;
4307} 4393}
4308 4394
4309/* 4395/**
4396 * __update_cpu_load - update the rq->cpu_load[] statistics
4397 * @this_rq: The rq to update statistics for
4398 * @this_load: The current load
4399 * @pending_updates: The number of missed updates
4400 * @active: !0 for NOHZ_FULL
4401 *
4310 * Update rq->cpu_load[] statistics. This function is usually called every 4402 * Update rq->cpu_load[] statistics. This function is usually called every
4311 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 4403 * scheduler tick (TICK_NSEC).
4312 * every tick. We fix it up based on jiffies. 4404 *
4405 * This function computes a decaying average:
4406 *
4407 * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
4408 *
4409 * Because of NOHZ it might not get called on every tick which gives need for
4410 * the @pending_updates argument.
4411 *
4412 * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
4413 * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
4414 * = A * (A * load[i]_n-2 + B) + B
4415 * = A * (A * (A * load[i]_n-3 + B) + B) + B
4416 * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
4417 * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
4418 * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
4419 * = (1 - 1/2^i)^n * (load[i]_0 - load) + load
4420 *
4421 * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
4422 * any change in load would have resulted in the tick being turned back on.
4423 *
4424 * For regular NOHZ, this reduces to:
4425 *
4426 * load[i]_n = (1 - 1/2^i)^n * load[i]_0
4427 *
4428 * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
4429 * term. See the @active paramter.
4313 */ 4430 */
4314static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, 4431static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
4315 unsigned long pending_updates) 4432 unsigned long pending_updates, int active)
4316{ 4433{
4434 unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0;
4317 int i, scale; 4435 int i, scale;
4318 4436
4319 this_rq->nr_load_updates++; 4437 this_rq->nr_load_updates++;
@@ -4325,8 +4443,9 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
4325 4443
4326 /* scale is effectively 1 << i now, and >> i divides by scale */ 4444 /* scale is effectively 1 << i now, and >> i divides by scale */
4327 4445
4328 old_load = this_rq->cpu_load[i]; 4446 old_load = this_rq->cpu_load[i] - tickless_load;
4329 old_load = decay_load_missed(old_load, pending_updates - 1, i); 4447 old_load = decay_load_missed(old_load, pending_updates - 1, i);
4448 old_load += tickless_load;
4330 new_load = this_load; 4449 new_load = this_load;
4331 /* 4450 /*
4332 * Round up the averaging division if load is increasing. This 4451 * Round up the averaging division if load is increasing. This
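
The two hunks above document how missed NOHZ ticks are folded into the cpu_load[] decaying average, with decay_load_missed() decomposing (1 - 1/2^i)^n over the set bits of n using the 128-point degrade_factor[] rows. A standalone sketch of that decomposition for idx == 2 (userspace demo values, not code from the patch; the result only approximates the exact per-tick decay because of the 128-point scaling):

#include <stdio.h>

#define DEGRADE_SHIFT 7			/* factors are on a 128-point scale */

/* Row for idx == 2 of the degrade_factor[][] table above: the factor for
 * 1, 2, 4, ... missed ticks, i.e. (3/4)^(2^j) scaled by 128. */
static const unsigned char factor_idx2[DEGRADE_SHIFT + 1] =
	{ 96, 72, 40, 12, 1, 0, 0, 0 };

static unsigned long decay_missed_idx2(unsigned long load, unsigned long n)
{
	/* Walk the set bits of n: x^n is the product of x^(2^j) for each
	 * set bit j, so an arbitrary n costs O(log n) multiply+shift steps. */
	for (int j = 0; n; j++, n >>= 1)
		if (n & 1)
			load = (load * factor_idx2[j]) >> DEGRADE_SHIFT;
	return load;
}

int main(void)
{
	unsigned long load = 1000, n = 5, exact = load;

	/* Exact per-tick decay at idx 2: load' = (1 - 1/2^2) * load. */
	for (unsigned long k = 0; k < n; k++)
		exact = exact * 3 / 4;

	printf("fast=%lu exact=%lu\n", decay_missed_idx2(load, n), exact);
	return 0;
}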
@@ -4381,16 +4500,17 @@ static void update_idle_cpu_load(struct rq *this_rq)
4381 pending_updates = curr_jiffies - this_rq->last_load_update_tick; 4500 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4382 this_rq->last_load_update_tick = curr_jiffies; 4501 this_rq->last_load_update_tick = curr_jiffies;
4383 4502
4384 __update_cpu_load(this_rq, load, pending_updates); 4503 __update_cpu_load(this_rq, load, pending_updates, 0);
4385} 4504}
4386 4505
4387/* 4506/*
4388 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. 4507 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
4389 */ 4508 */
4390void update_cpu_load_nohz(void) 4509void update_cpu_load_nohz(int active)
4391{ 4510{
4392 struct rq *this_rq = this_rq(); 4511 struct rq *this_rq = this_rq();
4393 unsigned long curr_jiffies = READ_ONCE(jiffies); 4512 unsigned long curr_jiffies = READ_ONCE(jiffies);
4513 unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
4394 unsigned long pending_updates; 4514 unsigned long pending_updates;
4395 4515
4396 if (curr_jiffies == this_rq->last_load_update_tick) 4516 if (curr_jiffies == this_rq->last_load_update_tick)
@@ -4401,10 +4521,11 @@ void update_cpu_load_nohz(void)
4401 if (pending_updates) { 4521 if (pending_updates) {
4402 this_rq->last_load_update_tick = curr_jiffies; 4522 this_rq->last_load_update_tick = curr_jiffies;
4403 /* 4523 /*
4404 * We were idle, this means load 0, the current load might be 4524 * In the regular NOHZ case, we were idle, this means load 0.
4405 * !0 due to remote wakeups and the sort. 4525 * In the NOHZ_FULL case, we were non-idle, we should consider
4526 * its weighted load.
4406 */ 4527 */
4407 __update_cpu_load(this_rq, 0, pending_updates); 4528 __update_cpu_load(this_rq, load, pending_updates, active);
4408 } 4529 }
4409 raw_spin_unlock(&this_rq->lock); 4530 raw_spin_unlock(&this_rq->lock);
4410} 4531}
@@ -4420,7 +4541,7 @@ void update_cpu_load_active(struct rq *this_rq)
4420 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). 4541 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
4421 */ 4542 */
4422 this_rq->last_load_update_tick = jiffies; 4543 this_rq->last_load_update_tick = jiffies;
4423 __update_cpu_load(this_rq, load, 1); 4544 __update_cpu_load(this_rq, load, 1, 1);
4424} 4545}
4425 4546
4426/* 4547/*
@@ -5007,8 +5128,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
5007/* 5128/*
5008 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 5129 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
5009 * cfs_rq_of(p) references at time of call are still valid and identify the 5130 * cfs_rq_of(p) references at time of call are still valid and identify the
5010 * previous cpu. However, the caller only guarantees p->pi_lock is held; no 5131 * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
5011 * other assumptions, including the state of rq->lock, should be made.
5012 */ 5132 */
5013static void migrate_task_rq_fair(struct task_struct *p) 5133static void migrate_task_rq_fair(struct task_struct *p)
5014{ 5134{
@@ -5721,8 +5841,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
5721{ 5841{
5722 lockdep_assert_held(&env->src_rq->lock); 5842 lockdep_assert_held(&env->src_rq->lock);
5723 5843
5724 deactivate_task(env->src_rq, p, 0);
5725 p->on_rq = TASK_ON_RQ_MIGRATING; 5844 p->on_rq = TASK_ON_RQ_MIGRATING;
5845 deactivate_task(env->src_rq, p, 0);
5726 set_task_cpu(p, env->dst_cpu); 5846 set_task_cpu(p, env->dst_cpu);
5727} 5847}
5728 5848
@@ -5855,8 +5975,8 @@ static void attach_task(struct rq *rq, struct task_struct *p)
5855 lockdep_assert_held(&rq->lock); 5975 lockdep_assert_held(&rq->lock);
5856 5976
5857 BUG_ON(task_rq(p) != rq); 5977 BUG_ON(task_rq(p) != rq);
5858 p->on_rq = TASK_ON_RQ_QUEUED;
5859 activate_task(rq, p, 0); 5978 activate_task(rq, p, 0);
5979 p->on_rq = TASK_ON_RQ_QUEUED;
5860 check_preempt_curr(rq, p, 0); 5980 check_preempt_curr(rq, p, 0);
5861} 5981}
5862 5982
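
The two hunks above flip the ordering so that p->on_rq reads TASK_ON_RQ_MIGRATING while the task is being dequeued from the source runqueue, and is only set back to TASK_ON_RQ_QUEUED after it has been enqueued on the destination, presumably so that paths which inspect the on_rq state (e.g. task_on_rq_migrating()) see a migration in progress rather than an ordinary sleep or wakeup and can keep the fair load averages consistent. A minimal userspace model of that ordering, with the scheduler internals reduced to prints; the struct and helpers are illustrative, not the kernel's:

#include <stdio.h>

enum on_rq_state { TASK_ON_RQ_QUEUED = 1, TASK_ON_RQ_MIGRATING = 2 };

struct task { enum on_rq_state on_rq; int cpu; };

/*
 * Stand-ins for deactivate_task()/activate_task(): in the kernel these
 * update load tracking and can look at p->on_rq to tell a migration
 * apart from a sleep or wakeup.
 */
static void deactivate(struct task *p)
{
        printf("dequeue on CPU%d, migration visible: %s\n", p->cpu,
               p->on_rq == TASK_ON_RQ_MIGRATING ? "yes" : "no");
}

static void activate(struct task *p)
{
        printf("enqueue on CPU%d, migration visible: %s\n", p->cpu,
               p->on_rq == TASK_ON_RQ_MIGRATING ? "yes" : "no");
}

int main(void)
{
        struct task p = { .on_rq = TASK_ON_RQ_QUEUED, .cpu = 0 };

        /* detach_task(): mark as migrating first, then dequeue. */
        p.on_rq = TASK_ON_RQ_MIGRATING;
        deactivate(&p);

        p.cpu = 1;                          /* set_task_cpu() */

        /* attach_task(): enqueue first, only then mark as queued again. */
        activate(&p);
        p.on_rq = TASK_ON_RQ_QUEUED;
        return 0;
}
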
@@ -6302,7 +6422,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
6302 bool *overload) 6422 bool *overload)
6303{ 6423{
6304 unsigned long load; 6424 unsigned long load;
6305 int i; 6425 int i, nr_running;
6306 6426
6307 memset(sgs, 0, sizeof(*sgs)); 6427 memset(sgs, 0, sizeof(*sgs));
6308 6428
@@ -6319,7 +6439,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
6319 sgs->group_util += cpu_util(i); 6439 sgs->group_util += cpu_util(i);
6320 sgs->sum_nr_running += rq->cfs.h_nr_running; 6440 sgs->sum_nr_running += rq->cfs.h_nr_running;
6321 6441
6322 if (rq->nr_running > 1) 6442 nr_running = rq->nr_running;
6443 if (nr_running > 1)
6323 *overload = true; 6444 *overload = true;
6324 6445
6325#ifdef CONFIG_NUMA_BALANCING 6446#ifdef CONFIG_NUMA_BALANCING
@@ -6327,7 +6448,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
6327 sgs->nr_preferred_running += rq->nr_preferred_running; 6448 sgs->nr_preferred_running += rq->nr_preferred_running;
6328#endif 6449#endif
6329 sgs->sum_weighted_load += weighted_cpuload(i); 6450 sgs->sum_weighted_load += weighted_cpuload(i);
6330 if (idle_cpu(i)) 6451 /*
6452 * No need to call idle_cpu() if nr_running is not 0
6453 */
6454 if (!nr_running && idle_cpu(i))
6331 sgs->idle_cpus++; 6455 sgs->idle_cpus++;
6332 } 6456 }
6333 6457
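
The rewritten condition leans on C's short-circuit evaluation: when the runqueue already has runnable tasks (nr_running != 0), the comparatively expensive idle_cpu() call is skipped entirely. A small standalone demonstration with a counting stub in place of idle_cpu():

#include <stdio.h>

static int idle_cpu_calls;

/* Counting stub for idle_cpu(); the real helper inspects the runqueue. */
static int fake_idle_cpu(int cpu)
{
        (void)cpu;
        idle_cpu_calls++;
        return 1;
}

int main(void)
{
        int nr_running_per_cpu[] = { 0, 2, 1, 0 };
        int idle_cpus = 0;

        for (int i = 0; i < 4; i++) {
                int nr_running = nr_running_per_cpu[i];

                /*
                 * Short-circuit: fake_idle_cpu() only runs for the two
                 * CPUs whose nr_running is 0.
                 */
                if (!nr_running && fake_idle_cpu(i))
                        idle_cpus++;
        }

        printf("%d idle CPUs, idle_cpu() called %d times for 4 CPUs\n",
               idle_cpus, idle_cpu_calls);
        return 0;
}
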
@@ -7248,8 +7372,6 @@ static int idle_balance(struct rq *this_rq)
7248 int pulled_task = 0; 7372 int pulled_task = 0;
7249 u64 curr_cost = 0; 7373 u64 curr_cost = 0;
7250 7374
7251 idle_enter_fair(this_rq);
7252
7253 /* 7375 /*
7254 * We must set idle_stamp _before_ calling idle_balance(), such that we 7376 * We must set idle_stamp _before_ calling idle_balance(), such that we
7255 * measure the duration of idle_balance() as idle time. 7377 * measure the duration of idle_balance() as idle time.
@@ -7330,10 +7452,8 @@ out:
7330 if (this_rq->nr_running != this_rq->cfs.h_nr_running) 7452 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
7331 pulled_task = -1; 7453 pulled_task = -1;
7332 7454
7333 if (pulled_task) { 7455 if (pulled_task)
7334 idle_exit_fair(this_rq);
7335 this_rq->idle_stamp = 0; 7456 this_rq->idle_stamp = 0;
7336 }
7337 7457
7338 return pulled_task; 7458 return pulled_task;
7339} 7459}
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index c4ae0f1fdf9b..47ce94931f1b 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -47,7 +47,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
47 47
48static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) 48static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
49{ 49{
50 idle_exit_fair(rq);
51 rq_last_tick_reset(rq); 50 rq_last_tick_reset(rq);
52} 51}
53 52
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1e0bb4afe3fd..10f16374df7f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -248,7 +248,12 @@ struct task_group {
248 unsigned long shares; 248 unsigned long shares;
249 249
250#ifdef CONFIG_SMP 250#ifdef CONFIG_SMP
251 atomic_long_t load_avg; 251 /*
252 * load_avg can be heavily contended at clock tick time, so put
253 * it in its own cacheline separated from the fields above which
254 * will also be accessed at each tick.
255 */
256 atomic_long_t load_avg ____cacheline_aligned;
252#endif 257#endif
253#endif 258#endif
254 259
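
____cacheline_aligned pushes load_avg onto its own cache line so the per-tick atomic updates do not bounce the line holding the read-mostly task_group fields above it. A userspace sketch of the same idea using C11 alignas, assuming a 64-byte cache line; offsetof shows the padding the annotation introduces:

#include <stdalign.h>
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

#define CACHELINE 64    /* assumed cache-line size, as on most x86 parts */

/*
 * Rough userspace analogue of the task_group layout: the hot, frequently
 * written counter is pushed onto its own cache line.
 */
struct group_stats {
        unsigned long shares;              /* read-mostly fields ...        */
        unsigned long other_fields[4];
        alignas(CACHELINE)
        atomic_long load_avg;              /* ... hot field, hit every tick */
};

int main(void)
{
        size_t off = offsetof(struct group_stats, load_avg);

        printf("offsetof(load_avg) = %zu (cache-line aligned: %s)\n",
               off, off % CACHELINE == 0 ? "yes" : "no");
        printf("sizeof(struct group_stats) = %zu\n",
               sizeof(struct group_stats));
        return 0;
}
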
@@ -335,7 +340,15 @@ extern void sched_move_task(struct task_struct *tsk);
335 340
336#ifdef CONFIG_FAIR_GROUP_SCHED 341#ifdef CONFIG_FAIR_GROUP_SCHED
337extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); 342extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
338#endif 343
344#ifdef CONFIG_SMP
345extern void set_task_rq_fair(struct sched_entity *se,
346 struct cfs_rq *prev, struct cfs_rq *next);
347#else /* !CONFIG_SMP */
348static inline void set_task_rq_fair(struct sched_entity *se,
349 struct cfs_rq *prev, struct cfs_rq *next) { }
350#endif /* CONFIG_SMP */
351#endif /* CONFIG_FAIR_GROUP_SCHED */
339 352
340#else /* CONFIG_CGROUP_SCHED */ 353#else /* CONFIG_CGROUP_SCHED */
341 354
@@ -933,6 +946,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
933#endif 946#endif
934 947
935#ifdef CONFIG_FAIR_GROUP_SCHED 948#ifdef CONFIG_FAIR_GROUP_SCHED
949 set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]);
936 p->se.cfs_rq = tg->cfs_rq[cpu]; 950 p->se.cfs_rq = tg->cfs_rq[cpu];
937 p->se.parent = tg->se[cpu]; 951 p->se.parent = tg->se[cpu];
938#endif 952#endif
@@ -1113,46 +1127,8 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1113#define WEIGHT_IDLEPRIO 3 1127#define WEIGHT_IDLEPRIO 3
1114#define WMULT_IDLEPRIO 1431655765 1128#define WMULT_IDLEPRIO 1431655765
1115 1129
1116/* 1130extern const int sched_prio_to_weight[40];
1117 * Nice levels are multiplicative, with a gentle 10% change for every 1131extern const u32 sched_prio_to_wmult[40];
1118 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1119 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1120 * that remained on nice 0.
1121 *
1122 * The "10% effect" is relative and cumulative: from _any_ nice level,
1123 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1124 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1125 * If a task goes up by ~10% and another task goes down by ~10% then
1126 * the relative distance between them is ~25%.)
1127 */
1128static const int prio_to_weight[40] = {
1129 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1130 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1131 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1132 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1133 /* 0 */ 1024, 820, 655, 526, 423,
1134 /* 5 */ 335, 272, 215, 172, 137,
1135 /* 10 */ 110, 87, 70, 56, 45,
1136 /* 15 */ 36, 29, 23, 18, 15,
1137};
1138
1139/*
1140 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1141 *
1142 * In cases where the weight does not change often, we can use the
1143 * precalculated inverse to speed up arithmetics by turning divisions
1144 * into multiplications:
1145 */
1146static const u32 prio_to_wmult[40] = {
1147 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1148 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1149 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1150 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1151 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1152 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1153 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1154 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1155};
1156 1132
1157#define ENQUEUE_WAKEUP 0x01 1133#define ENQUEUE_WAKEUP 0x01
1158#define ENQUEUE_HEAD 0x02 1134#define ENQUEUE_HEAD 0x02
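
The weight tables turn into extern declarations (sched_prio_to_weight[] / sched_prio_to_wmult[]) so each includer no longer carries its own static copy, but the arithmetic in the removed comment is easy to verify: consecutive table entries differ by a factor of roughly 1.25, which gives neighbouring nice levels about a ten-point swing in CPU share, and the wmult table is simply 2^32 divided by the weight so a division can become a multiply-and-shift. A small standalone check using the first two table values:

#include <stdio.h>

/* Nice 0 and nice 1 entries of the weight table shown above. */
static const int w_nice0 = 1024;
static const int w_nice1 = 820;

int main(void)
{
        /* Two CPU-bound tasks share the CPU in proportion to their weight. */
        double share0 = (double)w_nice0 / (w_nice0 + w_nice1);
        double share1 = (double)w_nice1 / (w_nice0 + w_nice1);

        printf("nice 0 share: %.1f%%\n", share0 * 100.0);          /* ~55.5 */
        printf("nice 1 share: %.1f%%\n", share1 * 100.0);          /* ~44.5 */
        printf("step ratio  : %.3f\n", (double)w_nice0 / w_nice1); /* ~1.249 */

        /*
         * The inverse table caches 2^32 / weight so that a division by the
         * weight can be replaced with a multiply and a shift.
         */
        printf("2^32 / 1024 = %llu (table entry: 4194304)\n",
               (1ULL << 32) / w_nice0);
        return 0;
}
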
@@ -1252,16 +1228,8 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu);
1252 1228
1253extern void trigger_load_balance(struct rq *rq); 1229extern void trigger_load_balance(struct rq *rq);
1254 1230
1255extern void idle_enter_fair(struct rq *this_rq);
1256extern void idle_exit_fair(struct rq *this_rq);
1257
1258extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); 1231extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
1259 1232
1260#else
1261
1262static inline void idle_enter_fair(struct rq *rq) { }
1263static inline void idle_exit_fair(struct rq *rq) { }
1264
1265#endif 1233#endif
1266 1234
1267#ifdef CONFIG_CPU_IDLE 1235#ifdef CONFIG_CPU_IDLE
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index a3bbaee77c58..edb6de4f5908 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -28,7 +28,6 @@
28 */ 28 */
29struct cpu_stop_done { 29struct cpu_stop_done {
30 atomic_t nr_todo; /* nr left to execute */ 30 atomic_t nr_todo; /* nr left to execute */
31 bool executed; /* actually executed? */
32 int ret; /* collected return value */ 31 int ret; /* collected return value */
33 struct completion completion; /* fired if nr_todo reaches 0 */ 32 struct completion completion; /* fired if nr_todo reaches 0 */
34}; 33};
@@ -63,14 +62,10 @@ static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
63} 62}
64 63
65/* signal completion unless @done is NULL */ 64/* signal completion unless @done is NULL */
66static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) 65static void cpu_stop_signal_done(struct cpu_stop_done *done)
67{ 66{
68 if (done) { 67 if (atomic_dec_and_test(&done->nr_todo))
69 if (executed) 68 complete(&done->completion);
70 done->executed = true;
71 if (atomic_dec_and_test(&done->nr_todo))
72 complete(&done->completion);
73 }
74} 69}
75 70
76static void __cpu_stop_queue_work(struct cpu_stopper *stopper, 71static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
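
With the executed flag gone, cpu_stop_signal_done() is reduced to a countdown on nr_todo that fires the completion when the last pending work is accounted for; whether the work actually ran is now reported by the queueing path instead. The sketch below is a rough pthread-based userspace analogue of that countdown, purely to illustrate the pattern (the kernel uses struct completion, not a condition variable), with all names invented for the example. Compile with -pthread.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/*
 * Userspace stand-in for struct cpu_stop_done: a countdown plus a
 * completion-like condition variable.
 */
struct stop_done {
        atomic_int nr_todo;
        int ret;
        pthread_mutex_t lock;
        pthread_cond_t done;
};

/* Equivalent of cpu_stop_signal_done(): atomic_dec_and_test() + complete(). */
static void signal_done(struct stop_done *d)
{
        if (atomic_fetch_sub(&d->nr_todo, 1) == 1) {
                pthread_mutex_lock(&d->lock);
                pthread_cond_signal(&d->done);
                pthread_mutex_unlock(&d->lock);
        }
}

static void *worker(void *arg)
{
        signal_done(arg);
        return NULL;
}

int main(void)
{
        struct stop_done d = {
                .nr_todo = 2, .ret = 0,
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .done = PTHREAD_COND_INITIALIZER,
        };
        pthread_t t[2];

        for (int i = 0; i < 2; i++)
                pthread_create(&t[i], NULL, worker, &d);

        /* Equivalent of wait_for_completion(). */
        pthread_mutex_lock(&d.lock);
        while (atomic_load(&d.nr_todo) > 0)
                pthread_cond_wait(&d.done, &d.lock);
        pthread_mutex_unlock(&d.lock);

        printf("all stop works accounted for, ret=%d\n", d.ret);

        for (int i = 0; i < 2; i++)
                pthread_join(t[i], NULL);
        return 0;
}
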
@@ -81,17 +76,21 @@ static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
81} 76}
82 77
83/* queue @work to @stopper. if offline, @work is completed immediately */ 78/* queue @work to @stopper. if offline, @work is completed immediately */
84static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) 79static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
85{ 80{
86 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 81 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
87 unsigned long flags; 82 unsigned long flags;
83 bool enabled;
88 84
89 spin_lock_irqsave(&stopper->lock, flags); 85 spin_lock_irqsave(&stopper->lock, flags);
90 if (stopper->enabled) 86 enabled = stopper->enabled;
87 if (enabled)
91 __cpu_stop_queue_work(stopper, work); 88 __cpu_stop_queue_work(stopper, work);
92 else 89 else if (work->done)
93 cpu_stop_signal_done(work->done, false); 90 cpu_stop_signal_done(work->done);
94 spin_unlock_irqrestore(&stopper->lock, flags); 91 spin_unlock_irqrestore(&stopper->lock, flags);
92
93 return enabled;
95} 94}
96 95
97/** 96/**
@@ -124,9 +123,10 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
124 struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; 123 struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
125 124
126 cpu_stop_init_done(&done, 1); 125 cpu_stop_init_done(&done, 1);
127 cpu_stop_queue_work(cpu, &work); 126 if (!cpu_stop_queue_work(cpu, &work))
127 return -ENOENT;
128 wait_for_completion(&done.completion); 128 wait_for_completion(&done.completion);
129 return done.executed ? done.ret : -ENOENT; 129 return done.ret;
130} 130}
131 131
132/* This controls the threads on each CPU. */ 132/* This controls the threads on each CPU. */
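
stop_one_cpu() now learns directly from cpu_stop_queue_work() whether the work was queued, and returns -ENOENT up front when it was not (e.g. the target CPU's stopper is disabled because the CPU is going offline), rather than decoding a separate executed flag after the wait. A hypothetical kernel-style caller illustrating the updated contract; this is a sketch, not code buildable on its own:

#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/stop_machine.h>

/*
 * Illustrative callback: must not sleep, runs on the target CPU with
 * preemption disabled.
 */
static int dump_cpu_state(void *arg)
{
        pr_info("stopper callback on CPU%d, arg=%p\n",
                smp_processor_id(), arg);
        return 0;
}

/*
 * Hypothetical caller: -ENOENT now means the work was never queued
 * (stopper disabled); any other value is whatever the callback returned.
 */
static int poke_cpu(unsigned int cpu)
{
        int ret = stop_one_cpu(cpu, dump_cpu_state, NULL);

        if (ret == -ENOENT)
                pr_warn("CPU%u stopper not available\n", cpu);
        return ret;
}
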
@@ -258,7 +258,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
258 struct cpu_stop_work work1, work2; 258 struct cpu_stop_work work1, work2;
259 struct multi_stop_data msdata; 259 struct multi_stop_data msdata;
260 260
261 preempt_disable();
262 msdata = (struct multi_stop_data){ 261 msdata = (struct multi_stop_data){
263 .fn = fn, 262 .fn = fn,
264 .data = arg, 263 .data = arg,
@@ -277,16 +276,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
277 276
278 if (cpu1 > cpu2) 277 if (cpu1 > cpu2)
279 swap(cpu1, cpu2); 278 swap(cpu1, cpu2);
280 if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) { 279 if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2))
281 preempt_enable();
282 return -ENOENT; 280 return -ENOENT;
283 }
284
285 preempt_enable();
286 281
287 wait_for_completion(&done.completion); 282 wait_for_completion(&done.completion);
288 283 return done.ret;
289 return done.executed ? done.ret : -ENOENT;
290} 284}
291 285
292/** 286/**
@@ -302,23 +296,28 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
302 * 296 *
303 * CONTEXT: 297 * CONTEXT:
304 * Don't care. 298 * Don't care.
299 *
300 * RETURNS:
301 * true if cpu_stop_work was queued successfully and @fn will be called,
302 * false otherwise.
305 */ 303 */
306void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, 304bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
307 struct cpu_stop_work *work_buf) 305 struct cpu_stop_work *work_buf)
308{ 306{
309 *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; 307 *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
310 cpu_stop_queue_work(cpu, work_buf); 308 return cpu_stop_queue_work(cpu, work_buf);
311} 309}
312 310
313/* static data for stop_cpus */ 311/* static data for stop_cpus */
314static DEFINE_MUTEX(stop_cpus_mutex); 312static DEFINE_MUTEX(stop_cpus_mutex);
315 313
316static void queue_stop_cpus_work(const struct cpumask *cpumask, 314static bool queue_stop_cpus_work(const struct cpumask *cpumask,
317 cpu_stop_fn_t fn, void *arg, 315 cpu_stop_fn_t fn, void *arg,
318 struct cpu_stop_done *done) 316 struct cpu_stop_done *done)
319{ 317{
320 struct cpu_stop_work *work; 318 struct cpu_stop_work *work;
321 unsigned int cpu; 319 unsigned int cpu;
320 bool queued = false;
322 321
323 /* 322 /*
324 * Disable preemption while queueing to avoid getting 323 * Disable preemption while queueing to avoid getting
@@ -331,9 +330,12 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
331 work->fn = fn; 330 work->fn = fn;
332 work->arg = arg; 331 work->arg = arg;
333 work->done = done; 332 work->done = done;
334 cpu_stop_queue_work(cpu, work); 333 if (cpu_stop_queue_work(cpu, work))
334 queued = true;
335 } 335 }
336 lg_global_unlock(&stop_cpus_lock); 336 lg_global_unlock(&stop_cpus_lock);
337
338 return queued;
337} 339}
338 340
339static int __stop_cpus(const struct cpumask *cpumask, 341static int __stop_cpus(const struct cpumask *cpumask,
@@ -342,9 +344,10 @@ static int __stop_cpus(const struct cpumask *cpumask,
342 struct cpu_stop_done done; 344 struct cpu_stop_done done;
343 345
344 cpu_stop_init_done(&done, cpumask_weight(cpumask)); 346 cpu_stop_init_done(&done, cpumask_weight(cpumask));
345 queue_stop_cpus_work(cpumask, fn, arg, &done); 347 if (!queue_stop_cpus_work(cpumask, fn, arg, &done))
348 return -ENOENT;
346 wait_for_completion(&done.completion); 349 wait_for_completion(&done.completion);
347 return done.executed ? done.ret : -ENOENT; 350 return done.ret;
348} 351}
349 352
350/** 353/**
@@ -432,7 +435,6 @@ static void cpu_stopper_thread(unsigned int cpu)
432{ 435{
433 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 436 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
434 struct cpu_stop_work *work; 437 struct cpu_stop_work *work;
435 int ret;
436 438
437repeat: 439repeat:
438 work = NULL; 440 work = NULL;
@@ -448,23 +450,19 @@ repeat:
448 cpu_stop_fn_t fn = work->fn; 450 cpu_stop_fn_t fn = work->fn;
449 void *arg = work->arg; 451 void *arg = work->arg;
450 struct cpu_stop_done *done = work->done; 452 struct cpu_stop_done *done = work->done;
451 char ksym_buf[KSYM_NAME_LEN] __maybe_unused; 453 int ret;
452
453 /* cpu stop callbacks are not allowed to sleep */
454 preempt_disable();
455 454
455 /* cpu stop callbacks must not sleep, make in_atomic() == T */
456 preempt_count_inc();
456 ret = fn(arg); 457 ret = fn(arg);
457 if (ret) 458 if (done) {
458 done->ret = ret; 459 if (ret)
459 460 done->ret = ret;
460 /* restore preemption and check it's still balanced */ 461 cpu_stop_signal_done(done);
461 preempt_enable(); 462 }
463 preempt_count_dec();
462 WARN_ONCE(preempt_count(), 464 WARN_ONCE(preempt_count(),
463 "cpu_stop: %s(%p) leaked preempt count\n", 465 "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg);
464 kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
465 ksym_buf), arg);
466
467 cpu_stop_signal_done(done, true);
468 goto repeat; 466 goto repeat;
469 } 467 }
470} 468}
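
The stopper thread now raises the preemption counter explicitly around the callback, which keeps in_atomic() true for the non-sleeping callback and lets the WARN_ONCE() afterwards catch a callback that leaks an unbalanced preempt count, now reported via the %pf printk format instead of a manual kallsyms lookup. A userspace model of that balance check, with a deliberately leaky callback; the counter and helpers here are stand-ins, not the kernel's:

#include <stdio.h>

/* Userspace model of the per-CPU preemption counter. */
static int fake_preempt_count;

static void fake_preempt_count_inc(void) { fake_preempt_count++; }
static void fake_preempt_count_dec(void) { fake_preempt_count--; }

/* A well-behaved callback, and one that "leaks" an unbalanced increment. */
static int good_fn(void *arg) { (void)arg; return 0; }
static int leaky_fn(void *arg) { (void)arg; fake_preempt_count_inc(); return 0; }

/*
 * Mirrors the shape of the callback invocation in cpu_stopper_thread():
 * bump the counter, run the callback, drop the counter, then warn if the
 * callback left it unbalanced.
 */
static int run_stop_callback(int (*fn)(void *), void *arg, const char *name)
{
        int ret;

        fake_preempt_count_inc();
        ret = fn(arg);
        fake_preempt_count_dec();
        if (fake_preempt_count)
                fprintf(stderr, "cpu_stop: %s(%p) leaked preempt count (%d)\n",
                        name, arg, fake_preempt_count);
        return ret;
}

int main(void)
{
        run_stop_callback(good_fn, NULL, "good_fn");
        run_stop_callback(leaky_fn, NULL, "leaky_fn");
        return 0;
}
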
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7c7ec4515983..11ce59916c1a 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -694,11 +694,11 @@ out:
694 return tick; 694 return tick;
695} 695}
696 696
697static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) 697static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int active)
698{ 698{
699 /* Update jiffies first */ 699 /* Update jiffies first */
700 tick_do_update_jiffies64(now); 700 tick_do_update_jiffies64(now);
701 update_cpu_load_nohz(); 701 update_cpu_load_nohz(active);
702 702
703 calc_load_exit_idle(); 703 calc_load_exit_idle();
704 touch_softlockup_watchdog(); 704 touch_softlockup_watchdog();
@@ -725,7 +725,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
725 if (can_stop_full_tick()) 725 if (can_stop_full_tick())
726 tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); 726 tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
727 else if (ts->tick_stopped) 727 else if (ts->tick_stopped)
728 tick_nohz_restart_sched_tick(ts, ktime_get()); 728 tick_nohz_restart_sched_tick(ts, ktime_get(), 1);
729#endif 729#endif
730} 730}
731 731
@@ -875,7 +875,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
875#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 875#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
876 unsigned long ticks; 876 unsigned long ticks;
877 877
878 if (vtime_accounting_enabled()) 878 if (vtime_accounting_cpu_enabled())
879 return; 879 return;
880 /* 880 /*
881 * We stopped the tick in idle. Update process times would miss the 881 * We stopped the tick in idle. Update process times would miss the
@@ -916,7 +916,7 @@ void tick_nohz_idle_exit(void)
916 tick_nohz_stop_idle(ts, now); 916 tick_nohz_stop_idle(ts, now);
917 917
918 if (ts->tick_stopped) { 918 if (ts->tick_stopped) {
919 tick_nohz_restart_sched_tick(ts, now); 919 tick_nohz_restart_sched_tick(ts, now, 0);
920 tick_nohz_account_idle_ticks(ts); 920 tick_nohz_account_idle_ticks(ts);
921 } 921 }
922 922