author    Linus Torvalds <torvalds@linux-foundation.org>  2016-07-25 16:59:34 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2016-07-25 16:59:34 -0400
commit    cca08cd66ce6cc37812b6b36986ba7eaabd33e0b (patch)
tree      f68966cff4e888f51bd18497c358662c14cab6a2
parent    7e4dc77b2869a683fc43c0394fca5441816390ba (diff)
parent    748c7201e622d1c24abb4f85072d2e74d12f295f (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:

 - introduce and use task_rcu_dereference()/try_get_task_struct() to
   fix and generalize task_struct handling (Oleg Nesterov)

 - do various per entity load tracking (PELT) fixes and optimizations
   (Peter Zijlstra)

 - cputime virt-steal time accounting enhancements/fixes (Wanpeng Li)

 - introduce consolidated cputime output file cpuacct.usage_all and
   related refactorings (Zhao Lei)

 - ... plus misc fixes and enhancements

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/core: Panic on scheduling while atomic bugs if kernel.panic_on_warn is set
  sched/cpuacct: Introduce cpuacct.usage_all to show all CPU stats together
  sched/cpuacct: Use loop to consolidate code in cpuacct_stats_show()
  sched/cpuacct: Merge cpuacct_usage_index and cpuacct_stat_index enums
  sched/fair: Rework throttle_count sync
  sched/core: Fix sched_getaffinity() return value kerneldoc comment
  sched/fair: Reorder cgroup creation code
  sched/fair: Apply more PELT fixes
  sched/fair: Fix PELT integrity for new tasks
  sched/cgroup: Fix cpu_cgroup_fork() handling
  sched/fair: Fix PELT integrity for new groups
  sched/fair: Fix and optimize the fork() path
  sched/cputime: Add steal time support to full dynticks CPU time accounting
  sched/cputime: Fix prev steal time accouting during CPU hotplug
  KVM: Fix steal clock warp during guest CPU hotplug
  sched/debug: Always show 'nr_migrations'
  sched/fair: Use task_rcu_dereference()
  sched/api: Introduce task_rcu_dereference() and try_get_task_struct()
  sched/idle: Optimize the generic idle loop
  sched/fair: Fix the wrong throttled clock time for cfs_rq_clock_task()
-rw-r--r--  arch/x86/kernel/kvm.c      2
-rw-r--r--  include/linux/sched.h      8
-rw-r--r--  kernel/exit.c             76
-rw-r--r--  kernel/sched/core.c      114
-rw-r--r--  kernel/sched/cpuacct.c   114
-rw-r--r--  kernel/sched/cputime.c    16
-rw-r--r--  kernel/sched/debug.c       2
-rw-r--r--  kernel/sched/fair.c      251
-rw-r--r--  kernel/sched/idle.c        4
-rw-r--r--  kernel/sched/sched.h      21
10 files changed, 418 insertions(+), 190 deletions(-)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index eea2a6f72b31..1ef5e48b3a36 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -301,8 +301,6 @@ static void kvm_register_steal_time(void)
301 if (!has_steal_clock) 301 if (!has_steal_clock)
302 return; 302 return;
303 303
304 memset(st, 0, sizeof(*st));
305
306 wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); 304 wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
307 pr_info("kvm-stealtime: cpu %d, msr %llx\n", 305 pr_info("kvm-stealtime: cpu %d, msr %llx\n",
308 cpu, (unsigned long long) slow_virt_to_phys(st)); 306 cpu, (unsigned long long) slow_virt_to_phys(st));
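
The hunk above is "KVM: Fix steal clock warp during guest CPU hotplug": kvm_register_steal_time() runs again when a CPU is plugged back in, and the removed memset() wiped the shared steal-time record while the scheduler-side accounting still remembered the old totals. A rough sketch of why a reset record hurts, assuming the delta-based consumer in kernel/sched/cputime.c (illustrative snippet, not part of this series):

	u64 steal = paravirt_steal_clock(smp_processor_id());

	/*
	 * If the per-CPU record were zeroed across hotplug, 'steal' would
	 * restart near 0 while the previously accumulated snapshot stays
	 * large, so the unsigned subtraction below could yield a huge
	 * bogus delta -- the "steal clock warp". Leaving the record alone
	 * keeps the counter monotonic.
	 */
	steal -= this_rq()->prev_steal_time;
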
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 253538f29ade..d99218a1e043 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -219,9 +219,10 @@ extern void proc_sched_set_task(struct task_struct *p);
219#define TASK_WAKING 256 219#define TASK_WAKING 256
220#define TASK_PARKED 512 220#define TASK_PARKED 512
221#define TASK_NOLOAD 1024 221#define TASK_NOLOAD 1024
222#define TASK_STATE_MAX 2048 222#define TASK_NEW 2048
223#define TASK_STATE_MAX 4096
223 224
224#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN" 225#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn"
225 226
226extern char ___assert_task_state[1 - 2*!!( 227extern char ___assert_task_state[1 - 2*!!(
227 sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; 228 sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
@@ -2139,6 +2140,9 @@ static inline void put_task_struct(struct task_struct *t)
2139 __put_task_struct(t); 2140 __put_task_struct(t);
2140} 2141}
2141 2142
2143struct task_struct *task_rcu_dereference(struct task_struct **ptask);
2144struct task_struct *try_get_task_struct(struct task_struct **ptask);
2145
2142#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 2146#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
2143extern void task_cputime(struct task_struct *t, 2147extern void task_cputime(struct task_struct *t,
2144 cputime_t *utime, cputime_t *stime); 2148 cputime_t *utime, cputime_t *stime);
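
The include/linux/sched.h hunk introduces TASK_NEW, doubles TASK_STATE_MAX and appends 'n' to TASK_STATE_TO_CHAR_STR; ___assert_task_state[] is the existing compile-time check that the bit mask and the letter string stay in sync. A quick worked check with the new values (plain arithmetic, not added code):

	/* TASK_NEW = 2048 = 1 << 11, so TASK_STATE_MAX = 4096 = 1 << 12 */
	ilog2(TASK_STATE_MAX) + 1   = 12 + 1 = 13
	sizeof("RSDTtXZxKWPNn") - 1 = 13        /* 13 state letters, NUL dropped */

Both sides are equal, so the array keeps size 1 and the build still compiles; adding the bit without the letter (or vice versa) would give the array size -1 and break the build.
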
diff --git a/kernel/exit.c b/kernel/exit.c
index 0b40791b9e70..84ae830234f8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -211,6 +211,82 @@ repeat:
211} 211}
212 212
213/* 213/*
214 * Note that if this function returns a valid task_struct pointer (!NULL)
215 * task->usage must remain >0 for the duration of the RCU critical section.
216 */
217struct task_struct *task_rcu_dereference(struct task_struct **ptask)
218{
219 struct sighand_struct *sighand;
220 struct task_struct *task;
221
222 /*
223 * We need to verify that release_task() was not called and thus
224 * delayed_put_task_struct() can't run and drop the last reference
225 * before rcu_read_unlock(). We check task->sighand != NULL,
226 * but we can read the already freed and reused memory.
227 */
228retry:
229 task = rcu_dereference(*ptask);
230 if (!task)
231 return NULL;
232
233 probe_kernel_address(&task->sighand, sighand);
234
235 /*
236 * Pairs with atomic_dec_and_test() in put_task_struct(). If this task
237 * was already freed we can not miss the preceding update of this
238 * pointer.
239 */
240 smp_rmb();
241 if (unlikely(task != READ_ONCE(*ptask)))
242 goto retry;
243
244 /*
245 * We've re-checked that "task == *ptask", now we have two different
246 * cases:
247 *
248 * 1. This is actually the same task/task_struct. In this case
249 * sighand != NULL tells us it is still alive.
250 *
251 * 2. This is another task which got the same memory for task_struct.
252 * We can't know this of course, and we can not trust
253 * sighand != NULL.
254 *
255 * In this case we actually return a random value, but this is
256 * correct.
257 *
258 * If we return NULL - we can pretend that we actually noticed that
259 * *ptask was updated when the previous task has exited. Or pretend
260 * that probe_slab_address(&sighand) reads NULL.
261 *
262 * If we return the new task (because sighand is not NULL for any
263 * reason) - this is fine too. This (new) task can't go away before
264 * another gp pass.
265 *
266 * And note: We could even eliminate the false positive if re-read
267 * task->sighand once again to avoid the falsely NULL. But this case
268 * is very unlikely so we don't care.
269 */
270 if (!sighand)
271 return NULL;
272
273 return task;
274}
275
276struct task_struct *try_get_task_struct(struct task_struct **ptask)
277{
278 struct task_struct *task;
279
280 rcu_read_lock();
281 task = task_rcu_dereference(ptask);
282 if (task)
283 get_task_struct(task);
284 rcu_read_unlock();
285
286 return task;
287}
288
289/*
214 * Determine if a process group is "orphaned", according to the POSIX 290 * Determine if a process group is "orphaned", according to the POSIX
215 * definition in 2.2.2.52. Orphaned process groups are not to be affected 291 * definition in 2.2.2.52. Orphaned process groups are not to be affected
216 * by terminal-generated stop signals. Newly orphaned process groups are 292 * by terminal-generated stop signals. Newly orphaned process groups are
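
task_rcu_dereference() and try_get_task_struct() above target callers that only hold a task_struct ** -- typically a remote runqueue's ->curr -- and need either a pointer that stays valid for the RCU section or a real reference. A minimal caller sketch; grab_remote_curr() is a hypothetical helper used only for illustration:

	static struct task_struct *grab_remote_curr(struct rq *rq)
	{
		/* NULL, or a task whose refcount we now hold. */
		struct task_struct *p = try_get_task_struct(&rq->curr);

		return p;	/* caller ends with put_task_struct(p) */
	}

The kernel/sched/fair.c change further down uses the lighter variant directly: task_numa_compare() now does cur = task_rcu_dereference(&dst_rq->curr) under rcu_read_lock() instead of taking dst_rq->lock.
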
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index af0ef74df23c..5c883fe8e440 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2342,11 +2342,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
2342 2342
2343 __sched_fork(clone_flags, p); 2343 __sched_fork(clone_flags, p);
2344 /* 2344 /*
2345 * We mark the process as running here. This guarantees that 2345 * We mark the process as NEW here. This guarantees that
2346 * nobody will actually run it, and a signal or other external 2346 * nobody will actually run it, and a signal or other external
2347 * event cannot wake it up and insert it on the runqueue either. 2347 * event cannot wake it up and insert it on the runqueue either.
2348 */ 2348 */
2349 p->state = TASK_RUNNING; 2349 p->state = TASK_NEW;
2350 2350
2351 /* 2351 /*
2352 * Make sure we do not leak PI boosting priority to the child. 2352 * Make sure we do not leak PI boosting priority to the child.
@@ -2383,8 +2383,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
2383 p->sched_class = &fair_sched_class; 2383 p->sched_class = &fair_sched_class;
2384 } 2384 }
2385 2385
2386 if (p->sched_class->task_fork) 2386 init_entity_runnable_average(&p->se);
2387 p->sched_class->task_fork(p);
2388 2387
2389 /* 2388 /*
2390 * The child is not yet in the pid-hash so no cgroup attach races, 2389 * The child is not yet in the pid-hash so no cgroup attach races,
@@ -2394,7 +2393,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
2394 * Silence PROVE_RCU. 2393 * Silence PROVE_RCU.
2395 */ 2394 */
2396 raw_spin_lock_irqsave(&p->pi_lock, flags); 2395 raw_spin_lock_irqsave(&p->pi_lock, flags);
2397 set_task_cpu(p, cpu); 2396 /*
2397 * We're setting the cpu for the first time, we don't migrate,
2398 * so use __set_task_cpu().
2399 */
2400 __set_task_cpu(p, cpu);
2401 if (p->sched_class->task_fork)
2402 p->sched_class->task_fork(p);
2398 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2403 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2399 2404
2400#ifdef CONFIG_SCHED_INFO 2405#ifdef CONFIG_SCHED_INFO
@@ -2526,16 +2531,18 @@ void wake_up_new_task(struct task_struct *p)
2526 struct rq_flags rf; 2531 struct rq_flags rf;
2527 struct rq *rq; 2532 struct rq *rq;
2528 2533
2529 /* Initialize new task's runnable average */
2530 init_entity_runnable_average(&p->se);
2531 raw_spin_lock_irqsave(&p->pi_lock, rf.flags); 2534 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
2535 p->state = TASK_RUNNING;
2532#ifdef CONFIG_SMP 2536#ifdef CONFIG_SMP
2533 /* 2537 /*
2534 * Fork balancing, do it here and not earlier because: 2538 * Fork balancing, do it here and not earlier because:
2535 * - cpus_allowed can change in the fork path 2539 * - cpus_allowed can change in the fork path
2536 * - any previously selected cpu might disappear through hotplug 2540 * - any previously selected cpu might disappear through hotplug
2541 *
2542 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
2543 * as we're not fully set-up yet.
2537 */ 2544 */
2538 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); 2545 __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
2539#endif 2546#endif
2540 rq = __task_rq_lock(p, &rf); 2547 rq = __task_rq_lock(p, &rf);
2541 post_init_entity_util_avg(&p->se); 2548 post_init_entity_util_avg(&p->se);
@@ -3161,6 +3168,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
3161 pr_cont("\n"); 3168 pr_cont("\n");
3162 } 3169 }
3163#endif 3170#endif
3171 if (panic_on_warn)
3172 panic("scheduling while atomic\n");
3173
3164 dump_stack(); 3174 dump_stack();
3165 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 3175 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
3166} 3176}
@@ -4752,7 +4762,8 @@ out_unlock:
4752 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4762 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4753 * @user_mask_ptr: user-space pointer to hold the current cpu mask 4763 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4754 * 4764 *
4755 * Return: 0 on success. An error code otherwise. 4765 * Return: size of CPU mask copied to user_mask_ptr on success. An
4766 * error code otherwise.
4756 */ 4767 */
4757SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 4768SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4758 unsigned long __user *, user_mask_ptr) 4769 unsigned long __user *, user_mask_ptr)
@@ -7233,7 +7244,6 @@ static void sched_rq_cpu_starting(unsigned int cpu)
7233 struct rq *rq = cpu_rq(cpu); 7244 struct rq *rq = cpu_rq(cpu);
7234 7245
7235 rq->calc_load_update = calc_load_update; 7246 rq->calc_load_update = calc_load_update;
7236 account_reset_rq(rq);
7237 update_max_interval(); 7247 update_max_interval();
7238} 7248}
7239 7249
@@ -7713,6 +7723,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
7713 INIT_LIST_HEAD(&tg->children); 7723 INIT_LIST_HEAD(&tg->children);
7714 list_add_rcu(&tg->siblings, &parent->children); 7724 list_add_rcu(&tg->siblings, &parent->children);
7715 spin_unlock_irqrestore(&task_group_lock, flags); 7725 spin_unlock_irqrestore(&task_group_lock, flags);
7726
7727 online_fair_sched_group(tg);
7716} 7728}
7717 7729
7718/* rcu callback to free various structures associated with a task group */ 7730/* rcu callback to free various structures associated with a task group */
@@ -7741,27 +7753,9 @@ void sched_offline_group(struct task_group *tg)
7741 spin_unlock_irqrestore(&task_group_lock, flags); 7753 spin_unlock_irqrestore(&task_group_lock, flags);
7742} 7754}
7743 7755
7744/* change task's runqueue when it moves between groups. 7756static void sched_change_group(struct task_struct *tsk, int type)
7745 * The caller of this function should have put the task in its new group
7746 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
7747 * reflect its new group.
7748 */
7749void sched_move_task(struct task_struct *tsk)
7750{ 7757{
7751 struct task_group *tg; 7758 struct task_group *tg;
7752 int queued, running;
7753 struct rq_flags rf;
7754 struct rq *rq;
7755
7756 rq = task_rq_lock(tsk, &rf);
7757
7758 running = task_current(rq, tsk);
7759 queued = task_on_rq_queued(tsk);
7760
7761 if (queued)
7762 dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
7763 if (unlikely(running))
7764 put_prev_task(rq, tsk);
7765 7759
7766 /* 7760 /*
7767 * All callers are synchronized by task_rq_lock(); we do not use RCU 7761 * All callers are synchronized by task_rq_lock(); we do not use RCU
@@ -7774,11 +7768,37 @@ void sched_move_task(struct task_struct *tsk)
7774 tsk->sched_task_group = tg; 7768 tsk->sched_task_group = tg;
7775 7769
7776#ifdef CONFIG_FAIR_GROUP_SCHED 7770#ifdef CONFIG_FAIR_GROUP_SCHED
7777 if (tsk->sched_class->task_move_group) 7771 if (tsk->sched_class->task_change_group)
7778 tsk->sched_class->task_move_group(tsk); 7772 tsk->sched_class->task_change_group(tsk, type);
7779 else 7773 else
7780#endif 7774#endif
7781 set_task_rq(tsk, task_cpu(tsk)); 7775 set_task_rq(tsk, task_cpu(tsk));
7776}
7777
7778/*
7779 * Change task's runqueue when it moves between groups.
7780 *
7781 * The caller of this function should have put the task in its new group by
7782 * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
7783 * its new group.
7784 */
7785void sched_move_task(struct task_struct *tsk)
7786{
7787 int queued, running;
7788 struct rq_flags rf;
7789 struct rq *rq;
7790
7791 rq = task_rq_lock(tsk, &rf);
7792
7793 running = task_current(rq, tsk);
7794 queued = task_on_rq_queued(tsk);
7795
7796 if (queued)
7797 dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
7798 if (unlikely(running))
7799 put_prev_task(rq, tsk);
7800
7801 sched_change_group(tsk, TASK_MOVE_GROUP);
7782 7802
7783 if (unlikely(running)) 7803 if (unlikely(running))
7784 tsk->sched_class->set_curr_task(rq); 7804 tsk->sched_class->set_curr_task(rq);
@@ -8206,15 +8226,27 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
8206 sched_free_group(tg); 8226 sched_free_group(tg);
8207} 8227}
8208 8228
8229/*
8230 * This is called before wake_up_new_task(), therefore we really only
8231 * have to set its group bits, all the other stuff does not apply.
8232 */
8209static void cpu_cgroup_fork(struct task_struct *task) 8233static void cpu_cgroup_fork(struct task_struct *task)
8210{ 8234{
8211 sched_move_task(task); 8235 struct rq_flags rf;
8236 struct rq *rq;
8237
8238 rq = task_rq_lock(task, &rf);
8239
8240 sched_change_group(task, TASK_SET_GROUP);
8241
8242 task_rq_unlock(rq, task, &rf);
8212} 8243}
8213 8244
8214static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) 8245static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
8215{ 8246{
8216 struct task_struct *task; 8247 struct task_struct *task;
8217 struct cgroup_subsys_state *css; 8248 struct cgroup_subsys_state *css;
8249 int ret = 0;
8218 8250
8219 cgroup_taskset_for_each(task, css, tset) { 8251 cgroup_taskset_for_each(task, css, tset) {
8220#ifdef CONFIG_RT_GROUP_SCHED 8252#ifdef CONFIG_RT_GROUP_SCHED
@@ -8225,8 +8257,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
8225 if (task->sched_class != &fair_sched_class) 8257 if (task->sched_class != &fair_sched_class)
8226 return -EINVAL; 8258 return -EINVAL;
8227#endif 8259#endif
8260 /*
8261 * Serialize against wake_up_new_task() such that if its
8262 * running, we're sure to observe its full state.
8263 */
8264 raw_spin_lock_irq(&task->pi_lock);
8265 /*
8266 * Avoid calling sched_move_task() before wake_up_new_task()
8267 * has happened. This would lead to problems with PELT, due to
8268 * move wanting to detach+attach while we're not attached yet.
8269 */
8270 if (task->state == TASK_NEW)
8271 ret = -EINVAL;
8272 raw_spin_unlock_irq(&task->pi_lock);
8273
8274 if (ret)
8275 break;
8228 } 8276 }
8229 return 0; 8277 return ret;
8230} 8278}
8231 8279
8232static void cpu_cgroup_attach(struct cgroup_taskset *tset) 8280static void cpu_cgroup_attach(struct cgroup_taskset *tset)
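
The cpu_cgroup_can_attach() hunk above closes a window between sched_fork(), which now leaves the child in TASK_NEW, and wake_up_new_task(), which flips it to TASK_RUNNING under p->pi_lock; moving a TASK_NEW task between groups would make PELT try to detach state that was never attached. The gate, pulled out of the hunk into a standalone helper purely for illustration (reject_unwoken_child() is not a real kernel function):

	static int reject_unwoken_child(struct task_struct *p)
	{
		int ret = 0;

		/* Serializes against wake_up_new_task() taking p->pi_lock. */
		raw_spin_lock_irq(&p->pi_lock);
		if (p->state == TASK_NEW)
			ret = -EINVAL;	/* not attached to its cfs_rq yet */
		raw_spin_unlock_irq(&p->pi_lock);

		return ret;
	}
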
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 41f85c4d0938..bc0b309c3f19 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -25,15 +25,13 @@ enum cpuacct_stat_index {
25 CPUACCT_STAT_NSTATS, 25 CPUACCT_STAT_NSTATS,
26}; 26};
27 27
28enum cpuacct_usage_index { 28static const char * const cpuacct_stat_desc[] = {
29 CPUACCT_USAGE_USER, /* ... user mode */ 29 [CPUACCT_STAT_USER] = "user",
30 CPUACCT_USAGE_SYSTEM, /* ... kernel mode */ 30 [CPUACCT_STAT_SYSTEM] = "system",
31
32 CPUACCT_USAGE_NRUSAGE,
33}; 31};
34 32
35struct cpuacct_usage { 33struct cpuacct_usage {
36 u64 usages[CPUACCT_USAGE_NRUSAGE]; 34 u64 usages[CPUACCT_STAT_NSTATS];
37}; 35};
38 36
39/* track cpu usage of a group of tasks and its child groups */ 37/* track cpu usage of a group of tasks and its child groups */
@@ -108,16 +106,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
108} 106}
109 107
110static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, 108static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
111 enum cpuacct_usage_index index) 109 enum cpuacct_stat_index index)
112{ 110{
113 struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 111 struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
114 u64 data; 112 u64 data;
115 113
116 /* 114 /*
117 * We allow index == CPUACCT_USAGE_NRUSAGE here to read 115 * We allow index == CPUACCT_STAT_NSTATS here to read
118 * the sum of suages. 116 * the sum of suages.
119 */ 117 */
120 BUG_ON(index > CPUACCT_USAGE_NRUSAGE); 118 BUG_ON(index > CPUACCT_STAT_NSTATS);
121 119
122#ifndef CONFIG_64BIT 120#ifndef CONFIG_64BIT
123 /* 121 /*
@@ -126,11 +124,11 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
126 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 124 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
127#endif 125#endif
128 126
129 if (index == CPUACCT_USAGE_NRUSAGE) { 127 if (index == CPUACCT_STAT_NSTATS) {
130 int i = 0; 128 int i = 0;
131 129
132 data = 0; 130 data = 0;
133 for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) 131 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
134 data += cpuusage->usages[i]; 132 data += cpuusage->usages[i];
135 } else { 133 } else {
136 data = cpuusage->usages[index]; 134 data = cpuusage->usages[index];
@@ -155,7 +153,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
155 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 153 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
156#endif 154#endif
157 155
158 for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) 156 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
159 cpuusage->usages[i] = val; 157 cpuusage->usages[i] = val;
160 158
161#ifndef CONFIG_64BIT 159#ifndef CONFIG_64BIT
@@ -165,7 +163,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
165 163
166/* return total cpu usage (in nanoseconds) of a group */ 164/* return total cpu usage (in nanoseconds) of a group */
167static u64 __cpuusage_read(struct cgroup_subsys_state *css, 165static u64 __cpuusage_read(struct cgroup_subsys_state *css,
168 enum cpuacct_usage_index index) 166 enum cpuacct_stat_index index)
169{ 167{
170 struct cpuacct *ca = css_ca(css); 168 struct cpuacct *ca = css_ca(css);
171 u64 totalcpuusage = 0; 169 u64 totalcpuusage = 0;
@@ -180,18 +178,18 @@ static u64 __cpuusage_read(struct cgroup_subsys_state *css,
180static u64 cpuusage_user_read(struct cgroup_subsys_state *css, 178static u64 cpuusage_user_read(struct cgroup_subsys_state *css,
181 struct cftype *cft) 179 struct cftype *cft)
182{ 180{
183 return __cpuusage_read(css, CPUACCT_USAGE_USER); 181 return __cpuusage_read(css, CPUACCT_STAT_USER);
184} 182}
185 183
186static u64 cpuusage_sys_read(struct cgroup_subsys_state *css, 184static u64 cpuusage_sys_read(struct cgroup_subsys_state *css,
187 struct cftype *cft) 185 struct cftype *cft)
188{ 186{
189 return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM); 187 return __cpuusage_read(css, CPUACCT_STAT_SYSTEM);
190} 188}
191 189
192static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) 190static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
193{ 191{
194 return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE); 192 return __cpuusage_read(css, CPUACCT_STAT_NSTATS);
195} 193}
196 194
197static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, 195static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
@@ -213,7 +211,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
213} 211}
214 212
215static int __cpuacct_percpu_seq_show(struct seq_file *m, 213static int __cpuacct_percpu_seq_show(struct seq_file *m,
216 enum cpuacct_usage_index index) 214 enum cpuacct_stat_index index)
217{ 215{
218 struct cpuacct *ca = css_ca(seq_css(m)); 216 struct cpuacct *ca = css_ca(seq_css(m));
219 u64 percpu; 217 u64 percpu;
@@ -229,48 +227,78 @@ static int __cpuacct_percpu_seq_show(struct seq_file *m,
229 227
230static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V) 228static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V)
231{ 229{
232 return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER); 230 return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_USER);
233} 231}
234 232
235static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V) 233static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V)
236{ 234{
237 return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM); 235 return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_SYSTEM);
238} 236}
239 237
240static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) 238static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
241{ 239{
242 return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE); 240 return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS);
243} 241}
244 242
245static const char * const cpuacct_stat_desc[] = { 243static int cpuacct_all_seq_show(struct seq_file *m, void *V)
246 [CPUACCT_STAT_USER] = "user", 244{
247 [CPUACCT_STAT_SYSTEM] = "system", 245 struct cpuacct *ca = css_ca(seq_css(m));
248}; 246 int index;
247 int cpu;
248
249 seq_puts(m, "cpu");
250 for (index = 0; index < CPUACCT_STAT_NSTATS; index++)
251 seq_printf(m, " %s", cpuacct_stat_desc[index]);
252 seq_puts(m, "\n");
253
254 for_each_possible_cpu(cpu) {
255 struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
256
257 seq_printf(m, "%d", cpu);
258
259 for (index = 0; index < CPUACCT_STAT_NSTATS; index++) {
260#ifndef CONFIG_64BIT
261 /*
262 * Take rq->lock to make 64-bit read safe on 32-bit
263 * platforms.
264 */
265 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
266#endif
267
268 seq_printf(m, " %llu", cpuusage->usages[index]);
269
270#ifndef CONFIG_64BIT
271 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
272#endif
273 }
274 seq_puts(m, "\n");
275 }
276 return 0;
277}
249 278
250static int cpuacct_stats_show(struct seq_file *sf, void *v) 279static int cpuacct_stats_show(struct seq_file *sf, void *v)
251{ 280{
252 struct cpuacct *ca = css_ca(seq_css(sf)); 281 struct cpuacct *ca = css_ca(seq_css(sf));
282 s64 val[CPUACCT_STAT_NSTATS];
253 int cpu; 283 int cpu;
254 s64 val = 0; 284 int stat;
255 285
286 memset(val, 0, sizeof(val));
256 for_each_possible_cpu(cpu) { 287 for_each_possible_cpu(cpu) {
257 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); 288 u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
258 val += kcpustat->cpustat[CPUTIME_USER];
259 val += kcpustat->cpustat[CPUTIME_NICE];
260 }
261 val = cputime64_to_clock_t(val);
262 seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
263 289
264 val = 0; 290 val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
265 for_each_possible_cpu(cpu) { 291 val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
266 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); 292 val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
267 val += kcpustat->cpustat[CPUTIME_SYSTEM]; 293 val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
268 val += kcpustat->cpustat[CPUTIME_IRQ]; 294 val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
269 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
270 } 295 }
271 296
272 val = cputime64_to_clock_t(val); 297 for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
273 seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); 298 seq_printf(sf, "%s %lld\n",
299 cpuacct_stat_desc[stat],
300 cputime64_to_clock_t(val[stat]));
301 }
274 302
275 return 0; 303 return 0;
276} 304}
@@ -302,6 +330,10 @@ static struct cftype files[] = {
302 .seq_show = cpuacct_percpu_sys_seq_show, 330 .seq_show = cpuacct_percpu_sys_seq_show,
303 }, 331 },
304 { 332 {
333 .name = "usage_all",
334 .seq_show = cpuacct_all_seq_show,
335 },
336 {
305 .name = "stat", 337 .name = "stat",
306 .seq_show = cpuacct_stats_show, 338 .seq_show = cpuacct_stats_show,
307 }, 339 },
@@ -316,11 +348,11 @@ static struct cftype files[] = {
316void cpuacct_charge(struct task_struct *tsk, u64 cputime) 348void cpuacct_charge(struct task_struct *tsk, u64 cputime)
317{ 349{
318 struct cpuacct *ca; 350 struct cpuacct *ca;
319 int index = CPUACCT_USAGE_SYSTEM; 351 int index = CPUACCT_STAT_SYSTEM;
320 struct pt_regs *regs = task_pt_regs(tsk); 352 struct pt_regs *regs = task_pt_regs(tsk);
321 353
322 if (regs && user_mode(regs)) 354 if (regs && user_mode(regs))
323 index = CPUACCT_USAGE_USER; 355 index = CPUACCT_STAT_USER;
324 356
325 rcu_read_lock(); 357 rcu_read_lock();
326 358
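
Going by the seq_puts()/seq_printf() calls in cpuacct_all_seq_show(), the new cpuacct.usage_all file prints a header naming the accounted states, then one line per possible CPU with the per-state usage in nanoseconds; reading it should give output along these lines (the numbers are made up for illustration):

	cpu user system
	0 25634567898 6282361763
	1 25163092859 5923832113

cpuacct.usage, cpuacct.usage_percpu and cpuacct.stat keep their old formats; usage_all is simply the consolidated view promised in the merge summary.
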
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 75f98c5498d5..3d60e5d76fdb 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -257,7 +257,7 @@ void account_idle_time(cputime_t cputime)
257 cpustat[CPUTIME_IDLE] += (__force u64) cputime; 257 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
258} 258}
259 259
260static __always_inline bool steal_account_process_tick(void) 260static __always_inline unsigned long steal_account_process_tick(unsigned long max_jiffies)
261{ 261{
262#ifdef CONFIG_PARAVIRT 262#ifdef CONFIG_PARAVIRT
263 if (static_key_false(&paravirt_steal_enabled)) { 263 if (static_key_false(&paravirt_steal_enabled)) {
@@ -272,14 +272,14 @@ static __always_inline bool steal_account_process_tick(void)
272 * time in jiffies. Lets cast the result to jiffies 272 * time in jiffies. Lets cast the result to jiffies
273 * granularity and account the rest on the next rounds. 273 * granularity and account the rest on the next rounds.
274 */ 274 */
275 steal_jiffies = nsecs_to_jiffies(steal); 275 steal_jiffies = min(nsecs_to_jiffies(steal), max_jiffies);
276 this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); 276 this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
277 277
278 account_steal_time(jiffies_to_cputime(steal_jiffies)); 278 account_steal_time(jiffies_to_cputime(steal_jiffies));
279 return steal_jiffies; 279 return steal_jiffies;
280 } 280 }
281#endif 281#endif
282 return false; 282 return 0;
283} 283}
284 284
285/* 285/*
@@ -346,7 +346,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
346 u64 cputime = (__force u64) cputime_one_jiffy; 346 u64 cputime = (__force u64) cputime_one_jiffy;
347 u64 *cpustat = kcpustat_this_cpu->cpustat; 347 u64 *cpustat = kcpustat_this_cpu->cpustat;
348 348
349 if (steal_account_process_tick()) 349 if (steal_account_process_tick(ULONG_MAX))
350 return; 350 return;
351 351
352 cputime *= ticks; 352 cputime *= ticks;
@@ -477,7 +477,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
477 return; 477 return;
478 } 478 }
479 479
480 if (steal_account_process_tick()) 480 if (steal_account_process_tick(ULONG_MAX))
481 return; 481 return;
482 482
483 if (user_tick) 483 if (user_tick)
@@ -681,12 +681,14 @@ static cputime_t vtime_delta(struct task_struct *tsk)
681static cputime_t get_vtime_delta(struct task_struct *tsk) 681static cputime_t get_vtime_delta(struct task_struct *tsk)
682{ 682{
683 unsigned long now = READ_ONCE(jiffies); 683 unsigned long now = READ_ONCE(jiffies);
684 unsigned long delta = now - tsk->vtime_snap; 684 unsigned long delta_jiffies, steal_jiffies;
685 685
686 delta_jiffies = now - tsk->vtime_snap;
687 steal_jiffies = steal_account_process_tick(delta_jiffies);
686 WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); 688 WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
687 tsk->vtime_snap = now; 689 tsk->vtime_snap = now;
688 690
689 return jiffies_to_cputime(delta); 691 return jiffies_to_cputime(delta_jiffies - steal_jiffies);
690} 692}
691 693
692static void __vtime_account_system(struct task_struct *tsk) 694static void __vtime_account_system(struct task_struct *tsk)
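
With the new signature, steal_account_process_tick(max_jiffies) clamps the accounted steal to the window it was asked about and returns how many jiffies it consumed, which is what lets get_vtime_delta() charge only the remainder to the task. Worked example at the jiffies granularity used above: if 10 jiffies elapsed since tsk->vtime_snap and the hypervisor reports 3 jiffies of steal, the call accounts 3 to CPUTIME_STEAL, returns 3, and the vtime path hands out jiffies_to_cputime(10 - 3) = 7 jiffies of task/system time instead of the full 10. The tick-based callers pass ULONG_MAX, so their behaviour is unchanged.
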
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 0368c393a336..2a0a9995256d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -879,9 +879,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
879 879
880 nr_switches = p->nvcsw + p->nivcsw; 880 nr_switches = p->nvcsw + p->nivcsw;
881 881
882#ifdef CONFIG_SCHEDSTATS
883 P(se.nr_migrations); 882 P(se.nr_migrations);
884 883
884#ifdef CONFIG_SCHEDSTATS
885 if (schedstat_enabled()) { 885 if (schedstat_enabled()) {
886 u64 avg_atom, avg_per_cpu; 886 u64 avg_atom, avg_per_cpu;
887 887
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c8c5d2d48424..4088eedea763 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -690,6 +690,11 @@ void init_entity_runnable_average(struct sched_entity *se)
690 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ 690 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
691} 691}
692 692
693static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
694static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
695static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force);
696static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
697
693/* 698/*
694 * With new tasks being created, their initial util_avgs are extrapolated 699 * With new tasks being created, their initial util_avgs are extrapolated
695 * based on the cfs_rq's current util_avg: 700 * based on the cfs_rq's current util_avg:
@@ -720,6 +725,8 @@ void post_init_entity_util_avg(struct sched_entity *se)
720 struct cfs_rq *cfs_rq = cfs_rq_of(se); 725 struct cfs_rq *cfs_rq = cfs_rq_of(se);
721 struct sched_avg *sa = &se->avg; 726 struct sched_avg *sa = &se->avg;
722 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; 727 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
728 u64 now = cfs_rq_clock_task(cfs_rq);
729 int tg_update;
723 730
724 if (cap > 0) { 731 if (cap > 0) {
725 if (cfs_rq->avg.util_avg != 0) { 732 if (cfs_rq->avg.util_avg != 0) {
@@ -733,16 +740,42 @@ void post_init_entity_util_avg(struct sched_entity *se)
733 } 740 }
734 sa->util_sum = sa->util_avg * LOAD_AVG_MAX; 741 sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
735 } 742 }
743
744 if (entity_is_task(se)) {
745 struct task_struct *p = task_of(se);
746 if (p->sched_class != &fair_sched_class) {
747 /*
748 * For !fair tasks do:
749 *
750 update_cfs_rq_load_avg(now, cfs_rq, false);
751 attach_entity_load_avg(cfs_rq, se);
752 switched_from_fair(rq, p);
753 *
754 * such that the next switched_to_fair() has the
755 * expected state.
756 */
757 se->avg.last_update_time = now;
758 return;
759 }
760 }
761
762 tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
763 attach_entity_load_avg(cfs_rq, se);
764 if (tg_update)
765 update_tg_load_avg(cfs_rq, false);
736} 766}
737 767
738#else 768#else /* !CONFIG_SMP */
739void init_entity_runnable_average(struct sched_entity *se) 769void init_entity_runnable_average(struct sched_entity *se)
740{ 770{
741} 771}
742void post_init_entity_util_avg(struct sched_entity *se) 772void post_init_entity_util_avg(struct sched_entity *se)
743{ 773{
744} 774}
745#endif 775static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
776{
777}
778#endif /* CONFIG_SMP */
746 779
747/* 780/*
748 * Update the current task's runtime statistics. 781 * Update the current task's runtime statistics.
@@ -1303,6 +1336,8 @@ static void task_numa_assign(struct task_numa_env *env,
1303{ 1336{
1304 if (env->best_task) 1337 if (env->best_task)
1305 put_task_struct(env->best_task); 1338 put_task_struct(env->best_task);
1339 if (p)
1340 get_task_struct(p);
1306 1341
1307 env->best_task = p; 1342 env->best_task = p;
1308 env->best_imp = imp; 1343 env->best_imp = imp;
@@ -1370,31 +1405,11 @@ static void task_numa_compare(struct task_numa_env *env,
1370 long imp = env->p->numa_group ? groupimp : taskimp; 1405 long imp = env->p->numa_group ? groupimp : taskimp;
1371 long moveimp = imp; 1406 long moveimp = imp;
1372 int dist = env->dist; 1407 int dist = env->dist;
1373 bool assigned = false;
1374 1408
1375 rcu_read_lock(); 1409 rcu_read_lock();
1376 1410 cur = task_rcu_dereference(&dst_rq->curr);
1377 raw_spin_lock_irq(&dst_rq->lock); 1411 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
1378 cur = dst_rq->curr;
1379 /*
1380 * No need to move the exiting task or idle task.
1381 */
1382 if ((cur->flags & PF_EXITING) || is_idle_task(cur))
1383 cur = NULL; 1412 cur = NULL;
1384 else {
1385 /*
1386 * The task_struct must be protected here to protect the
1387 * p->numa_faults access in the task_weight since the
1388 * numa_faults could already be freed in the following path:
1389 * finish_task_switch()
1390 * --> put_task_struct()
1391 * --> __put_task_struct()
1392 * --> task_numa_free()
1393 */
1394 get_task_struct(cur);
1395 }
1396
1397 raw_spin_unlock_irq(&dst_rq->lock);
1398 1413
1399 /* 1414 /*
1400 * Because we have preemption enabled we can get migrated around and 1415 * Because we have preemption enabled we can get migrated around and
@@ -1477,7 +1492,6 @@ balance:
1477 */ 1492 */
1478 if (!load_too_imbalanced(src_load, dst_load, env)) { 1493 if (!load_too_imbalanced(src_load, dst_load, env)) {
1479 imp = moveimp - 1; 1494 imp = moveimp - 1;
1480 put_task_struct(cur);
1481 cur = NULL; 1495 cur = NULL;
1482 goto assign; 1496 goto assign;
1483 } 1497 }
@@ -1503,16 +1517,9 @@ balance:
1503 env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); 1517 env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
1504 1518
1505assign: 1519assign:
1506 assigned = true;
1507 task_numa_assign(env, cur, imp); 1520 task_numa_assign(env, cur, imp);
1508unlock: 1521unlock:
1509 rcu_read_unlock(); 1522 rcu_read_unlock();
1510 /*
1511 * The dst_rq->curr isn't assigned. The protection for task_struct is
1512 * finished.
1513 */
1514 if (cur && !assigned)
1515 put_task_struct(cur);
1516} 1523}
1517 1524
1518static void task_numa_find_cpu(struct task_numa_env *env, 1525static void task_numa_find_cpu(struct task_numa_env *env,
@@ -2866,8 +2873,6 @@ void set_task_rq_fair(struct sched_entity *se,
2866static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} 2873static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
2867#endif /* CONFIG_FAIR_GROUP_SCHED */ 2874#endif /* CONFIG_FAIR_GROUP_SCHED */
2868 2875
2869static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
2870
2871static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) 2876static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
2872{ 2877{
2873 struct rq *rq = rq_of(cfs_rq); 2878 struct rq *rq = rq_of(cfs_rq);
@@ -2914,7 +2919,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
2914 WRITE_ONCE(*ptr, res); \ 2919 WRITE_ONCE(*ptr, res); \
2915} while (0) 2920} while (0)
2916 2921
2917/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ 2922/**
2923 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
2924 * @now: current time, as per cfs_rq_clock_task()
2925 * @cfs_rq: cfs_rq to update
2926 * @update_freq: should we call cfs_rq_util_change() or will the call do so
2927 *
2928 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
2929 * avg. The immediate corollary is that all (fair) tasks must be attached, see
2930 * post_init_entity_util_avg().
2931 *
2932 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
2933 *
2934 * Returns true if the load decayed or we removed utilization. It is expected
2935 * that one calls update_tg_load_avg() on this condition, but after you've
2936 * modified the cfs_rq avg (attach/detach), such that we propagate the new
2937 * avg up.
2938 */
2918static inline int 2939static inline int
2919update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) 2940update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
2920{ 2941{
@@ -2969,6 +2990,14 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
2969 update_tg_load_avg(cfs_rq, 0); 2990 update_tg_load_avg(cfs_rq, 0);
2970} 2991}
2971 2992
2993/**
2994 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
2995 * @cfs_rq: cfs_rq to attach to
2996 * @se: sched_entity to attach
2997 *
2998 * Must call update_cfs_rq_load_avg() before this, since we rely on
2999 * cfs_rq->avg.last_update_time being current.
3000 */
2972static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3001static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2973{ 3002{
2974 if (!sched_feat(ATTACH_AGE_LOAD)) 3003 if (!sched_feat(ATTACH_AGE_LOAD))
@@ -2977,6 +3006,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
2977 /* 3006 /*
2978 * If we got migrated (either between CPUs or between cgroups) we'll 3007 * If we got migrated (either between CPUs or between cgroups) we'll
2979 * have aged the average right before clearing @last_update_time. 3008 * have aged the average right before clearing @last_update_time.
3009 *
3010 * Or we're fresh through post_init_entity_util_avg().
2980 */ 3011 */
2981 if (se->avg.last_update_time) { 3012 if (se->avg.last_update_time) {
2982 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), 3013 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
@@ -2998,6 +3029,14 @@ skip_aging:
2998 cfs_rq_util_change(cfs_rq); 3029 cfs_rq_util_change(cfs_rq);
2999} 3030}
3000 3031
3032/**
3033 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
3034 * @cfs_rq: cfs_rq to detach from
3035 * @se: sched_entity to detach
3036 *
3037 * Must call update_cfs_rq_load_avg() before this, since we rely on
3038 * cfs_rq->avg.last_update_time being current.
3039 */
3001static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3040static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3002{ 3041{
3003 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), 3042 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
@@ -3082,11 +3121,14 @@ void remove_entity_load_avg(struct sched_entity *se)
3082 u64 last_update_time; 3121 u64 last_update_time;
3083 3122
3084 /* 3123 /*
3085 * Newly created task or never used group entity should not be removed 3124 * tasks cannot exit without having gone through wake_up_new_task() ->
3086 * from its (source) cfs_rq 3125 * post_init_entity_util_avg() which will have added things to the
3126 * cfs_rq, so we can remove unconditionally.
3127 *
3128 * Similarly for groups, they will have passed through
3129 * post_init_entity_util_avg() before unregister_sched_fair_group()
3130 * calls this.
3087 */ 3131 */
3088 if (se->avg.last_update_time == 0)
3089 return;
3090 3132
3091 last_update_time = cfs_rq_last_update_time(cfs_rq); 3133 last_update_time = cfs_rq_last_update_time(cfs_rq);
3092 3134
@@ -3109,6 +3151,12 @@ static int idle_balance(struct rq *this_rq);
3109 3151
3110#else /* CONFIG_SMP */ 3152#else /* CONFIG_SMP */
3111 3153
3154static inline int
3155update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
3156{
3157 return 0;
3158}
3159
3112static inline void update_load_avg(struct sched_entity *se, int not_used) 3160static inline void update_load_avg(struct sched_entity *se, int not_used)
3113{ 3161{
3114 struct cfs_rq *cfs_rq = cfs_rq_of(se); 3162 struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -3698,7 +3746,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3698static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) 3746static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3699{ 3747{
3700 if (unlikely(cfs_rq->throttle_count)) 3748 if (unlikely(cfs_rq->throttle_count))
3701 return cfs_rq->throttled_clock_task; 3749 return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
3702 3750
3703 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; 3751 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
3704} 3752}
@@ -3836,13 +3884,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
3836 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; 3884 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3837 3885
3838 cfs_rq->throttle_count--; 3886 cfs_rq->throttle_count--;
3839#ifdef CONFIG_SMP
3840 if (!cfs_rq->throttle_count) { 3887 if (!cfs_rq->throttle_count) {
3841 /* adjust cfs_rq_clock_task() */ 3888 /* adjust cfs_rq_clock_task() */
3842 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - 3889 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
3843 cfs_rq->throttled_clock_task; 3890 cfs_rq->throttled_clock_task;
3844 } 3891 }
3845#endif
3846 3892
3847 return 0; 3893 return 0;
3848} 3894}
@@ -4195,26 +4241,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
4195 if (!cfs_bandwidth_used()) 4241 if (!cfs_bandwidth_used())
4196 return; 4242 return;
4197 4243
4198 /* Synchronize hierarchical throttle counter: */
4199 if (unlikely(!cfs_rq->throttle_uptodate)) {
4200 struct rq *rq = rq_of(cfs_rq);
4201 struct cfs_rq *pcfs_rq;
4202 struct task_group *tg;
4203
4204 cfs_rq->throttle_uptodate = 1;
4205
4206 /* Get closest up-to-date node, because leaves go first: */
4207 for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
4208 pcfs_rq = tg->cfs_rq[cpu_of(rq)];
4209 if (pcfs_rq->throttle_uptodate)
4210 break;
4211 }
4212 if (tg) {
4213 cfs_rq->throttle_count = pcfs_rq->throttle_count;
4214 cfs_rq->throttled_clock_task = rq_clock_task(rq);
4215 }
4216 }
4217
4218 /* an active group must be handled by the update_curr()->put() path */ 4244 /* an active group must be handled by the update_curr()->put() path */
4219 if (!cfs_rq->runtime_enabled || cfs_rq->curr) 4245 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
4220 return; 4246 return;
@@ -4229,6 +4255,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
4229 throttle_cfs_rq(cfs_rq); 4255 throttle_cfs_rq(cfs_rq);
4230} 4256}
4231 4257
4258static void sync_throttle(struct task_group *tg, int cpu)
4259{
4260 struct cfs_rq *pcfs_rq, *cfs_rq;
4261
4262 if (!cfs_bandwidth_used())
4263 return;
4264
4265 if (!tg->parent)
4266 return;
4267
4268 cfs_rq = tg->cfs_rq[cpu];
4269 pcfs_rq = tg->parent->cfs_rq[cpu];
4270
4271 cfs_rq->throttle_count = pcfs_rq->throttle_count;
4272 pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
4273}
4274
4232/* conditionally throttle active cfs_rq's from put_prev_entity() */ 4275/* conditionally throttle active cfs_rq's from put_prev_entity() */
4233static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) 4276static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4234{ 4277{
@@ -4368,6 +4411,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4368static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} 4411static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
4369static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } 4412static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
4370static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 4413static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
4414static inline void sync_throttle(struct task_group *tg, int cpu) {}
4371static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 4415static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4372 4416
4373static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) 4417static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
@@ -4476,7 +4520,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4476 * 4520 *
4477 * note: in the case of encountering a throttled cfs_rq we will 4521 * note: in the case of encountering a throttled cfs_rq we will
4478 * post the final h_nr_running increment below. 4522 * post the final h_nr_running increment below.
4479 */ 4523 */
4480 if (cfs_rq_throttled(cfs_rq)) 4524 if (cfs_rq_throttled(cfs_rq))
4481 break; 4525 break;
4482 cfs_rq->h_nr_running++; 4526 cfs_rq->h_nr_running++;
@@ -8317,31 +8361,17 @@ static void task_fork_fair(struct task_struct *p)
8317{ 8361{
8318 struct cfs_rq *cfs_rq; 8362 struct cfs_rq *cfs_rq;
8319 struct sched_entity *se = &p->se, *curr; 8363 struct sched_entity *se = &p->se, *curr;
8320 int this_cpu = smp_processor_id();
8321 struct rq *rq = this_rq(); 8364 struct rq *rq = this_rq();
8322 unsigned long flags;
8323
8324 raw_spin_lock_irqsave(&rq->lock, flags);
8325 8365
8366 raw_spin_lock(&rq->lock);
8326 update_rq_clock(rq); 8367 update_rq_clock(rq);
8327 8368
8328 cfs_rq = task_cfs_rq(current); 8369 cfs_rq = task_cfs_rq(current);
8329 curr = cfs_rq->curr; 8370 curr = cfs_rq->curr;
8330 8371 if (curr) {
8331 /* 8372 update_curr(cfs_rq);
8332 * Not only the cpu but also the task_group of the parent might have
8333 * been changed after parent->se.parent,cfs_rq were copied to
8334 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
8335 * of child point to valid ones.
8336 */
8337 rcu_read_lock();
8338 __set_task_cpu(p, this_cpu);
8339 rcu_read_unlock();
8340
8341 update_curr(cfs_rq);
8342
8343 if (curr)
8344 se->vruntime = curr->vruntime; 8373 se->vruntime = curr->vruntime;
8374 }
8345 place_entity(cfs_rq, se, 1); 8375 place_entity(cfs_rq, se, 1);
8346 8376
8347 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { 8377 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
@@ -8354,8 +8384,7 @@ static void task_fork_fair(struct task_struct *p)
8354 } 8384 }
8355 8385
8356 se->vruntime -= cfs_rq->min_vruntime; 8386 se->vruntime -= cfs_rq->min_vruntime;
8357 8387 raw_spin_unlock(&rq->lock);
8358 raw_spin_unlock_irqrestore(&rq->lock, flags);
8359} 8388}
8360 8389
8361/* 8390/*
@@ -8411,6 +8440,8 @@ static void detach_task_cfs_rq(struct task_struct *p)
8411{ 8440{
8412 struct sched_entity *se = &p->se; 8441 struct sched_entity *se = &p->se;
8413 struct cfs_rq *cfs_rq = cfs_rq_of(se); 8442 struct cfs_rq *cfs_rq = cfs_rq_of(se);
8443 u64 now = cfs_rq_clock_task(cfs_rq);
8444 int tg_update;
8414 8445
8415 if (!vruntime_normalized(p)) { 8446 if (!vruntime_normalized(p)) {
8416 /* 8447 /*
@@ -8422,13 +8453,18 @@ static void detach_task_cfs_rq(struct task_struct *p)
8422 } 8453 }
8423 8454
8424 /* Catch up with the cfs_rq and remove our load when we leave */ 8455 /* Catch up with the cfs_rq and remove our load when we leave */
8456 tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
8425 detach_entity_load_avg(cfs_rq, se); 8457 detach_entity_load_avg(cfs_rq, se);
8458 if (tg_update)
8459 update_tg_load_avg(cfs_rq, false);
8426} 8460}
8427 8461
8428static void attach_task_cfs_rq(struct task_struct *p) 8462static void attach_task_cfs_rq(struct task_struct *p)
8429{ 8463{
8430 struct sched_entity *se = &p->se; 8464 struct sched_entity *se = &p->se;
8431 struct cfs_rq *cfs_rq = cfs_rq_of(se); 8465 struct cfs_rq *cfs_rq = cfs_rq_of(se);
8466 u64 now = cfs_rq_clock_task(cfs_rq);
8467 int tg_update;
8432 8468
8433#ifdef CONFIG_FAIR_GROUP_SCHED 8469#ifdef CONFIG_FAIR_GROUP_SCHED
8434 /* 8470 /*
@@ -8439,7 +8475,10 @@ static void attach_task_cfs_rq(struct task_struct *p)
8439#endif 8475#endif
8440 8476
8441 /* Synchronize task with its cfs_rq */ 8477 /* Synchronize task with its cfs_rq */
8478 tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
8442 attach_entity_load_avg(cfs_rq, se); 8479 attach_entity_load_avg(cfs_rq, se);
8480 if (tg_update)
8481 update_tg_load_avg(cfs_rq, false);
8443 8482
8444 if (!vruntime_normalized(p)) 8483 if (!vruntime_normalized(p))
8445 se->vruntime += cfs_rq->min_vruntime; 8484 se->vruntime += cfs_rq->min_vruntime;
@@ -8499,6 +8538,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
8499} 8538}
8500 8539
8501#ifdef CONFIG_FAIR_GROUP_SCHED 8540#ifdef CONFIG_FAIR_GROUP_SCHED
8541static void task_set_group_fair(struct task_struct *p)
8542{
8543 struct sched_entity *se = &p->se;
8544
8545 set_task_rq(p, task_cpu(p));
8546 se->depth = se->parent ? se->parent->depth + 1 : 0;
8547}
8548
8502static void task_move_group_fair(struct task_struct *p) 8549static void task_move_group_fair(struct task_struct *p)
8503{ 8550{
8504 detach_task_cfs_rq(p); 8551 detach_task_cfs_rq(p);
@@ -8511,6 +8558,19 @@ static void task_move_group_fair(struct task_struct *p)
8511 attach_task_cfs_rq(p); 8558 attach_task_cfs_rq(p);
8512} 8559}
8513 8560
8561static void task_change_group_fair(struct task_struct *p, int type)
8562{
8563 switch (type) {
8564 case TASK_SET_GROUP:
8565 task_set_group_fair(p);
8566 break;
8567
8568 case TASK_MOVE_GROUP:
8569 task_move_group_fair(p);
8570 break;
8571 }
8572}
8573
8514void free_fair_sched_group(struct task_group *tg) 8574void free_fair_sched_group(struct task_group *tg)
8515{ 8575{
8516 int i; 8576 int i;
@@ -8562,10 +8622,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8562 init_cfs_rq(cfs_rq); 8622 init_cfs_rq(cfs_rq);
8563 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); 8623 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8564 init_entity_runnable_average(se); 8624 init_entity_runnable_average(se);
8565
8566 raw_spin_lock_irq(&rq->lock);
8567 post_init_entity_util_avg(se);
8568 raw_spin_unlock_irq(&rq->lock);
8569 } 8625 }
8570 8626
8571 return 1; 8627 return 1;
@@ -8576,6 +8632,23 @@ err:
8576 return 0; 8632 return 0;
8577} 8633}
8578 8634
8635void online_fair_sched_group(struct task_group *tg)
8636{
8637 struct sched_entity *se;
8638 struct rq *rq;
8639 int i;
8640
8641 for_each_possible_cpu(i) {
8642 rq = cpu_rq(i);
8643 se = tg->se[i];
8644
8645 raw_spin_lock_irq(&rq->lock);
8646 post_init_entity_util_avg(se);
8647 sync_throttle(tg, i);
8648 raw_spin_unlock_irq(&rq->lock);
8649 }
8650}
8651
8579void unregister_fair_sched_group(struct task_group *tg) 8652void unregister_fair_sched_group(struct task_group *tg)
8580{ 8653{
8581 unsigned long flags; 8654 unsigned long flags;
@@ -8680,6 +8753,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8680 return 1; 8753 return 1;
8681} 8754}
8682 8755
8756void online_fair_sched_group(struct task_group *tg) { }
8757
8683void unregister_fair_sched_group(struct task_group *tg) { } 8758void unregister_fair_sched_group(struct task_group *tg) { }
8684 8759
8685#endif /* CONFIG_FAIR_GROUP_SCHED */ 8760#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -8739,7 +8814,7 @@ const struct sched_class fair_sched_class = {
8739 .update_curr = update_curr_fair, 8814 .update_curr = update_curr_fair,
8740 8815
8741#ifdef CONFIG_FAIR_GROUP_SCHED 8816#ifdef CONFIG_FAIR_GROUP_SCHED
8742 .task_move_group = task_move_group_fair, 8817 .task_change_group = task_change_group_fair,
8743#endif 8818#endif
8744}; 8819};
8745 8820
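
Several fair.c hunks above (post_init_entity_util_avg(), attach_task_cfs_rq(), detach_task_cfs_rq()) converge on one rule: bring the cfs_rq average up to date before attaching or detaching an entity, and propagate to the task group only if the average actually moved. The shared sequence, lifted from those hunks as a reference sketch (variable names simplified):

	u64 now = cfs_rq_clock_task(cfs_rq);
	int decayed;

	decayed = update_cfs_rq_load_avg(now, cfs_rq, false);	/* age cfs_rq->avg to 'now' */
	attach_entity_load_avg(cfs_rq, se);			/* needs last_update_time to be current */
	if (decayed)
		update_tg_load_avg(cfs_rq, false);		/* push the new sums up the group tree */
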
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c5aeedf4e93a..9fb873cfc75c 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -201,6 +201,8 @@ exit_idle:
201 */ 201 */
202static void cpu_idle_loop(void) 202static void cpu_idle_loop(void)
203{ 203{
204 int cpu = smp_processor_id();
205
204 while (1) { 206 while (1) {
205 /* 207 /*
206 * If the arch has a polling bit, we maintain an invariant: 208 * If the arch has a polling bit, we maintain an invariant:
@@ -219,7 +221,7 @@ static void cpu_idle_loop(void)
219 check_pgt_cache(); 221 check_pgt_cache();
220 rmb(); 222 rmb();
221 223
222 if (cpu_is_offline(smp_processor_id())) { 224 if (cpu_is_offline(cpu)) {
223 cpuhp_report_idle_dead(); 225 cpuhp_report_idle_dead();
224 arch_cpu_idle_dead(); 226 arch_cpu_idle_dead();
225 } 227 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 81283592942b..c64fc5114004 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -321,6 +321,7 @@ extern int tg_nop(struct task_group *tg, void *data);
321 321
322extern void free_fair_sched_group(struct task_group *tg); 322extern void free_fair_sched_group(struct task_group *tg);
323extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); 323extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
324extern void online_fair_sched_group(struct task_group *tg);
324extern void unregister_fair_sched_group(struct task_group *tg); 325extern void unregister_fair_sched_group(struct task_group *tg);
325extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 326extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
326 struct sched_entity *se, int cpu, 327 struct sched_entity *se, int cpu,
@@ -437,7 +438,7 @@ struct cfs_rq {
437 438
438 u64 throttled_clock, throttled_clock_task; 439 u64 throttled_clock, throttled_clock_task;
439 u64 throttled_clock_task_time; 440 u64 throttled_clock_task_time;
440 int throttled, throttle_count, throttle_uptodate; 441 int throttled, throttle_count;
441 struct list_head throttled_list; 442 struct list_head throttled_list;
442#endif /* CONFIG_CFS_BANDWIDTH */ 443#endif /* CONFIG_CFS_BANDWIDTH */
443#endif /* CONFIG_FAIR_GROUP_SCHED */ 444#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -1246,8 +1247,11 @@ struct sched_class {
1246 1247
1247 void (*update_curr) (struct rq *rq); 1248 void (*update_curr) (struct rq *rq);
1248 1249
1250#define TASK_SET_GROUP 0
1251#define TASK_MOVE_GROUP 1
1252
1249#ifdef CONFIG_FAIR_GROUP_SCHED 1253#ifdef CONFIG_FAIR_GROUP_SCHED
1250 void (*task_move_group) (struct task_struct *p); 1254 void (*task_change_group) (struct task_struct *p, int type);
1251#endif 1255#endif
1252}; 1256};
1253 1257
@@ -1809,16 +1813,3 @@ static inline void cpufreq_trigger_update(u64 time) {}
1809#else /* arch_scale_freq_capacity */ 1813#else /* arch_scale_freq_capacity */
1810#define arch_scale_freq_invariant() (false) 1814#define arch_scale_freq_invariant() (false)
1811#endif 1815#endif
1812
1813static inline void account_reset_rq(struct rq *rq)
1814{
1815#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1816 rq->prev_irq_time = 0;
1817#endif
1818#ifdef CONFIG_PARAVIRT
1819 rq->prev_steal_time = 0;
1820#endif
1821#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
1822 rq->prev_steal_time_rq = 0;
1823#endif
1824}