| author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-07-25 16:59:34 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-07-25 16:59:34 -0400 |
| commit | cca08cd66ce6cc37812b6b36986ba7eaabd33e0b (patch) | |
| tree | f68966cff4e888f51bd18497c358662c14cab6a2 | |
| parent | 7e4dc77b2869a683fc43c0394fca5441816390ba (diff) | |
| parent | 748c7201e622d1c24abb4f85072d2e74d12f295f (diff) | |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
- introduce and use task_rcu_dereference()/try_get_task_struct() to fix
and generalize task_struct handling (Oleg Nesterov)
- do various per entity load tracking (PELT) fixes and optimizations
(Peter Zijlstra)
- cputime virt-steal time accounting enhancements/fixes (Wanpeng Li)
- introduce consolidated cputime output file cpuacct.usage_all and
related refactorings (Zhao Lei)
- ... plus misc fixes and enhancements
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/core: Panic on scheduling while atomic bugs if kernel.panic_on_warn is set
sched/cpuacct: Introduce cpuacct.usage_all to show all CPU stats together
sched/cpuacct: Use loop to consolidate code in cpuacct_stats_show()
sched/cpuacct: Merge cpuacct_usage_index and cpuacct_stat_index enums
sched/fair: Rework throttle_count sync
sched/core: Fix sched_getaffinity() return value kerneldoc comment
sched/fair: Reorder cgroup creation code
sched/fair: Apply more PELT fixes
sched/fair: Fix PELT integrity for new tasks
sched/cgroup: Fix cpu_cgroup_fork() handling
sched/fair: Fix PELT integrity for new groups
sched/fair: Fix and optimize the fork() path
sched/cputime: Add steal time support to full dynticks CPU time accounting
sched/cputime: Fix prev steal time accouting during CPU hotplug
KVM: Fix steal clock warp during guest CPU hotplug
sched/debug: Always show 'nr_migrations'
sched/fair: Use task_rcu_dereference()
sched/api: Introduce task_rcu_dereference() and try_get_task_struct()
sched/idle: Optimize the generic idle loop
sched/fair: Fix the wrong throttled clock time for cfs_rq_clock_task()
| -rw-r--r-- | arch/x86/kernel/kvm.c | 2 |
| -rw-r--r-- | include/linux/sched.h | 8 |
| -rw-r--r-- | kernel/exit.c | 76 |
| -rw-r--r-- | kernel/sched/core.c | 114 |
| -rw-r--r-- | kernel/sched/cpuacct.c | 114 |
| -rw-r--r-- | kernel/sched/cputime.c | 16 |
| -rw-r--r-- | kernel/sched/debug.c | 2 |
| -rw-r--r-- | kernel/sched/fair.c | 251 |
| -rw-r--r-- | kernel/sched/idle.c | 4 |
| -rw-r--r-- | kernel/sched/sched.h | 21 |
10 files changed, 418 insertions, 190 deletions
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index eea2a6f72b31..1ef5e48b3a36 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -301,8 +301,6 @@ static void kvm_register_steal_time(void) | |||
301 | if (!has_steal_clock) | 301 | if (!has_steal_clock) |
302 | return; | 302 | return; |
303 | 303 | ||
304 | memset(st, 0, sizeof(*st)); | ||
305 | |||
306 | wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); | 304 | wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); |
307 | pr_info("kvm-stealtime: cpu %d, msr %llx\n", | 305 | pr_info("kvm-stealtime: cpu %d, msr %llx\n", |
308 | cpu, (unsigned long long) slow_virt_to_phys(st)); | 306 | cpu, (unsigned long long) slow_virt_to_phys(st)); |
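
Taken together with the account_reset_rq() removal in the kernel/sched/core.c hunks further down, the idea behind dropping the memset() is that steal time is accounted as the difference between an ever-growing counter published by the hypervisor and a per-runqueue snapshot of what has already been accounted (see the prev_steal_time updates in the kernel/sched/cputime.c hunks below). Resetting either side across a CPU offline/online cycle makes the next delta warp. A toy userspace model of that bookkeeping, illustration only and not kernel code:

```c
#include <stdio.h>

/* Toy model of steal-time delta accounting (illustration only). */
static unsigned long long hypervisor_steal;	/* stands in for kvm_steal_time.steal */
static unsigned long long prev_steal_time;	/* stands in for rq->prev_steal_time  */

static unsigned long long account_steal(void)
{
	unsigned long long delta = hypervisor_steal - prev_steal_time;

	prev_steal_time += delta;	/* remember what has been accounted already */
	return delta;
}

int main(void)
{
	hypervisor_steal = 1000;
	printf("tick: accounted %llu\n", account_steal());	/* 1000 */

	hypervisor_steal += 50;
	printf("tick: accounted %llu\n", account_steal());	/* 50 */

	/* CPU offline/online: resetting only one side corrupts the next delta. */
	prev_steal_time = 0;
	hypervisor_steal += 50;
	printf("after hotplug: accounted %llu (spike)\n", account_steal());
	return 0;
}
```
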
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 253538f29ade..d99218a1e043 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -219,9 +219,10 @@ extern void proc_sched_set_task(struct task_struct *p); | |||
219 | #define TASK_WAKING 256 | 219 | #define TASK_WAKING 256 |
220 | #define TASK_PARKED 512 | 220 | #define TASK_PARKED 512 |
221 | #define TASK_NOLOAD 1024 | 221 | #define TASK_NOLOAD 1024 |
222 | #define TASK_STATE_MAX 2048 | 222 | #define TASK_NEW 2048 |
223 | #define TASK_STATE_MAX 4096 | ||
223 | 224 | ||
224 | #define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN" | 225 | #define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn" |
225 | 226 | ||
226 | extern char ___assert_task_state[1 - 2*!!( | 227 | extern char ___assert_task_state[1 - 2*!!( |
227 | sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; | 228 | sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; |
@@ -2139,6 +2140,9 @@ static inline void put_task_struct(struct task_struct *t) | |||
2139 | __put_task_struct(t); | 2140 | __put_task_struct(t); |
2140 | } | 2141 | } |
2141 | 2142 | ||
2143 | struct task_struct *task_rcu_dereference(struct task_struct **ptask); | ||
2144 | struct task_struct *try_get_task_struct(struct task_struct **ptask); | ||
2145 | |||
2142 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | 2146 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN |
2143 | extern void task_cputime(struct task_struct *t, | 2147 | extern void task_cputime(struct task_struct *t, |
2144 | cputime_t *utime, cputime_t *stime); | 2148 | cputime_t *utime, cputime_t *stime); |
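
The first hunk above only builds because both sides of the ___assert_task_state comparison were bumped together: the state-character string grew from 12 to 13 characters when 'n' was appended for TASK_NEW, and ilog2(TASK_STATE_MAX) moved from 11 to 12. A standalone restatement of that arithmetic, using constants copied from the hunk rather than the real kernel header:

```c
/* Sketch: the same length check as ___assert_task_state, outside the kernel. */
#include <assert.h>

#define TASK_NEW               2048
#define TASK_STATE_MAX         4096                    /* TASK_NEW << 1 */
#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn"         /* 'n' appended for TASK_NEW */

/* ilog2(4096) == 12, so the string must be 12 + 1 == 13 characters long. */
static_assert(sizeof(TASK_STATE_TO_CHAR_STR) - 1 == 13, "state chars out of sync");

int main(void) { return 0; }
```
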
diff --git a/kernel/exit.c b/kernel/exit.c
index 0b40791b9e70..84ae830234f8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -211,6 +211,82 @@ repeat: | |||
211 | } | 211 | } |
212 | 212 | ||
213 | /* | 213 | /* |
214 | * Note that if this function returns a valid task_struct pointer (!NULL) | ||
215 | * task->usage must remain >0 for the duration of the RCU critical section. | ||
216 | */ | ||
217 | struct task_struct *task_rcu_dereference(struct task_struct **ptask) | ||
218 | { | ||
219 | struct sighand_struct *sighand; | ||
220 | struct task_struct *task; | ||
221 | |||
222 | /* | ||
223 | * We need to verify that release_task() was not called and thus | ||
224 | * delayed_put_task_struct() can't run and drop the last reference | ||
225 | * before rcu_read_unlock(). We check task->sighand != NULL, | ||
226 | * but we can read the already freed and reused memory. | ||
227 | */ | ||
228 | retry: | ||
229 | task = rcu_dereference(*ptask); | ||
230 | if (!task) | ||
231 | return NULL; | ||
232 | |||
233 | probe_kernel_address(&task->sighand, sighand); | ||
234 | |||
235 | /* | ||
236 | * Pairs with atomic_dec_and_test() in put_task_struct(). If this task | ||
237 | * was already freed we can not miss the preceding update of this | ||
238 | * pointer. | ||
239 | */ | ||
240 | smp_rmb(); | ||
241 | if (unlikely(task != READ_ONCE(*ptask))) | ||
242 | goto retry; | ||
243 | |||
244 | /* | ||
245 | * We've re-checked that "task == *ptask", now we have two different | ||
246 | * cases: | ||
247 | * | ||
248 | * 1. This is actually the same task/task_struct. In this case | ||
249 | * sighand != NULL tells us it is still alive. | ||
250 | * | ||
251 | * 2. This is another task which got the same memory for task_struct. | ||
252 | * We can't know this of course, and we can not trust | ||
253 | * sighand != NULL. | ||
254 | * | ||
255 | * In this case we actually return a random value, but this is | ||
256 | * correct. | ||
257 | * | ||
258 | * If we return NULL - we can pretend that we actually noticed that | ||
259 | * *ptask was updated when the previous task has exited. Or pretend | ||
260 | * that probe_slab_address(&sighand) reads NULL. | ||
261 | * | ||
262 | * If we return the new task (because sighand is not NULL for any | ||
263 | * reason) - this is fine too. This (new) task can't go away before | ||
264 | * another gp pass. | ||
265 | * | ||
266 | * And note: We could even eliminate the false positive if re-read | ||
267 | * task->sighand once again to avoid the falsely NULL. But this case | ||
268 | * is very unlikely so we don't care. | ||
269 | */ | ||
270 | if (!sighand) | ||
271 | return NULL; | ||
272 | |||
273 | return task; | ||
274 | } | ||
275 | |||
276 | struct task_struct *try_get_task_struct(struct task_struct **ptask) | ||
277 | { | ||
278 | struct task_struct *task; | ||
279 | |||
280 | rcu_read_lock(); | ||
281 | task = task_rcu_dereference(ptask); | ||
282 | if (task) | ||
283 | get_task_struct(task); | ||
284 | rcu_read_unlock(); | ||
285 | |||
286 | return task; | ||
287 | } | ||
288 | |||
289 | /* | ||
214 | * Determine if a process group is "orphaned", according to the POSIX | 290 | * Determine if a process group is "orphaned", according to the POSIX |
215 | * definition in 2.2.2.52. Orphaned process groups are not to be affected | 291 | * definition in 2.2.2.52. Orphaned process groups are not to be affected |
216 | * by terminal-generated stop signals. Newly orphaned process groups are | 292 | * by terminal-generated stop signals. Newly orphaned process groups are |
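
A minimal sketch of how a caller might use the new helper, assuming kernel context; the function below is hypothetical and only shows the reference-counting contract: a non-NULL return carries a reference that must be dropped with put_task_struct().

```c
#include <linux/sched.h>
#include <linux/printk.h>

/*
 * Hypothetical caller: pin whichever task *ptask currently points to
 * (for example a runqueue's ->curr) so it can still be inspected after
 * the RCU read-side section inside try_get_task_struct() has ended.
 */
static void inspect_task(struct task_struct **ptask)
{
	struct task_struct *p;

	p = try_get_task_struct(ptask);		/* NULL if the task is already gone */
	if (!p)
		return;

	/* We hold a reference, so the task_struct cannot be freed under us. */
	pr_info("comm=%s pid=%d\n", p->comm, task_pid_nr(p));

	put_task_struct(p);			/* drop the reference taken above */
}
```
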
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index af0ef74df23c..5c883fe8e440 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2342,11 +2342,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
2342 | 2342 | ||
2343 | __sched_fork(clone_flags, p); | 2343 | __sched_fork(clone_flags, p); |
2344 | /* | 2344 | /* |
2345 | * We mark the process as running here. This guarantees that | 2345 | * We mark the process as NEW here. This guarantees that |
2346 | * nobody will actually run it, and a signal or other external | 2346 | * nobody will actually run it, and a signal or other external |
2347 | * event cannot wake it up and insert it on the runqueue either. | 2347 | * event cannot wake it up and insert it on the runqueue either. |
2348 | */ | 2348 | */ |
2349 | p->state = TASK_RUNNING; | 2349 | p->state = TASK_NEW; |
2350 | 2350 | ||
2351 | /* | 2351 | /* |
2352 | * Make sure we do not leak PI boosting priority to the child. | 2352 | * Make sure we do not leak PI boosting priority to the child. |
@@ -2383,8 +2383,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
2383 | p->sched_class = &fair_sched_class; | 2383 | p->sched_class = &fair_sched_class; |
2384 | } | 2384 | } |
2385 | 2385 | ||
2386 | if (p->sched_class->task_fork) | 2386 | init_entity_runnable_average(&p->se); |
2387 | p->sched_class->task_fork(p); | ||
2388 | 2387 | ||
2389 | /* | 2388 | /* |
2390 | * The child is not yet in the pid-hash so no cgroup attach races, | 2389 | * The child is not yet in the pid-hash so no cgroup attach races, |
@@ -2394,7 +2393,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
2394 | * Silence PROVE_RCU. | 2393 | * Silence PROVE_RCU. |
2395 | */ | 2394 | */ |
2396 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 2395 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2397 | set_task_cpu(p, cpu); | 2396 | /* |
2397 | * We're setting the cpu for the first time, we don't migrate, | ||
2398 | * so use __set_task_cpu(). | ||
2399 | */ | ||
2400 | __set_task_cpu(p, cpu); | ||
2401 | if (p->sched_class->task_fork) | ||
2402 | p->sched_class->task_fork(p); | ||
2398 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 2403 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
2399 | 2404 | ||
2400 | #ifdef CONFIG_SCHED_INFO | 2405 | #ifdef CONFIG_SCHED_INFO |
@@ -2526,16 +2531,18 @@ void wake_up_new_task(struct task_struct *p) | |||
2526 | struct rq_flags rf; | 2531 | struct rq_flags rf; |
2527 | struct rq *rq; | 2532 | struct rq *rq; |
2528 | 2533 | ||
2529 | /* Initialize new task's runnable average */ | ||
2530 | init_entity_runnable_average(&p->se); | ||
2531 | raw_spin_lock_irqsave(&p->pi_lock, rf.flags); | 2534 | raw_spin_lock_irqsave(&p->pi_lock, rf.flags); |
2535 | p->state = TASK_RUNNING; | ||
2532 | #ifdef CONFIG_SMP | 2536 | #ifdef CONFIG_SMP |
2533 | /* | 2537 | /* |
2534 | * Fork balancing, do it here and not earlier because: | 2538 | * Fork balancing, do it here and not earlier because: |
2535 | * - cpus_allowed can change in the fork path | 2539 | * - cpus_allowed can change in the fork path |
2536 | * - any previously selected cpu might disappear through hotplug | 2540 | * - any previously selected cpu might disappear through hotplug |
2541 | * | ||
2542 | * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, | ||
2543 | * as we're not fully set-up yet. | ||
2537 | */ | 2544 | */ |
2538 | set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); | 2545 | __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); |
2539 | #endif | 2546 | #endif |
2540 | rq = __task_rq_lock(p, &rf); | 2547 | rq = __task_rq_lock(p, &rf); |
2541 | post_init_entity_util_avg(&p->se); | 2548 | post_init_entity_util_avg(&p->se); |
@@ -3161,6 +3168,9 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
3161 | pr_cont("\n"); | 3168 | pr_cont("\n"); |
3162 | } | 3169 | } |
3163 | #endif | 3170 | #endif |
3171 | if (panic_on_warn) | ||
3172 | panic("scheduling while atomic\n"); | ||
3173 | |||
3164 | dump_stack(); | 3174 | dump_stack(); |
3165 | add_taint(TAINT_WARN, LOCKDEP_STILL_OK); | 3175 | add_taint(TAINT_WARN, LOCKDEP_STILL_OK); |
3166 | } | 3176 | } |
@@ -4752,7 +4762,8 @@ out_unlock: | |||
4752 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | 4762 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr |
4753 | * @user_mask_ptr: user-space pointer to hold the current cpu mask | 4763 | * @user_mask_ptr: user-space pointer to hold the current cpu mask |
4754 | * | 4764 | * |
4755 | * Return: 0 on success. An error code otherwise. | 4765 | * Return: size of CPU mask copied to user_mask_ptr on success. An |
4766 | * error code otherwise. | ||
4756 | */ | 4767 | */ |
4757 | SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, | 4768 | SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, |
4758 | unsigned long __user *, user_mask_ptr) | 4769 | unsigned long __user *, user_mask_ptr) |
@@ -7233,7 +7244,6 @@ static void sched_rq_cpu_starting(unsigned int cpu) | |||
7233 | struct rq *rq = cpu_rq(cpu); | 7244 | struct rq *rq = cpu_rq(cpu); |
7234 | 7245 | ||
7235 | rq->calc_load_update = calc_load_update; | 7246 | rq->calc_load_update = calc_load_update; |
7236 | account_reset_rq(rq); | ||
7237 | update_max_interval(); | 7247 | update_max_interval(); |
7238 | } | 7248 | } |
7239 | 7249 | ||
@@ -7713,6 +7723,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) | |||
7713 | INIT_LIST_HEAD(&tg->children); | 7723 | INIT_LIST_HEAD(&tg->children); |
7714 | list_add_rcu(&tg->siblings, &parent->children); | 7724 | list_add_rcu(&tg->siblings, &parent->children); |
7715 | spin_unlock_irqrestore(&task_group_lock, flags); | 7725 | spin_unlock_irqrestore(&task_group_lock, flags); |
7726 | |||
7727 | online_fair_sched_group(tg); | ||
7716 | } | 7728 | } |
7717 | 7729 | ||
7718 | /* rcu callback to free various structures associated with a task group */ | 7730 | /* rcu callback to free various structures associated with a task group */ |
@@ -7741,27 +7753,9 @@ void sched_offline_group(struct task_group *tg) | |||
7741 | spin_unlock_irqrestore(&task_group_lock, flags); | 7753 | spin_unlock_irqrestore(&task_group_lock, flags); |
7742 | } | 7754 | } |
7743 | 7755 | ||
7744 | /* change task's runqueue when it moves between groups. | 7756 | static void sched_change_group(struct task_struct *tsk, int type) |
7745 | * The caller of this function should have put the task in its new group | ||
7746 | * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to | ||
7747 | * reflect its new group. | ||
7748 | */ | ||
7749 | void sched_move_task(struct task_struct *tsk) | ||
7750 | { | 7757 | { |
7751 | struct task_group *tg; | 7758 | struct task_group *tg; |
7752 | int queued, running; | ||
7753 | struct rq_flags rf; | ||
7754 | struct rq *rq; | ||
7755 | |||
7756 | rq = task_rq_lock(tsk, &rf); | ||
7757 | |||
7758 | running = task_current(rq, tsk); | ||
7759 | queued = task_on_rq_queued(tsk); | ||
7760 | |||
7761 | if (queued) | ||
7762 | dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); | ||
7763 | if (unlikely(running)) | ||
7764 | put_prev_task(rq, tsk); | ||
7765 | 7759 | ||
7766 | /* | 7760 | /* |
7767 | * All callers are synchronized by task_rq_lock(); we do not use RCU | 7761 | * All callers are synchronized by task_rq_lock(); we do not use RCU |
@@ -7774,11 +7768,37 @@ void sched_move_task(struct task_struct *tsk) | |||
7774 | tsk->sched_task_group = tg; | 7768 | tsk->sched_task_group = tg; |
7775 | 7769 | ||
7776 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7770 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7777 | if (tsk->sched_class->task_move_group) | 7771 | if (tsk->sched_class->task_change_group) |
7778 | tsk->sched_class->task_move_group(tsk); | 7772 | tsk->sched_class->task_change_group(tsk, type); |
7779 | else | 7773 | else |
7780 | #endif | 7774 | #endif |
7781 | set_task_rq(tsk, task_cpu(tsk)); | 7775 | set_task_rq(tsk, task_cpu(tsk)); |
7776 | } | ||
7777 | |||
7778 | /* | ||
7779 | * Change task's runqueue when it moves between groups. | ||
7780 | * | ||
7781 | * The caller of this function should have put the task in its new group by | ||
7782 | * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect | ||
7783 | * its new group. | ||
7784 | */ | ||
7785 | void sched_move_task(struct task_struct *tsk) | ||
7786 | { | ||
7787 | int queued, running; | ||
7788 | struct rq_flags rf; | ||
7789 | struct rq *rq; | ||
7790 | |||
7791 | rq = task_rq_lock(tsk, &rf); | ||
7792 | |||
7793 | running = task_current(rq, tsk); | ||
7794 | queued = task_on_rq_queued(tsk); | ||
7795 | |||
7796 | if (queued) | ||
7797 | dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); | ||
7798 | if (unlikely(running)) | ||
7799 | put_prev_task(rq, tsk); | ||
7800 | |||
7801 | sched_change_group(tsk, TASK_MOVE_GROUP); | ||
7782 | 7802 | ||
7783 | if (unlikely(running)) | 7803 | if (unlikely(running)) |
7784 | tsk->sched_class->set_curr_task(rq); | 7804 | tsk->sched_class->set_curr_task(rq); |
@@ -8206,15 +8226,27 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) | |||
8206 | sched_free_group(tg); | 8226 | sched_free_group(tg); |
8207 | } | 8227 | } |
8208 | 8228 | ||
8229 | /* | ||
8230 | * This is called before wake_up_new_task(), therefore we really only | ||
8231 | * have to set its group bits, all the other stuff does not apply. | ||
8232 | */ | ||
8209 | static void cpu_cgroup_fork(struct task_struct *task) | 8233 | static void cpu_cgroup_fork(struct task_struct *task) |
8210 | { | 8234 | { |
8211 | sched_move_task(task); | 8235 | struct rq_flags rf; |
8236 | struct rq *rq; | ||
8237 | |||
8238 | rq = task_rq_lock(task, &rf); | ||
8239 | |||
8240 | sched_change_group(task, TASK_SET_GROUP); | ||
8241 | |||
8242 | task_rq_unlock(rq, task, &rf); | ||
8212 | } | 8243 | } |
8213 | 8244 | ||
8214 | static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) | 8245 | static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) |
8215 | { | 8246 | { |
8216 | struct task_struct *task; | 8247 | struct task_struct *task; |
8217 | struct cgroup_subsys_state *css; | 8248 | struct cgroup_subsys_state *css; |
8249 | int ret = 0; | ||
8218 | 8250 | ||
8219 | cgroup_taskset_for_each(task, css, tset) { | 8251 | cgroup_taskset_for_each(task, css, tset) { |
8220 | #ifdef CONFIG_RT_GROUP_SCHED | 8252 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -8225,8 +8257,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) | |||
8225 | if (task->sched_class != &fair_sched_class) | 8257 | if (task->sched_class != &fair_sched_class) |
8226 | return -EINVAL; | 8258 | return -EINVAL; |
8227 | #endif | 8259 | #endif |
8260 | /* | ||
8261 | * Serialize against wake_up_new_task() such that if its | ||
8262 | * running, we're sure to observe its full state. | ||
8263 | */ | ||
8264 | raw_spin_lock_irq(&task->pi_lock); | ||
8265 | /* | ||
8266 | * Avoid calling sched_move_task() before wake_up_new_task() | ||
8267 | * has happened. This would lead to problems with PELT, due to | ||
8268 | * move wanting to detach+attach while we're not attached yet. | ||
8269 | */ | ||
8270 | if (task->state == TASK_NEW) | ||
8271 | ret = -EINVAL; | ||
8272 | raw_spin_unlock_irq(&task->pi_lock); | ||
8273 | |||
8274 | if (ret) | ||
8275 | break; | ||
8228 | } | 8276 | } |
8229 | return 0; | 8277 | return ret; |
8230 | } | 8278 | } |
8231 | 8279 | ||
8232 | static void cpu_cgroup_attach(struct cgroup_taskset *tset) | 8280 | static void cpu_cgroup_attach(struct cgroup_taskset *tset) |
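
The sched_getaffinity() kerneldoc fix in this file documents behaviour that is easy to miss from user space: the raw system call returns the number of bytes of CPU mask copied to the user buffer, while the glibc wrapper maps that to 0 on success. A small userspace check of the raw return value (Linux only):

```c
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	unsigned long mask[128];	/* large enough for any realistic nr_cpu_ids */
	long ret;

	memset(mask, 0, sizeof(mask));
	/* Raw syscall: returns the size of the copied CPU mask, not 0. */
	ret = syscall(SYS_sched_getaffinity, 0, sizeof(mask), mask);
	if (ret < 0) {
		perror("sched_getaffinity");
		return 1;
	}
	printf("kernel copied %ld bytes of CPU mask\n", ret);
	return 0;
}
```
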
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 41f85c4d0938..bc0b309c3f19 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -25,15 +25,13 @@ enum cpuacct_stat_index { | |||
25 | CPUACCT_STAT_NSTATS, | 25 | CPUACCT_STAT_NSTATS, |
26 | }; | 26 | }; |
27 | 27 | ||
28 | enum cpuacct_usage_index { | 28 | static const char * const cpuacct_stat_desc[] = { |
29 | CPUACCT_USAGE_USER, /* ... user mode */ | 29 | [CPUACCT_STAT_USER] = "user", |
30 | CPUACCT_USAGE_SYSTEM, /* ... kernel mode */ | 30 | [CPUACCT_STAT_SYSTEM] = "system", |
31 | |||
32 | CPUACCT_USAGE_NRUSAGE, | ||
33 | }; | 31 | }; |
34 | 32 | ||
35 | struct cpuacct_usage { | 33 | struct cpuacct_usage { |
36 | u64 usages[CPUACCT_USAGE_NRUSAGE]; | 34 | u64 usages[CPUACCT_STAT_NSTATS]; |
37 | }; | 35 | }; |
38 | 36 | ||
39 | /* track cpu usage of a group of tasks and its child groups */ | 37 | /* track cpu usage of a group of tasks and its child groups */ |
@@ -108,16 +106,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css) | |||
108 | } | 106 | } |
109 | 107 | ||
110 | static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, | 108 | static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, |
111 | enum cpuacct_usage_index index) | 109 | enum cpuacct_stat_index index) |
112 | { | 110 | { |
113 | struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | 111 | struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); |
114 | u64 data; | 112 | u64 data; |
115 | 113 | ||
116 | /* | 114 | /* |
117 | * We allow index == CPUACCT_USAGE_NRUSAGE here to read | 115 | * We allow index == CPUACCT_STAT_NSTATS here to read |
118 | * the sum of suages. | 116 | * the sum of suages. |
119 | */ | 117 | */ |
120 | BUG_ON(index > CPUACCT_USAGE_NRUSAGE); | 118 | BUG_ON(index > CPUACCT_STAT_NSTATS); |
121 | 119 | ||
122 | #ifndef CONFIG_64BIT | 120 | #ifndef CONFIG_64BIT |
123 | /* | 121 | /* |
@@ -126,11 +124,11 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, | |||
126 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | 124 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); |
127 | #endif | 125 | #endif |
128 | 126 | ||
129 | if (index == CPUACCT_USAGE_NRUSAGE) { | 127 | if (index == CPUACCT_STAT_NSTATS) { |
130 | int i = 0; | 128 | int i = 0; |
131 | 129 | ||
132 | data = 0; | 130 | data = 0; |
133 | for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) | 131 | for (i = 0; i < CPUACCT_STAT_NSTATS; i++) |
134 | data += cpuusage->usages[i]; | 132 | data += cpuusage->usages[i]; |
135 | } else { | 133 | } else { |
136 | data = cpuusage->usages[index]; | 134 | data = cpuusage->usages[index]; |
@@ -155,7 +153,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) | |||
155 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | 153 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); |
156 | #endif | 154 | #endif |
157 | 155 | ||
158 | for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) | 156 | for (i = 0; i < CPUACCT_STAT_NSTATS; i++) |
159 | cpuusage->usages[i] = val; | 157 | cpuusage->usages[i] = val; |
160 | 158 | ||
161 | #ifndef CONFIG_64BIT | 159 | #ifndef CONFIG_64BIT |
@@ -165,7 +163,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) | |||
165 | 163 | ||
166 | /* return total cpu usage (in nanoseconds) of a group */ | 164 | /* return total cpu usage (in nanoseconds) of a group */ |
167 | static u64 __cpuusage_read(struct cgroup_subsys_state *css, | 165 | static u64 __cpuusage_read(struct cgroup_subsys_state *css, |
168 | enum cpuacct_usage_index index) | 166 | enum cpuacct_stat_index index) |
169 | { | 167 | { |
170 | struct cpuacct *ca = css_ca(css); | 168 | struct cpuacct *ca = css_ca(css); |
171 | u64 totalcpuusage = 0; | 169 | u64 totalcpuusage = 0; |
@@ -180,18 +178,18 @@ static u64 __cpuusage_read(struct cgroup_subsys_state *css, | |||
180 | static u64 cpuusage_user_read(struct cgroup_subsys_state *css, | 178 | static u64 cpuusage_user_read(struct cgroup_subsys_state *css, |
181 | struct cftype *cft) | 179 | struct cftype *cft) |
182 | { | 180 | { |
183 | return __cpuusage_read(css, CPUACCT_USAGE_USER); | 181 | return __cpuusage_read(css, CPUACCT_STAT_USER); |
184 | } | 182 | } |
185 | 183 | ||
186 | static u64 cpuusage_sys_read(struct cgroup_subsys_state *css, | 184 | static u64 cpuusage_sys_read(struct cgroup_subsys_state *css, |
187 | struct cftype *cft) | 185 | struct cftype *cft) |
188 | { | 186 | { |
189 | return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM); | 187 | return __cpuusage_read(css, CPUACCT_STAT_SYSTEM); |
190 | } | 188 | } |
191 | 189 | ||
192 | static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) | 190 | static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) |
193 | { | 191 | { |
194 | return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE); | 192 | return __cpuusage_read(css, CPUACCT_STAT_NSTATS); |
195 | } | 193 | } |
196 | 194 | ||
197 | static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, | 195 | static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, |
@@ -213,7 +211,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, | |||
213 | } | 211 | } |
214 | 212 | ||
215 | static int __cpuacct_percpu_seq_show(struct seq_file *m, | 213 | static int __cpuacct_percpu_seq_show(struct seq_file *m, |
216 | enum cpuacct_usage_index index) | 214 | enum cpuacct_stat_index index) |
217 | { | 215 | { |
218 | struct cpuacct *ca = css_ca(seq_css(m)); | 216 | struct cpuacct *ca = css_ca(seq_css(m)); |
219 | u64 percpu; | 217 | u64 percpu; |
@@ -229,48 +227,78 @@ static int __cpuacct_percpu_seq_show(struct seq_file *m, | |||
229 | 227 | ||
230 | static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V) | 228 | static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V) |
231 | { | 229 | { |
232 | return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER); | 230 | return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_USER); |
233 | } | 231 | } |
234 | 232 | ||
235 | static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V) | 233 | static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V) |
236 | { | 234 | { |
237 | return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM); | 235 | return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_SYSTEM); |
238 | } | 236 | } |
239 | 237 | ||
240 | static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) | 238 | static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) |
241 | { | 239 | { |
242 | return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE); | 240 | return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS); |
243 | } | 241 | } |
244 | 242 | ||
245 | static const char * const cpuacct_stat_desc[] = { | 243 | static int cpuacct_all_seq_show(struct seq_file *m, void *V) |
246 | [CPUACCT_STAT_USER] = "user", | 244 | { |
247 | [CPUACCT_STAT_SYSTEM] = "system", | 245 | struct cpuacct *ca = css_ca(seq_css(m)); |
248 | }; | 246 | int index; |
247 | int cpu; | ||
248 | |||
249 | seq_puts(m, "cpu"); | ||
250 | for (index = 0; index < CPUACCT_STAT_NSTATS; index++) | ||
251 | seq_printf(m, " %s", cpuacct_stat_desc[index]); | ||
252 | seq_puts(m, "\n"); | ||
253 | |||
254 | for_each_possible_cpu(cpu) { | ||
255 | struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
256 | |||
257 | seq_printf(m, "%d", cpu); | ||
258 | |||
259 | for (index = 0; index < CPUACCT_STAT_NSTATS; index++) { | ||
260 | #ifndef CONFIG_64BIT | ||
261 | /* | ||
262 | * Take rq->lock to make 64-bit read safe on 32-bit | ||
263 | * platforms. | ||
264 | */ | ||
265 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | ||
266 | #endif | ||
267 | |||
268 | seq_printf(m, " %llu", cpuusage->usages[index]); | ||
269 | |||
270 | #ifndef CONFIG_64BIT | ||
271 | raw_spin_unlock_irq(&cpu_rq(cpu)->lock); | ||
272 | #endif | ||
273 | } | ||
274 | seq_puts(m, "\n"); | ||
275 | } | ||
276 | return 0; | ||
277 | } | ||
249 | 278 | ||
250 | static int cpuacct_stats_show(struct seq_file *sf, void *v) | 279 | static int cpuacct_stats_show(struct seq_file *sf, void *v) |
251 | { | 280 | { |
252 | struct cpuacct *ca = css_ca(seq_css(sf)); | 281 | struct cpuacct *ca = css_ca(seq_css(sf)); |
282 | s64 val[CPUACCT_STAT_NSTATS]; | ||
253 | int cpu; | 283 | int cpu; |
254 | s64 val = 0; | 284 | int stat; |
255 | 285 | ||
286 | memset(val, 0, sizeof(val)); | ||
256 | for_each_possible_cpu(cpu) { | 287 | for_each_possible_cpu(cpu) { |
257 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | 288 | u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; |
258 | val += kcpustat->cpustat[CPUTIME_USER]; | ||
259 | val += kcpustat->cpustat[CPUTIME_NICE]; | ||
260 | } | ||
261 | val = cputime64_to_clock_t(val); | ||
262 | seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); | ||
263 | 289 | ||
264 | val = 0; | 290 | val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER]; |
265 | for_each_possible_cpu(cpu) { | 291 | val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE]; |
266 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | 292 | val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM]; |
267 | val += kcpustat->cpustat[CPUTIME_SYSTEM]; | 293 | val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ]; |
268 | val += kcpustat->cpustat[CPUTIME_IRQ]; | 294 | val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ]; |
269 | val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; | ||
270 | } | 295 | } |
271 | 296 | ||
272 | val = cputime64_to_clock_t(val); | 297 | for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { |
273 | seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); | 298 | seq_printf(sf, "%s %lld\n", |
299 | cpuacct_stat_desc[stat], | ||
300 | cputime64_to_clock_t(val[stat])); | ||
301 | } | ||
274 | 302 | ||
275 | return 0; | 303 | return 0; |
276 | } | 304 | } |
@@ -302,6 +330,10 @@ static struct cftype files[] = { | |||
302 | .seq_show = cpuacct_percpu_sys_seq_show, | 330 | .seq_show = cpuacct_percpu_sys_seq_show, |
303 | }, | 331 | }, |
304 | { | 332 | { |
333 | .name = "usage_all", | ||
334 | .seq_show = cpuacct_all_seq_show, | ||
335 | }, | ||
336 | { | ||
305 | .name = "stat", | 337 | .name = "stat", |
306 | .seq_show = cpuacct_stats_show, | 338 | .seq_show = cpuacct_stats_show, |
307 | }, | 339 | }, |
@@ -316,11 +348,11 @@ static struct cftype files[] = { | |||
316 | void cpuacct_charge(struct task_struct *tsk, u64 cputime) | 348 | void cpuacct_charge(struct task_struct *tsk, u64 cputime) |
317 | { | 349 | { |
318 | struct cpuacct *ca; | 350 | struct cpuacct *ca; |
319 | int index = CPUACCT_USAGE_SYSTEM; | 351 | int index = CPUACCT_STAT_SYSTEM; |
320 | struct pt_regs *regs = task_pt_regs(tsk); | 352 | struct pt_regs *regs = task_pt_regs(tsk); |
321 | 353 | ||
322 | if (regs && user_mode(regs)) | 354 | if (regs && user_mode(regs)) |
323 | index = CPUACCT_USAGE_USER; | 355 | index = CPUACCT_STAT_USER; |
324 | 356 | ||
325 | rcu_read_lock(); | 357 | rcu_read_lock(); |
326 | 358 | ||
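
Based on cpuacct_all_seq_show() above, the new cpuacct.usage_all file prints a "cpu user system" header followed by one row per possible CPU, with both values in nanoseconds. A quick reader sketch; the cgroup v1 mount point below is an assumption, adjust it to your system:

```c
#include <stdio.h>

int main(void)
{
	/* Assumed cgroup v1 mount point for the cpuacct controller. */
	const char *path = "/sys/fs/cgroup/cpuacct/cpuacct.usage_all";
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	/* Expected layout: "cpu user system" then "<cpu> <user_ns> <system_ns>" per CPU. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}
```
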
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 75f98c5498d5..3d60e5d76fdb 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -257,7 +257,7 @@ void account_idle_time(cputime_t cputime) | |||
257 | cpustat[CPUTIME_IDLE] += (__force u64) cputime; | 257 | cpustat[CPUTIME_IDLE] += (__force u64) cputime; |
258 | } | 258 | } |
259 | 259 | ||
260 | static __always_inline bool steal_account_process_tick(void) | 260 | static __always_inline unsigned long steal_account_process_tick(unsigned long max_jiffies) |
261 | { | 261 | { |
262 | #ifdef CONFIG_PARAVIRT | 262 | #ifdef CONFIG_PARAVIRT |
263 | if (static_key_false(¶virt_steal_enabled)) { | 263 | if (static_key_false(¶virt_steal_enabled)) { |
@@ -272,14 +272,14 @@ static __always_inline bool steal_account_process_tick(void) | |||
272 | * time in jiffies. Lets cast the result to jiffies | 272 | * time in jiffies. Lets cast the result to jiffies |
273 | * granularity and account the rest on the next rounds. | 273 | * granularity and account the rest on the next rounds. |
274 | */ | 274 | */ |
275 | steal_jiffies = nsecs_to_jiffies(steal); | 275 | steal_jiffies = min(nsecs_to_jiffies(steal), max_jiffies); |
276 | this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); | 276 | this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); |
277 | 277 | ||
278 | account_steal_time(jiffies_to_cputime(steal_jiffies)); | 278 | account_steal_time(jiffies_to_cputime(steal_jiffies)); |
279 | return steal_jiffies; | 279 | return steal_jiffies; |
280 | } | 280 | } |
281 | #endif | 281 | #endif |
282 | return false; | 282 | return 0; |
283 | } | 283 | } |
284 | 284 | ||
285 | /* | 285 | /* |
@@ -346,7 +346,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
346 | u64 cputime = (__force u64) cputime_one_jiffy; | 346 | u64 cputime = (__force u64) cputime_one_jiffy; |
347 | u64 *cpustat = kcpustat_this_cpu->cpustat; | 347 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
348 | 348 | ||
349 | if (steal_account_process_tick()) | 349 | if (steal_account_process_tick(ULONG_MAX)) |
350 | return; | 350 | return; |
351 | 351 | ||
352 | cputime *= ticks; | 352 | cputime *= ticks; |
@@ -477,7 +477,7 @@ void account_process_tick(struct task_struct *p, int user_tick) | |||
477 | return; | 477 | return; |
478 | } | 478 | } |
479 | 479 | ||
480 | if (steal_account_process_tick()) | 480 | if (steal_account_process_tick(ULONG_MAX)) |
481 | return; | 481 | return; |
482 | 482 | ||
483 | if (user_tick) | 483 | if (user_tick) |
@@ -681,12 +681,14 @@ static cputime_t vtime_delta(struct task_struct *tsk) | |||
681 | static cputime_t get_vtime_delta(struct task_struct *tsk) | 681 | static cputime_t get_vtime_delta(struct task_struct *tsk) |
682 | { | 682 | { |
683 | unsigned long now = READ_ONCE(jiffies); | 683 | unsigned long now = READ_ONCE(jiffies); |
684 | unsigned long delta = now - tsk->vtime_snap; | 684 | unsigned long delta_jiffies, steal_jiffies; |
685 | 685 | ||
686 | delta_jiffies = now - tsk->vtime_snap; | ||
687 | steal_jiffies = steal_account_process_tick(delta_jiffies); | ||
686 | WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); | 688 | WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); |
687 | tsk->vtime_snap = now; | 689 | tsk->vtime_snap = now; |
688 | 690 | ||
689 | return jiffies_to_cputime(delta); | 691 | return jiffies_to_cputime(delta_jiffies - steal_jiffies); |
690 | } | 692 | } |
691 | 693 | ||
692 | static void __vtime_account_system(struct task_struct *tsk) | 694 | static void __vtime_account_system(struct task_struct *tsk) |
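
The clamp added to steal_account_process_tick() matters mostly for the new full-dynticks caller: get_vtime_delta() now subtracts the accounted steal from the elapsed jiffies, so steal must never exceed the window, while the tick-based callers keep the old behaviour by passing ULONG_MAX. A toy calculation in plain C, illustration only:

```c
#include <stdio.h>

/* Toy version of the clamped steal accounting used by get_vtime_delta(). */
static unsigned long account_window(unsigned long delta_jiffies,
				    unsigned long steal_jiffies_raw)
{
	unsigned long steal = steal_jiffies_raw;

	if (steal > delta_jiffies)	/* min(nsecs_to_jiffies(steal), max_jiffies) */
		steal = delta_jiffies;

	return delta_jiffies - steal;	/* jiffies left to charge to the task itself */
}

int main(void)
{
	/* 10-jiffy window, 3 jiffies stolen: 7 jiffies accounted to the task. */
	printf("%lu\n", account_window(10, 3));
	/* Stolen time exceeds the window: the clamp keeps the result at 0 instead of wrapping. */
	printf("%lu\n", account_window(10, 12));
	return 0;
}
```
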
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 0368c393a336..2a0a9995256d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -879,9 +879,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
879 | 879 | ||
880 | nr_switches = p->nvcsw + p->nivcsw; | 880 | nr_switches = p->nvcsw + p->nivcsw; |
881 | 881 | ||
882 | #ifdef CONFIG_SCHEDSTATS | ||
883 | P(se.nr_migrations); | 882 | P(se.nr_migrations); |
884 | 883 | ||
884 | #ifdef CONFIG_SCHEDSTATS | ||
885 | if (schedstat_enabled()) { | 885 | if (schedstat_enabled()) { |
886 | u64 avg_atom, avg_per_cpu; | 886 | u64 avg_atom, avg_per_cpu; |
887 | 887 | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c8c5d2d48424..4088eedea763 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -690,6 +690,11 @@ void init_entity_runnable_average(struct sched_entity *se) | |||
690 | /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ | 690 | /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ |
691 | } | 691 | } |
692 | 692 | ||
693 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); | ||
694 | static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); | ||
695 | static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); | ||
696 | static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); | ||
697 | |||
693 | /* | 698 | /* |
694 | * With new tasks being created, their initial util_avgs are extrapolated | 699 | * With new tasks being created, their initial util_avgs are extrapolated |
695 | * based on the cfs_rq's current util_avg: | 700 | * based on the cfs_rq's current util_avg: |
@@ -720,6 +725,8 @@ void post_init_entity_util_avg(struct sched_entity *se) | |||
720 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 725 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
721 | struct sched_avg *sa = &se->avg; | 726 | struct sched_avg *sa = &se->avg; |
722 | long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; | 727 | long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; |
728 | u64 now = cfs_rq_clock_task(cfs_rq); | ||
729 | int tg_update; | ||
723 | 730 | ||
724 | if (cap > 0) { | 731 | if (cap > 0) { |
725 | if (cfs_rq->avg.util_avg != 0) { | 732 | if (cfs_rq->avg.util_avg != 0) { |
@@ -733,16 +740,42 @@ void post_init_entity_util_avg(struct sched_entity *se) | |||
733 | } | 740 | } |
734 | sa->util_sum = sa->util_avg * LOAD_AVG_MAX; | 741 | sa->util_sum = sa->util_avg * LOAD_AVG_MAX; |
735 | } | 742 | } |
743 | |||
744 | if (entity_is_task(se)) { | ||
745 | struct task_struct *p = task_of(se); | ||
746 | if (p->sched_class != &fair_sched_class) { | ||
747 | /* | ||
748 | * For !fair tasks do: | ||
749 | * | ||
750 | update_cfs_rq_load_avg(now, cfs_rq, false); | ||
751 | attach_entity_load_avg(cfs_rq, se); | ||
752 | switched_from_fair(rq, p); | ||
753 | * | ||
754 | * such that the next switched_to_fair() has the | ||
755 | * expected state. | ||
756 | */ | ||
757 | se->avg.last_update_time = now; | ||
758 | return; | ||
759 | } | ||
760 | } | ||
761 | |||
762 | tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); | ||
763 | attach_entity_load_avg(cfs_rq, se); | ||
764 | if (tg_update) | ||
765 | update_tg_load_avg(cfs_rq, false); | ||
736 | } | 766 | } |
737 | 767 | ||
738 | #else | 768 | #else /* !CONFIG_SMP */ |
739 | void init_entity_runnable_average(struct sched_entity *se) | 769 | void init_entity_runnable_average(struct sched_entity *se) |
740 | { | 770 | { |
741 | } | 771 | } |
742 | void post_init_entity_util_avg(struct sched_entity *se) | 772 | void post_init_entity_util_avg(struct sched_entity *se) |
743 | { | 773 | { |
744 | } | 774 | } |
745 | #endif | 775 | static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) |
776 | { | ||
777 | } | ||
778 | #endif /* CONFIG_SMP */ | ||
746 | 779 | ||
747 | /* | 780 | /* |
748 | * Update the current task's runtime statistics. | 781 | * Update the current task's runtime statistics. |
@@ -1303,6 +1336,8 @@ static void task_numa_assign(struct task_numa_env *env, | |||
1303 | { | 1336 | { |
1304 | if (env->best_task) | 1337 | if (env->best_task) |
1305 | put_task_struct(env->best_task); | 1338 | put_task_struct(env->best_task); |
1339 | if (p) | ||
1340 | get_task_struct(p); | ||
1306 | 1341 | ||
1307 | env->best_task = p; | 1342 | env->best_task = p; |
1308 | env->best_imp = imp; | 1343 | env->best_imp = imp; |
@@ -1370,31 +1405,11 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1370 | long imp = env->p->numa_group ? groupimp : taskimp; | 1405 | long imp = env->p->numa_group ? groupimp : taskimp; |
1371 | long moveimp = imp; | 1406 | long moveimp = imp; |
1372 | int dist = env->dist; | 1407 | int dist = env->dist; |
1373 | bool assigned = false; | ||
1374 | 1408 | ||
1375 | rcu_read_lock(); | 1409 | rcu_read_lock(); |
1376 | 1410 | cur = task_rcu_dereference(&dst_rq->curr); | |
1377 | raw_spin_lock_irq(&dst_rq->lock); | 1411 | if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) |
1378 | cur = dst_rq->curr; | ||
1379 | /* | ||
1380 | * No need to move the exiting task or idle task. | ||
1381 | */ | ||
1382 | if ((cur->flags & PF_EXITING) || is_idle_task(cur)) | ||
1383 | cur = NULL; | 1412 | cur = NULL; |
1384 | else { | ||
1385 | /* | ||
1386 | * The task_struct must be protected here to protect the | ||
1387 | * p->numa_faults access in the task_weight since the | ||
1388 | * numa_faults could already be freed in the following path: | ||
1389 | * finish_task_switch() | ||
1390 | * --> put_task_struct() | ||
1391 | * --> __put_task_struct() | ||
1392 | * --> task_numa_free() | ||
1393 | */ | ||
1394 | get_task_struct(cur); | ||
1395 | } | ||
1396 | |||
1397 | raw_spin_unlock_irq(&dst_rq->lock); | ||
1398 | 1413 | ||
1399 | /* | 1414 | /* |
1400 | * Because we have preemption enabled we can get migrated around and | 1415 | * Because we have preemption enabled we can get migrated around and |
@@ -1477,7 +1492,6 @@ balance: | |||
1477 | */ | 1492 | */ |
1478 | if (!load_too_imbalanced(src_load, dst_load, env)) { | 1493 | if (!load_too_imbalanced(src_load, dst_load, env)) { |
1479 | imp = moveimp - 1; | 1494 | imp = moveimp - 1; |
1480 | put_task_struct(cur); | ||
1481 | cur = NULL; | 1495 | cur = NULL; |
1482 | goto assign; | 1496 | goto assign; |
1483 | } | 1497 | } |
@@ -1503,16 +1517,9 @@ balance: | |||
1503 | env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); | 1517 | env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); |
1504 | 1518 | ||
1505 | assign: | 1519 | assign: |
1506 | assigned = true; | ||
1507 | task_numa_assign(env, cur, imp); | 1520 | task_numa_assign(env, cur, imp); |
1508 | unlock: | 1521 | unlock: |
1509 | rcu_read_unlock(); | 1522 | rcu_read_unlock(); |
1510 | /* | ||
1511 | * The dst_rq->curr isn't assigned. The protection for task_struct is | ||
1512 | * finished. | ||
1513 | */ | ||
1514 | if (cur && !assigned) | ||
1515 | put_task_struct(cur); | ||
1516 | } | 1523 | } |
1517 | 1524 | ||
1518 | static void task_numa_find_cpu(struct task_numa_env *env, | 1525 | static void task_numa_find_cpu(struct task_numa_env *env, |
@@ -2866,8 +2873,6 @@ void set_task_rq_fair(struct sched_entity *se, | |||
2866 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} | 2873 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} |
2867 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 2874 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
2868 | 2875 | ||
2869 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); | ||
2870 | |||
2871 | static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | 2876 | static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) |
2872 | { | 2877 | { |
2873 | struct rq *rq = rq_of(cfs_rq); | 2878 | struct rq *rq = rq_of(cfs_rq); |
@@ -2914,7 +2919,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | |||
2914 | WRITE_ONCE(*ptr, res); \ | 2919 | WRITE_ONCE(*ptr, res); \ |
2915 | } while (0) | 2920 | } while (0) |
2916 | 2921 | ||
2917 | /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ | 2922 | /** |
2923 | * update_cfs_rq_load_avg - update the cfs_rq's load/util averages | ||
2924 | * @now: current time, as per cfs_rq_clock_task() | ||
2925 | * @cfs_rq: cfs_rq to update | ||
2926 | * @update_freq: should we call cfs_rq_util_change() or will the call do so | ||
2927 | * | ||
2928 | * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) | ||
2929 | * avg. The immediate corollary is that all (fair) tasks must be attached, see | ||
2930 | * post_init_entity_util_avg(). | ||
2931 | * | ||
2932 | * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. | ||
2933 | * | ||
2934 | * Returns true if the load decayed or we removed utilization. It is expected | ||
2935 | * that one calls update_tg_load_avg() on this condition, but after you've | ||
2936 | * modified the cfs_rq avg (attach/detach), such that we propagate the new | ||
2937 | * avg up. | ||
2938 | */ | ||
2918 | static inline int | 2939 | static inline int |
2919 | update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) | 2940 | update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) |
2920 | { | 2941 | { |
@@ -2969,6 +2990,14 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) | |||
2969 | update_tg_load_avg(cfs_rq, 0); | 2990 | update_tg_load_avg(cfs_rq, 0); |
2970 | } | 2991 | } |
2971 | 2992 | ||
2993 | /** | ||
2994 | * attach_entity_load_avg - attach this entity to its cfs_rq load avg | ||
2995 | * @cfs_rq: cfs_rq to attach to | ||
2996 | * @se: sched_entity to attach | ||
2997 | * | ||
2998 | * Must call update_cfs_rq_load_avg() before this, since we rely on | ||
2999 | * cfs_rq->avg.last_update_time being current. | ||
3000 | */ | ||
2972 | static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | 3001 | static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) |
2973 | { | 3002 | { |
2974 | if (!sched_feat(ATTACH_AGE_LOAD)) | 3003 | if (!sched_feat(ATTACH_AGE_LOAD)) |
@@ -2977,6 +3006,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s | |||
2977 | /* | 3006 | /* |
2978 | * If we got migrated (either between CPUs or between cgroups) we'll | 3007 | * If we got migrated (either between CPUs or between cgroups) we'll |
2979 | * have aged the average right before clearing @last_update_time. | 3008 | * have aged the average right before clearing @last_update_time. |
3009 | * | ||
3010 | * Or we're fresh through post_init_entity_util_avg(). | ||
2980 | */ | 3011 | */ |
2981 | if (se->avg.last_update_time) { | 3012 | if (se->avg.last_update_time) { |
2982 | __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), | 3013 | __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), |
@@ -2998,6 +3029,14 @@ skip_aging: | |||
2998 | cfs_rq_util_change(cfs_rq); | 3029 | cfs_rq_util_change(cfs_rq); |
2999 | } | 3030 | } |
3000 | 3031 | ||
3032 | /** | ||
3033 | * detach_entity_load_avg - detach this entity from its cfs_rq load avg | ||
3034 | * @cfs_rq: cfs_rq to detach from | ||
3035 | * @se: sched_entity to detach | ||
3036 | * | ||
3037 | * Must call update_cfs_rq_load_avg() before this, since we rely on | ||
3038 | * cfs_rq->avg.last_update_time being current. | ||
3039 | */ | ||
3001 | static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | 3040 | static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) |
3002 | { | 3041 | { |
3003 | __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), | 3042 | __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), |
@@ -3082,11 +3121,14 @@ void remove_entity_load_avg(struct sched_entity *se) | |||
3082 | u64 last_update_time; | 3121 | u64 last_update_time; |
3083 | 3122 | ||
3084 | /* | 3123 | /* |
3085 | * Newly created task or never used group entity should not be removed | 3124 | * tasks cannot exit without having gone through wake_up_new_task() -> |
3086 | * from its (source) cfs_rq | 3125 | * post_init_entity_util_avg() which will have added things to the |
3126 | * cfs_rq, so we can remove unconditionally. | ||
3127 | * | ||
3128 | * Similarly for groups, they will have passed through | ||
3129 | * post_init_entity_util_avg() before unregister_sched_fair_group() | ||
3130 | * calls this. | ||
3087 | */ | 3131 | */ |
3088 | if (se->avg.last_update_time == 0) | ||
3089 | return; | ||
3090 | 3132 | ||
3091 | last_update_time = cfs_rq_last_update_time(cfs_rq); | 3133 | last_update_time = cfs_rq_last_update_time(cfs_rq); |
3092 | 3134 | ||
@@ -3109,6 +3151,12 @@ static int idle_balance(struct rq *this_rq); | |||
3109 | 3151 | ||
3110 | #else /* CONFIG_SMP */ | 3152 | #else /* CONFIG_SMP */ |
3111 | 3153 | ||
3154 | static inline int | ||
3155 | update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) | ||
3156 | { | ||
3157 | return 0; | ||
3158 | } | ||
3159 | |||
3112 | static inline void update_load_avg(struct sched_entity *se, int not_used) | 3160 | static inline void update_load_avg(struct sched_entity *se, int not_used) |
3113 | { | 3161 | { |
3114 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 3162 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
@@ -3698,7 +3746,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | |||
3698 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) | 3746 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) |
3699 | { | 3747 | { |
3700 | if (unlikely(cfs_rq->throttle_count)) | 3748 | if (unlikely(cfs_rq->throttle_count)) |
3701 | return cfs_rq->throttled_clock_task; | 3749 | return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; |
3702 | 3750 | ||
3703 | return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; | 3751 | return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; |
3704 | } | 3752 | } |
@@ -3836,13 +3884,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) | |||
3836 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; | 3884 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; |
3837 | 3885 | ||
3838 | cfs_rq->throttle_count--; | 3886 | cfs_rq->throttle_count--; |
3839 | #ifdef CONFIG_SMP | ||
3840 | if (!cfs_rq->throttle_count) { | 3887 | if (!cfs_rq->throttle_count) { |
3841 | /* adjust cfs_rq_clock_task() */ | 3888 | /* adjust cfs_rq_clock_task() */ |
3842 | cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - | 3889 | cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - |
3843 | cfs_rq->throttled_clock_task; | 3890 | cfs_rq->throttled_clock_task; |
3844 | } | 3891 | } |
3845 | #endif | ||
3846 | 3892 | ||
3847 | return 0; | 3893 | return 0; |
3848 | } | 3894 | } |
@@ -4195,26 +4241,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) | |||
4195 | if (!cfs_bandwidth_used()) | 4241 | if (!cfs_bandwidth_used()) |
4196 | return; | 4242 | return; |
4197 | 4243 | ||
4198 | /* Synchronize hierarchical throttle counter: */ | ||
4199 | if (unlikely(!cfs_rq->throttle_uptodate)) { | ||
4200 | struct rq *rq = rq_of(cfs_rq); | ||
4201 | struct cfs_rq *pcfs_rq; | ||
4202 | struct task_group *tg; | ||
4203 | |||
4204 | cfs_rq->throttle_uptodate = 1; | ||
4205 | |||
4206 | /* Get closest up-to-date node, because leaves go first: */ | ||
4207 | for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) { | ||
4208 | pcfs_rq = tg->cfs_rq[cpu_of(rq)]; | ||
4209 | if (pcfs_rq->throttle_uptodate) | ||
4210 | break; | ||
4211 | } | ||
4212 | if (tg) { | ||
4213 | cfs_rq->throttle_count = pcfs_rq->throttle_count; | ||
4214 | cfs_rq->throttled_clock_task = rq_clock_task(rq); | ||
4215 | } | ||
4216 | } | ||
4217 | |||
4218 | /* an active group must be handled by the update_curr()->put() path */ | 4244 | /* an active group must be handled by the update_curr()->put() path */ |
4219 | if (!cfs_rq->runtime_enabled || cfs_rq->curr) | 4245 | if (!cfs_rq->runtime_enabled || cfs_rq->curr) |
4220 | return; | 4246 | return; |
@@ -4229,6 +4255,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) | |||
4229 | throttle_cfs_rq(cfs_rq); | 4255 | throttle_cfs_rq(cfs_rq); |
4230 | } | 4256 | } |
4231 | 4257 | ||
4258 | static void sync_throttle(struct task_group *tg, int cpu) | ||
4259 | { | ||
4260 | struct cfs_rq *pcfs_rq, *cfs_rq; | ||
4261 | |||
4262 | if (!cfs_bandwidth_used()) | ||
4263 | return; | ||
4264 | |||
4265 | if (!tg->parent) | ||
4266 | return; | ||
4267 | |||
4268 | cfs_rq = tg->cfs_rq[cpu]; | ||
4269 | pcfs_rq = tg->parent->cfs_rq[cpu]; | ||
4270 | |||
4271 | cfs_rq->throttle_count = pcfs_rq->throttle_count; | ||
4272 | pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); | ||
4273 | } | ||
4274 | |||
4232 | /* conditionally throttle active cfs_rq's from put_prev_entity() */ | 4275 | /* conditionally throttle active cfs_rq's from put_prev_entity() */ |
4233 | static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 4276 | static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) |
4234 | { | 4277 | { |
@@ -4368,6 +4411,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) | |||
4368 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} | 4411 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} |
4369 | static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } | 4412 | static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } |
4370 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} | 4413 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} |
4414 | static inline void sync_throttle(struct task_group *tg, int cpu) {} | ||
4371 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | 4415 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} |
4372 | 4416 | ||
4373 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) | 4417 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) |
@@ -4476,7 +4520,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
4476 | * | 4520 | * |
4477 | * note: in the case of encountering a throttled cfs_rq we will | 4521 | * note: in the case of encountering a throttled cfs_rq we will |
4478 | * post the final h_nr_running increment below. | 4522 | * post the final h_nr_running increment below. |
4479 | */ | 4523 | */ |
4480 | if (cfs_rq_throttled(cfs_rq)) | 4524 | if (cfs_rq_throttled(cfs_rq)) |
4481 | break; | 4525 | break; |
4482 | cfs_rq->h_nr_running++; | 4526 | cfs_rq->h_nr_running++; |
@@ -8317,31 +8361,17 @@ static void task_fork_fair(struct task_struct *p) | |||
8317 | { | 8361 | { |
8318 | struct cfs_rq *cfs_rq; | 8362 | struct cfs_rq *cfs_rq; |
8319 | struct sched_entity *se = &p->se, *curr; | 8363 | struct sched_entity *se = &p->se, *curr; |
8320 | int this_cpu = smp_processor_id(); | ||
8321 | struct rq *rq = this_rq(); | 8364 | struct rq *rq = this_rq(); |
8322 | unsigned long flags; | ||
8323 | |||
8324 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8325 | 8365 | ||
8366 | raw_spin_lock(&rq->lock); | ||
8326 | update_rq_clock(rq); | 8367 | update_rq_clock(rq); |
8327 | 8368 | ||
8328 | cfs_rq = task_cfs_rq(current); | 8369 | cfs_rq = task_cfs_rq(current); |
8329 | curr = cfs_rq->curr; | 8370 | curr = cfs_rq->curr; |
8330 | 8371 | if (curr) { | |
8331 | /* | 8372 | update_curr(cfs_rq); |
8332 | * Not only the cpu but also the task_group of the parent might have | ||
8333 | * been changed after parent->se.parent,cfs_rq were copied to | ||
8334 | * child->se.parent,cfs_rq. So call __set_task_cpu() to make those | ||
8335 | * of child point to valid ones. | ||
8336 | */ | ||
8337 | rcu_read_lock(); | ||
8338 | __set_task_cpu(p, this_cpu); | ||
8339 | rcu_read_unlock(); | ||
8340 | |||
8341 | update_curr(cfs_rq); | ||
8342 | |||
8343 | if (curr) | ||
8344 | se->vruntime = curr->vruntime; | 8373 | se->vruntime = curr->vruntime; |
8374 | } | ||
8345 | place_entity(cfs_rq, se, 1); | 8375 | place_entity(cfs_rq, se, 1); |
8346 | 8376 | ||
8347 | if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { | 8377 | if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { |
@@ -8354,8 +8384,7 @@ static void task_fork_fair(struct task_struct *p) | |||
8354 | } | 8384 | } |
8355 | 8385 | ||
8356 | se->vruntime -= cfs_rq->min_vruntime; | 8386 | se->vruntime -= cfs_rq->min_vruntime; |
8357 | 8387 | raw_spin_unlock(&rq->lock); | |
8358 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8359 | } | 8388 | } |
8360 | 8389 | ||
8361 | /* | 8390 | /* |
@@ -8411,6 +8440,8 @@ static void detach_task_cfs_rq(struct task_struct *p) | |||
8411 | { | 8440 | { |
8412 | struct sched_entity *se = &p->se; | 8441 | struct sched_entity *se = &p->se; |
8413 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 8442 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
8443 | u64 now = cfs_rq_clock_task(cfs_rq); | ||
8444 | int tg_update; | ||
8414 | 8445 | ||
8415 | if (!vruntime_normalized(p)) { | 8446 | if (!vruntime_normalized(p)) { |
8416 | /* | 8447 | /* |
@@ -8422,13 +8453,18 @@ static void detach_task_cfs_rq(struct task_struct *p) | |||
8422 | } | 8453 | } |
8423 | 8454 | ||
8424 | /* Catch up with the cfs_rq and remove our load when we leave */ | 8455 | /* Catch up with the cfs_rq and remove our load when we leave */ |
8456 | tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); | ||
8425 | detach_entity_load_avg(cfs_rq, se); | 8457 | detach_entity_load_avg(cfs_rq, se); |
8458 | if (tg_update) | ||
8459 | update_tg_load_avg(cfs_rq, false); | ||
8426 | } | 8460 | } |
8427 | 8461 | ||
8428 | static void attach_task_cfs_rq(struct task_struct *p) | 8462 | static void attach_task_cfs_rq(struct task_struct *p) |
8429 | { | 8463 | { |
8430 | struct sched_entity *se = &p->se; | 8464 | struct sched_entity *se = &p->se; |
8431 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 8465 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
8466 | u64 now = cfs_rq_clock_task(cfs_rq); | ||
8467 | int tg_update; | ||
8432 | 8468 | ||
8433 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8469 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8434 | /* | 8470 | /* |
@@ -8439,7 +8475,10 @@ static void attach_task_cfs_rq(struct task_struct *p) | |||
8439 | #endif | 8475 | #endif |
8440 | 8476 | ||
8441 | /* Synchronize task with its cfs_rq */ | 8477 | /* Synchronize task with its cfs_rq */ |
8478 | tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); | ||
8442 | attach_entity_load_avg(cfs_rq, se); | 8479 | attach_entity_load_avg(cfs_rq, se); |
8480 | if (tg_update) | ||
8481 | update_tg_load_avg(cfs_rq, false); | ||
8443 | 8482 | ||
8444 | if (!vruntime_normalized(p)) | 8483 | if (!vruntime_normalized(p)) |
8445 | se->vruntime += cfs_rq->min_vruntime; | 8484 | se->vruntime += cfs_rq->min_vruntime; |
@@ -8499,6 +8538,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) | |||
8499 | } | 8538 | } |
8500 | 8539 | ||
8501 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8540 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8541 | static void task_set_group_fair(struct task_struct *p) | ||
8542 | { | ||
8543 | struct sched_entity *se = &p->se; | ||
8544 | |||
8545 | set_task_rq(p, task_cpu(p)); | ||
8546 | se->depth = se->parent ? se->parent->depth + 1 : 0; | ||
8547 | } | ||
8548 | |||
8502 | static void task_move_group_fair(struct task_struct *p) | 8549 | static void task_move_group_fair(struct task_struct *p) |
8503 | { | 8550 | { |
8504 | detach_task_cfs_rq(p); | 8551 | detach_task_cfs_rq(p); |
@@ -8511,6 +8558,19 @@ static void task_move_group_fair(struct task_struct *p) | |||
8511 | attach_task_cfs_rq(p); | 8558 | attach_task_cfs_rq(p); |
8512 | } | 8559 | } |
8513 | 8560 | ||
8561 | static void task_change_group_fair(struct task_struct *p, int type) | ||
8562 | { | ||
8563 | switch (type) { | ||
8564 | case TASK_SET_GROUP: | ||
8565 | task_set_group_fair(p); | ||
8566 | break; | ||
8567 | |||
8568 | case TASK_MOVE_GROUP: | ||
8569 | task_move_group_fair(p); | ||
8570 | break; | ||
8571 | } | ||
8572 | } | ||
8573 | |||
8514 | void free_fair_sched_group(struct task_group *tg) | 8574 | void free_fair_sched_group(struct task_group *tg) |
8515 | { | 8575 | { |
8516 | int i; | 8576 | int i; |
@@ -8562,10 +8622,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8562 | init_cfs_rq(cfs_rq); | 8622 | init_cfs_rq(cfs_rq); |
8563 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); | 8623 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); |
8564 | init_entity_runnable_average(se); | 8624 | init_entity_runnable_average(se); |
8565 | |||
8566 | raw_spin_lock_irq(&rq->lock); | ||
8567 | post_init_entity_util_avg(se); | ||
8568 | raw_spin_unlock_irq(&rq->lock); | ||
8569 | } | 8625 | } |
8570 | 8626 | ||
8571 | return 1; | 8627 | return 1; |
@@ -8576,6 +8632,23 @@ err: | |||
8576 | return 0; | 8632 | return 0; |
8577 | } | 8633 | } |
8578 | 8634 | ||
8635 | void online_fair_sched_group(struct task_group *tg) | ||
8636 | { | ||
8637 | struct sched_entity *se; | ||
8638 | struct rq *rq; | ||
8639 | int i; | ||
8640 | |||
8641 | for_each_possible_cpu(i) { | ||
8642 | rq = cpu_rq(i); | ||
8643 | se = tg->se[i]; | ||
8644 | |||
8645 | raw_spin_lock_irq(&rq->lock); | ||
8646 | post_init_entity_util_avg(se); | ||
8647 | sync_throttle(tg, i); | ||
8648 | raw_spin_unlock_irq(&rq->lock); | ||
8649 | } | ||
8650 | } | ||
8651 | |||
8579 | void unregister_fair_sched_group(struct task_group *tg) | 8652 | void unregister_fair_sched_group(struct task_group *tg) |
8580 | { | 8653 | { |
8581 | unsigned long flags; | 8654 | unsigned long flags; |
@@ -8680,6 +8753,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8680 | return 1; | 8753 | return 1; |
8681 | } | 8754 | } |
8682 | 8755 | ||
8756 | void online_fair_sched_group(struct task_group *tg) { } | ||
8757 | |||
8683 | void unregister_fair_sched_group(struct task_group *tg) { } | 8758 | void unregister_fair_sched_group(struct task_group *tg) { } |
8684 | 8759 | ||
8685 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 8760 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
@@ -8739,7 +8814,7 @@ const struct sched_class fair_sched_class = { | |||
8739 | .update_curr = update_curr_fair, | 8814 | .update_curr = update_curr_fair, |
8740 | 8815 | ||
8741 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8816 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8742 | .task_move_group = task_move_group_fair, | 8817 | .task_change_group = task_change_group_fair, |
8743 | #endif | 8818 | #endif |
8744 | }; | 8819 | }; |
8745 | 8820 | ||
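The kernel/sched/fair.c hunks above replace the single task_move_group hook with a task_change_group dispatcher keyed on TASK_SET_GROUP vs. TASK_MOVE_GROUP, and split group bring-up into a separate online_fair_sched_group() step. A minimal sketch of the caller side, assuming a helper along the lines of sched_change_group() in kernel/sched/core.c (that file's hunks are not part of this excerpt, so the exact shape may differ):

	/*
	 * Hedged sketch only: illustrates how the two hook types are meant
	 * to be used. sched_change_group() is an assumed helper name here.
	 */
	static void sched_change_group(struct task_struct *tsk, int type)
	{
	#ifdef CONFIG_FAIR_GROUP_SCHED
		if (tsk->sched_class->task_change_group)
			tsk->sched_class->task_change_group(tsk, type);
		else
	#endif
			set_task_rq(tsk, task_cpu(tsk));
	}

	/*
	 * TASK_SET_GROUP: a freshly forked task only needs its group/cfs_rq
	 * pointers fixed up (task_set_group_fair), with no load detach/attach.
	 * TASK_MOVE_GROUP: a cgroup migration detaches the entity's load from
	 * the old cfs_rq and attaches it to the new one (task_move_group_fair).
	 */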
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c5aeedf4e93a..9fb873cfc75c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
@@ -201,6 +201,8 @@ exit_idle: | |||
201 | */ | 201 | */ |
202 | static void cpu_idle_loop(void) | 202 | static void cpu_idle_loop(void) |
203 | { | 203 | { |
204 | int cpu = smp_processor_id(); | ||
205 | |||
204 | while (1) { | 206 | while (1) { |
205 | /* | 207 | /* |
206 | * If the arch has a polling bit, we maintain an invariant: | 208 | * If the arch has a polling bit, we maintain an invariant: |
@@ -219,7 +221,7 @@ static void cpu_idle_loop(void) | |||
219 | check_pgt_cache(); | 221 | check_pgt_cache(); |
220 | rmb(); | 222 | rmb(); |
221 | 223 | ||
222 | if (cpu_is_offline(smp_processor_id())) { | 224 | if (cpu_is_offline(cpu)) { |
223 | cpuhp_report_idle_dead(); | 225 | cpuhp_report_idle_dead(); |
224 | arch_cpu_idle_dead(); | 226 | arch_cpu_idle_dead(); |
225 | } | 227 | } |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 81283592942b..c64fc5114004 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -321,6 +321,7 @@ extern int tg_nop(struct task_group *tg, void *data); | |||
321 | 321 | ||
322 | extern void free_fair_sched_group(struct task_group *tg); | 322 | extern void free_fair_sched_group(struct task_group *tg); |
323 | extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); | 323 | extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); |
324 | extern void online_fair_sched_group(struct task_group *tg); | ||
324 | extern void unregister_fair_sched_group(struct task_group *tg); | 325 | extern void unregister_fair_sched_group(struct task_group *tg); |
325 | extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | 326 | extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, |
326 | struct sched_entity *se, int cpu, | 327 | struct sched_entity *se, int cpu, |
@@ -437,7 +438,7 @@ struct cfs_rq { | |||
437 | 438 | ||
438 | u64 throttled_clock, throttled_clock_task; | 439 | u64 throttled_clock, throttled_clock_task; |
439 | u64 throttled_clock_task_time; | 440 | u64 throttled_clock_task_time; |
440 | int throttled, throttle_count, throttle_uptodate; | 441 | int throttled, throttle_count; |
441 | struct list_head throttled_list; | 442 | struct list_head throttled_list; |
442 | #endif /* CONFIG_CFS_BANDWIDTH */ | 443 | #endif /* CONFIG_CFS_BANDWIDTH */ |
443 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 444 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
@@ -1246,8 +1247,11 @@ struct sched_class { | |||
1246 | 1247 | ||
1247 | void (*update_curr) (struct rq *rq); | 1248 | void (*update_curr) (struct rq *rq); |
1248 | 1249 | ||
1250 | #define TASK_SET_GROUP 0 | ||
1251 | #define TASK_MOVE_GROUP 1 | ||
1252 | |||
1249 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1253 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1250 | void (*task_move_group) (struct task_struct *p); | 1254 | void (*task_change_group) (struct task_struct *p, int type); |
1251 | #endif | 1255 | #endif |
1252 | }; | 1256 | }; |
1253 | 1257 | ||
@@ -1809,16 +1813,3 @@ static inline void cpufreq_trigger_update(u64 time) {} | |||
1809 | #else /* arch_scale_freq_capacity */ | 1813 | #else /* arch_scale_freq_capacity */ |
1810 | #define arch_scale_freq_invariant() (false) | 1814 | #define arch_scale_freq_invariant() (false) |
1811 | #endif | 1815 | #endif |
1812 | |||
1813 | static inline void account_reset_rq(struct rq *rq) | ||
1814 | { | ||
1815 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
1816 | rq->prev_irq_time = 0; | ||
1817 | #endif | ||
1818 | #ifdef CONFIG_PARAVIRT | ||
1819 | rq->prev_steal_time = 0; | ||
1820 | #endif | ||
1821 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | ||
1822 | rq->prev_steal_time_rq = 0; | ||
1823 | #endif | ||
1824 | } | ||
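With online_fair_sched_group() now declared in kernel/sched/sched.h, group creation splits into an allocation step and an online step that takes each rq->lock to run post_init_entity_util_avg() and sync_throttle(). A hedged sketch of the expected ordering (make_group() is a hypothetical wrapper; the real call sites live in kernel/sched/core.c, outside this excerpt):

	/* Hypothetical wrapper, for illustration only. */
	static int make_group(struct task_group *tg, struct task_group *parent)
	{
		/* alloc_fair_sched_group() returns 1 on success, 0 on failure. */
		if (!alloc_fair_sched_group(tg, parent))
			return -ENOMEM;

		/*
		 * Runs after the per-CPU cfs_rq/se pairs exist: seeds the
		 * initial util_avg and copies the parent's throttle_count
		 * under each rq->lock.
		 */
		online_fair_sched_group(tg);
		return 0;
	}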