author     Linus Torvalds <torvalds@linux-foundation.org>   2012-12-11 21:21:38 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2012-12-11 21:21:38 -0500
commit     f57d54bab696133fae569c5f01352249c36fc74f (patch)
tree       8ebe3c6deaf95c424c86843c3d290fbf2a9e80d2 /kernel/sched
parent     da830e589a45f0c42eef6f3cbd07275f8893f181 (diff)
parent     c1ad41f1f7270c1956da13fa8fd59d8d5929d56e (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
 "The biggest change affects group scheduling: we now track the runnable
  average on a per-task entity basis, allowing a smoother, exponential
  decay average based load/weight estimation instead of the previous
  binary on-the-runqueue/off-the-runqueue load weight method.

  This will inevitably disturb workloads that were in some sort of
  borderline balancing state or unstable equilibrium, so an eye has to
  be kept on regressions.

  For that reason the new load average is only limited to group
  scheduling (shares distribution) at the moment (which was also hurting
  the most from the prior, crude weight calculation and whose scheduling
  quality wins most from this change) - but we plan to extend this to
  regular SMP balancing as well in the future, which will simplify and
  speed up things a bit.

  Other changes involve ongoing preparatory work to extend NOHZ to the
  scheduler as well, eventually allowing completely irq-free user-space
  execution."

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (33 commits)
  Revert "sched/autogroup: Fix crash on reboot when autogroup is disabled"
  cputime: Comment cputime's adjusting code
  cputime: Consolidate cputime adjustment code
  cputime: Rename thread_group_times to thread_group_cputime_adjusted
  cputime: Move thread_group_cputime() to sched code
  vtime: Warn if irqs aren't disabled on system time accounting APIs
  vtime: No need to disable irqs on vtime_account()
  vtime: Consolidate a bit the ctx switch code
  vtime: Explicitly account pending user time on process tick
  vtime: Remove the underscore prefix invasion
  sched/autogroup: Fix crash on reboot when autogroup is disabled
  cputime: Separate irqtime accounting from generic vtime
  cputime: Specialize irq vtime hooks
  kvm: Directly account vtime to system on guest switch
  vtime: Make vtime_account_system() irqsafe
  vtime: Gather vtime declarations to their own header file
  sched: Describe CFS load-balancer
  sched: Introduce temporary FAIR_GROUP_SCHED dependency for load-tracking
  sched: Make __update_entity_runnable_avg() fast
  sched: Update_cfs_shares at period edge
  ...
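The per-entity tracking described above replaces the old binary on/off-runqueue weight with a geometric-series average of runnable time. A minimal standalone sketch of the idea (a toy model only, not code from this series; the 1024-unit periods and y^32 = 0.5 follow the comments added to kernel/sched/fair.c below):

#include <math.h>
#include <stdio.h>

int main(void)
{
	const double y = pow(0.5, 1.0 / 32.0);	/* decay per ~1ms period, y^32 = 0.5 */
	double sum = 0.0, period = 0.0;
	int i;

	/* an entity runnable every other period: the average settles near 0.5 */
	for (i = 0; i < 1000; i++) {
		sum = sum * y + (i % 2 ? 0.0 : 1024.0);
		period = period * y + 1024.0;
	}
	printf("runnable fraction ~= %.3f\n", sum / period);
	return 0;
}

The kernel does the same thing in fixed point with precomputed tables; see decay_load() in the fair.c hunks below.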
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/auto_group.c  |   4
-rw-r--r--  kernel/sched/auto_group.h  |   5
-rw-r--r--  kernel/sched/core.c        |  11
-rw-r--r--  kernel/sched/cputime.c     | 131
-rw-r--r--  kernel/sched/debug.c       |  36
-rw-r--r--  kernel/sched/fair.c        | 914
-rw-r--r--  kernel/sched/features.h    |   5
-rw-r--r--  kernel/sched/sched.h       |  60
8 files changed, 929 insertions, 237 deletions
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 15f60d01198b..0984a21076a3 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -143,11 +143,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
143 143
144 p->signal->autogroup = autogroup_kref_get(ag); 144 p->signal->autogroup = autogroup_kref_get(ag);
145 145
146 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
147 goto out;
148
146 t = p; 149 t = p;
147 do { 150 do {
148 sched_move_task(t); 151 sched_move_task(t);
149 } while_each_thread(p, t); 152 } while_each_thread(p, t);
150 153
154out:
151 unlock_task_sighand(p, &flags); 155 unlock_task_sighand(p, &flags);
152 autogroup_kref_put(prev); 156 autogroup_kref_put(prev);
153} 157}
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
index 443232ebbb53..8bd047142816 100644
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/auto_group.h
@@ -4,6 +4,11 @@
4#include <linux/rwsem.h> 4#include <linux/rwsem.h>
5 5
6struct autogroup { 6struct autogroup {
7 /*
8 * reference doesn't mean how many thread attach to this
9 * autogroup now. It just stands for the number of task
10 * could use this autogroup.
11 */
7 struct kref kref; 12 struct kref kref;
8 struct task_group *tg; 13 struct task_group *tg;
9 struct rw_semaphore lock; 14 struct rw_semaphore lock;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 80f80dfca70e..f5066a61f971 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -953,6 +953,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
953 trace_sched_migrate_task(p, new_cpu); 953 trace_sched_migrate_task(p, new_cpu);
954 954
955 if (task_cpu(p) != new_cpu) { 955 if (task_cpu(p) != new_cpu) {
956 if (p->sched_class->migrate_task_rq)
957 p->sched_class->migrate_task_rq(p, new_cpu);
956 p->se.nr_migrations++; 958 p->se.nr_migrations++;
957 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); 959 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
958 } 960 }
@@ -1525,6 +1527,15 @@ static void __sched_fork(struct task_struct *p)
1525 p->se.vruntime = 0; 1527 p->se.vruntime = 0;
1526 INIT_LIST_HEAD(&p->se.group_node); 1528 INIT_LIST_HEAD(&p->se.group_node);
1527 1529
1530/*
1531 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
1532 * removed when useful for applications beyond shares distribution (e.g.
1533 * load-balance).
1534 */
1535#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1536 p->se.avg.runnable_avg_period = 0;
1537 p->se.avg.runnable_avg_sum = 0;
1538#endif
1528#ifdef CONFIG_SCHEDSTATS 1539#ifdef CONFIG_SCHEDSTATS
1529 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1540 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1530#endif 1541#endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 81b763ba58a6..293b202fcf79 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -43,7 +43,7 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq);
43 * Called before incrementing preempt_count on {soft,}irq_enter 43 * Called before incrementing preempt_count on {soft,}irq_enter
44 * and before decrementing preempt_count on {soft,}irq_exit. 44 * and before decrementing preempt_count on {soft,}irq_exit.
45 */ 45 */
46void vtime_account(struct task_struct *curr) 46void irqtime_account_irq(struct task_struct *curr)
47{ 47{
48 unsigned long flags; 48 unsigned long flags;
49 s64 delta; 49 s64 delta;
@@ -73,7 +73,7 @@ void vtime_account(struct task_struct *curr)
73 irq_time_write_end(); 73 irq_time_write_end();
74 local_irq_restore(flags); 74 local_irq_restore(flags);
75} 75}
76EXPORT_SYMBOL_GPL(vtime_account); 76EXPORT_SYMBOL_GPL(irqtime_account_irq);
77 77
78static int irqtime_account_hi_update(void) 78static int irqtime_account_hi_update(void)
79{ 79{
@@ -288,6 +288,34 @@ static __always_inline bool steal_account_process_tick(void)
288 return false; 288 return false;
289} 289}
290 290
291/*
292 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
293 * tasks (sum on group iteration) belonging to @tsk's group.
294 */
295void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
296{
297 struct signal_struct *sig = tsk->signal;
298 struct task_struct *t;
299
300 times->utime = sig->utime;
301 times->stime = sig->stime;
302 times->sum_exec_runtime = sig->sum_sched_runtime;
303
304 rcu_read_lock();
305 /* make sure we can trust tsk->thread_group list */
306 if (!likely(pid_alive(tsk)))
307 goto out;
308
309 t = tsk;
310 do {
311 times->utime += t->utime;
312 times->stime += t->stime;
313 times->sum_exec_runtime += task_sched_runtime(t);
314 } while_each_thread(tsk, t);
315out:
316 rcu_read_unlock();
317}
318
291#ifndef CONFIG_VIRT_CPU_ACCOUNTING 319#ifndef CONFIG_VIRT_CPU_ACCOUNTING
292 320
293#ifdef CONFIG_IRQ_TIME_ACCOUNTING 321#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -417,13 +445,13 @@ void account_idle_ticks(unsigned long ticks)
417 * Use precise platform statistics if available: 445 * Use precise platform statistics if available:
418 */ 446 */
419#ifdef CONFIG_VIRT_CPU_ACCOUNTING 447#ifdef CONFIG_VIRT_CPU_ACCOUNTING
420void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 448void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
421{ 449{
422 *ut = p->utime; 450 *ut = p->utime;
423 *st = p->stime; 451 *st = p->stime;
424} 452}
425 453
426void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 454void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
427{ 455{
428 struct task_cputime cputime; 456 struct task_cputime cputime;
429 457
@@ -433,6 +461,29 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
433 *st = cputime.stime; 461 *st = cputime.stime;
434} 462}
435 463
464void vtime_account_system_irqsafe(struct task_struct *tsk)
465{
466 unsigned long flags;
467
468 local_irq_save(flags);
469 vtime_account_system(tsk);
470 local_irq_restore(flags);
471}
472EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
473
474#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
475void vtime_task_switch(struct task_struct *prev)
476{
477 if (is_idle_task(prev))
478 vtime_account_idle(prev);
479 else
480 vtime_account_system(prev);
481
482 vtime_account_user(prev);
483 arch_vtime_task_switch(prev);
484}
485#endif
486
436/* 487/*
437 * Archs that account the whole time spent in the idle task 488 * Archs that account the whole time spent in the idle task
438 * (outside irq) as idle time can rely on this and just implement 489 * (outside irq) as idle time can rely on this and just implement
@@ -444,16 +495,10 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
444#ifndef __ARCH_HAS_VTIME_ACCOUNT 495#ifndef __ARCH_HAS_VTIME_ACCOUNT
445void vtime_account(struct task_struct *tsk) 496void vtime_account(struct task_struct *tsk)
446{ 497{
447 unsigned long flags;
448
449 local_irq_save(flags);
450
451 if (in_interrupt() || !is_idle_task(tsk)) 498 if (in_interrupt() || !is_idle_task(tsk))
452 vtime_account_system(tsk); 499 vtime_account_system(tsk);
453 else 500 else
454 vtime_account_idle(tsk); 501 vtime_account_idle(tsk);
455
456 local_irq_restore(flags);
457} 502}
458EXPORT_SYMBOL_GPL(vtime_account); 503EXPORT_SYMBOL_GPL(vtime_account);
459#endif /* __ARCH_HAS_VTIME_ACCOUNT */ 504#endif /* __ARCH_HAS_VTIME_ACCOUNT */
@@ -478,14 +523,30 @@ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
478 return (__force cputime_t) temp; 523 return (__force cputime_t) temp;
479} 524}
480 525
481void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 526/*
527 * Adjust tick based cputime random precision against scheduler
528 * runtime accounting.
529 */
530static void cputime_adjust(struct task_cputime *curr,
531 struct cputime *prev,
532 cputime_t *ut, cputime_t *st)
482{ 533{
483 cputime_t rtime, utime = p->utime, total = utime + p->stime; 534 cputime_t rtime, utime, total;
535
536 utime = curr->utime;
537 total = utime + curr->stime;
484 538
485 /* 539 /*
486 * Use CFS's precise accounting: 540 * Tick based cputime accounting depend on random scheduling
541 * timeslices of a task to be interrupted or not by the timer.
542 * Depending on these circumstances, the number of these interrupts
543 * may be over or under-optimistic, matching the real user and system
544 * cputime with a variable precision.
545 *
546 * Fix this by scaling these tick based values against the total
547 * runtime accounted by the CFS scheduler.
487 */ 548 */
488 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 549 rtime = nsecs_to_cputime(curr->sum_exec_runtime);
489 550
490 if (total) 551 if (total)
491 utime = scale_utime(utime, rtime, total); 552 utime = scale_utime(utime, rtime, total);
@@ -493,38 +554,36 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
493 utime = rtime; 554 utime = rtime;
494 555
495 /* 556 /*
496 * Compare with previous values, to keep monotonicity: 557 * If the tick based count grows faster than the scheduler one,
558 * the result of the scaling may go backward.
559 * Let's enforce monotonicity.
497 */ 560 */
498 p->prev_utime = max(p->prev_utime, utime); 561 prev->utime = max(prev->utime, utime);
499 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); 562 prev->stime = max(prev->stime, rtime - prev->utime);
500 563
501 *ut = p->prev_utime; 564 *ut = prev->utime;
502 *st = p->prev_stime; 565 *st = prev->stime;
566}
567
568void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
569{
570 struct task_cputime cputime = {
571 .utime = p->utime,
572 .stime = p->stime,
573 .sum_exec_runtime = p->se.sum_exec_runtime,
574 };
575
576 cputime_adjust(&cputime, &p->prev_cputime, ut, st);
503} 577}
504 578
505/* 579/*
506 * Must be called with siglock held. 580 * Must be called with siglock held.
507 */ 581 */
508void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 582void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
509{ 583{
510 struct signal_struct *sig = p->signal;
511 struct task_cputime cputime; 584 struct task_cputime cputime;
512 cputime_t rtime, utime, total;
513 585
514 thread_group_cputime(p, &cputime); 586 thread_group_cputime(p, &cputime);
515 587 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
516 total = cputime.utime + cputime.stime;
517 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
518
519 if (total)
520 utime = scale_utime(cputime.utime, rtime, total);
521 else
522 utime = rtime;
523
524 sig->prev_utime = max(sig->prev_utime, utime);
525 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
526
527 *ut = sig->prev_utime;
528 *st = sig->prev_stime;
529} 588}
530#endif 589#endif
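The cputime_adjust() code introduced above scales the noisy tick-based utime/stime samples so that their sum matches the scheduler's precise sum_exec_runtime, then clamps against the previously reported values so the result never goes backwards. A standalone sketch of that arithmetic in plain 64-bit nanoseconds (the function and variable names and the sample figures are illustrative only; the kernel's scale_utime() helper performs the multiplication safely for cputime_t):

#include <stdint.h>
#include <stdio.h>

static uint64_t max_u64(uint64_t a, uint64_t b)
{
	return a > b ? a : b;
}

static void adjust(uint64_t utime, uint64_t stime, uint64_t rtime,
		   uint64_t *prev_utime, uint64_t *prev_stime)
{
	uint64_t total = utime + stime;

	/* scale the tick samples against the precise runtime */
	if (total)
		utime = utime * rtime / total;
	else
		utime = rtime;

	/* enforce monotonicity: never report less than last time */
	*prev_utime = max_u64(*prev_utime, utime);
	*prev_stime = max_u64(*prev_stime, rtime - *prev_utime);
}

int main(void)
{
	uint64_t pu = 0, ps = 0;

	/* ticks saw 3ms user + 1ms system, CFS measured 5ms of runtime */
	adjust(3000000, 1000000, 5000000, &pu, &ps);
	printf("utime=%llu ns, stime=%llu ns\n",
	       (unsigned long long)pu, (unsigned long long)ps);
	return 0;
}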
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 6f79596e0ea9..2cd3c1b4e582 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec)
61static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) 61static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
62{ 62{
63 struct sched_entity *se = tg->se[cpu]; 63 struct sched_entity *se = tg->se[cpu];
64 if (!se)
65 return;
66 64
67#define P(F) \ 65#define P(F) \
68 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) 66 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
69#define PN(F) \ 67#define PN(F) \
70 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) 68 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
71 69
70 if (!se) {
71 struct sched_avg *avg = &cpu_rq(cpu)->avg;
72 P(avg->runnable_avg_sum);
73 P(avg->runnable_avg_period);
74 return;
75 }
76
77
72 PN(se->exec_start); 78 PN(se->exec_start);
73 PN(se->vruntime); 79 PN(se->vruntime);
74 PN(se->sum_exec_runtime); 80 PN(se->sum_exec_runtime);
@@ -85,6 +91,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
85 P(se->statistics.wait_count); 91 P(se->statistics.wait_count);
86#endif 92#endif
87 P(se->load.weight); 93 P(se->load.weight);
94#ifdef CONFIG_SMP
95 P(se->avg.runnable_avg_sum);
96 P(se->avg.runnable_avg_period);
97 P(se->avg.load_avg_contrib);
98 P(se->avg.decay_count);
99#endif
88#undef PN 100#undef PN
89#undef P 101#undef P
90} 102}
@@ -206,14 +218,18 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 218 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207#ifdef CONFIG_FAIR_GROUP_SCHED 219#ifdef CONFIG_FAIR_GROUP_SCHED
208#ifdef CONFIG_SMP 220#ifdef CONFIG_SMP
209 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", 221 SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg",
210 SPLIT_NS(cfs_rq->load_avg)); 222 cfs_rq->runnable_load_avg);
211 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", 223 SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg",
212 SPLIT_NS(cfs_rq->load_period)); 224 cfs_rq->blocked_load_avg);
213 SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", 225 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
214 cfs_rq->load_contribution); 226 atomic64_read(&cfs_rq->tg->load_avg));
215 SEQ_printf(m, " .%-30s: %d\n", "load_tg", 227 SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
216 atomic_read(&cfs_rq->tg->load_weight)); 228 cfs_rq->tg_load_contrib);
229 SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
230 cfs_rq->tg_runnable_contrib);
231 SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
232 atomic_read(&cfs_rq->tg->runnable_avg));
217#endif 233#endif
218 234
219 print_cfs_group_stats(m, cpu, cfs_rq->tg); 235 print_cfs_group_stats(m, cpu, cfs_rq->tg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a14b990..59e072b2db97 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -259,6 +259,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
259 return grp->my_q; 259 return grp->my_q;
260} 260}
261 261
262static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
263 int force_update);
264
262static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 265static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
263{ 266{
264 if (!cfs_rq->on_list) { 267 if (!cfs_rq->on_list) {
@@ -278,6 +281,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
278 } 281 }
279 282
280 cfs_rq->on_list = 1; 283 cfs_rq->on_list = 1;
284 /* We should have no load, but we need to update last_decay. */
285 update_cfs_rq_blocked_load(cfs_rq, 0);
281 } 286 }
282} 287}
283 288
@@ -653,9 +658,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
653 return calc_delta_fair(sched_slice(cfs_rq, se), se); 658 return calc_delta_fair(sched_slice(cfs_rq, se), se);
654} 659}
655 660
656static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
657static void update_cfs_shares(struct cfs_rq *cfs_rq);
658
659/* 661/*
660 * Update the current task's runtime statistics. Skip current tasks that 662 * Update the current task's runtime statistics. Skip current tasks that
661 * are not in our scheduling class. 663 * are not in our scheduling class.
@@ -675,10 +677,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
675 677
676 curr->vruntime += delta_exec_weighted; 678 curr->vruntime += delta_exec_weighted;
677 update_min_vruntime(cfs_rq); 679 update_min_vruntime(cfs_rq);
678
679#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
680 cfs_rq->load_unacc_exec_time += delta_exec;
681#endif
682} 680}
683 681
684static void update_curr(struct cfs_rq *cfs_rq) 682static void update_curr(struct cfs_rq *cfs_rq)
@@ -801,72 +799,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
801} 799}
802 800
803#ifdef CONFIG_FAIR_GROUP_SCHED 801#ifdef CONFIG_FAIR_GROUP_SCHED
804/* we need this in update_cfs_load and load-balance functions below */
805static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
806# ifdef CONFIG_SMP 802# ifdef CONFIG_SMP
807static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
808 int global_update)
809{
810 struct task_group *tg = cfs_rq->tg;
811 long load_avg;
812
813 load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
814 load_avg -= cfs_rq->load_contribution;
815
816 if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
817 atomic_add(load_avg, &tg->load_weight);
818 cfs_rq->load_contribution += load_avg;
819 }
820}
821
822static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
823{
824 u64 period = sysctl_sched_shares_window;
825 u64 now, delta;
826 unsigned long load = cfs_rq->load.weight;
827
828 if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
829 return;
830
831 now = rq_of(cfs_rq)->clock_task;
832 delta = now - cfs_rq->load_stamp;
833
834 /* truncate load history at 4 idle periods */
835 if (cfs_rq->load_stamp > cfs_rq->load_last &&
836 now - cfs_rq->load_last > 4 * period) {
837 cfs_rq->load_period = 0;
838 cfs_rq->load_avg = 0;
839 delta = period - 1;
840 }
841
842 cfs_rq->load_stamp = now;
843 cfs_rq->load_unacc_exec_time = 0;
844 cfs_rq->load_period += delta;
845 if (load) {
846 cfs_rq->load_last = now;
847 cfs_rq->load_avg += delta * load;
848 }
849
850 /* consider updating load contribution on each fold or truncate */
851 if (global_update || cfs_rq->load_period > period
852 || !cfs_rq->load_period)
853 update_cfs_rq_load_contribution(cfs_rq, global_update);
854
855 while (cfs_rq->load_period > period) {
856 /*
857 * Inline assembly required to prevent the compiler
858 * optimising this loop into a divmod call.
859 * See __iter_div_u64_rem() for another example of this.
860 */
861 asm("" : "+rm" (cfs_rq->load_period));
862 cfs_rq->load_period /= 2;
863 cfs_rq->load_avg /= 2;
864 }
865
866 if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
867 list_del_leaf_cfs_rq(cfs_rq);
868}
869
870static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) 803static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
871{ 804{
872 long tg_weight; 805 long tg_weight;
@@ -876,8 +809,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
876 * to gain a more accurate current total weight. See 809 * to gain a more accurate current total weight. See
877 * update_cfs_rq_load_contribution(). 810 * update_cfs_rq_load_contribution().
878 */ 811 */
879 tg_weight = atomic_read(&tg->load_weight); 812 tg_weight = atomic64_read(&tg->load_avg);
880 tg_weight -= cfs_rq->load_contribution; 813 tg_weight -= cfs_rq->tg_load_contrib;
881 tg_weight += cfs_rq->load.weight; 814 tg_weight += cfs_rq->load.weight;
882 815
883 return tg_weight; 816 return tg_weight;
@@ -901,27 +834,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
901 834
902 return shares; 835 return shares;
903} 836}
904
905static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
906{
907 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
908 update_cfs_load(cfs_rq, 0);
909 update_cfs_shares(cfs_rq);
910 }
911}
912# else /* CONFIG_SMP */ 837# else /* CONFIG_SMP */
913static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
914{
915}
916
917static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) 838static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
918{ 839{
919 return tg->shares; 840 return tg->shares;
920} 841}
921
922static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
923{
924}
925# endif /* CONFIG_SMP */ 842# endif /* CONFIG_SMP */
926static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 843static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
927 unsigned long weight) 844 unsigned long weight)
@@ -939,6 +856,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
939 account_entity_enqueue(cfs_rq, se); 856 account_entity_enqueue(cfs_rq, se);
940} 857}
941 858
859static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
860
942static void update_cfs_shares(struct cfs_rq *cfs_rq) 861static void update_cfs_shares(struct cfs_rq *cfs_rq)
943{ 862{
944 struct task_group *tg; 863 struct task_group *tg;
@@ -958,18 +877,478 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
958 reweight_entity(cfs_rq_of(se), se, shares); 877 reweight_entity(cfs_rq_of(se), se, shares);
959} 878}
960#else /* CONFIG_FAIR_GROUP_SCHED */ 879#else /* CONFIG_FAIR_GROUP_SCHED */
961static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) 880static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
962{ 881{
963} 882}
883#endif /* CONFIG_FAIR_GROUP_SCHED */
964 884
965static inline void update_cfs_shares(struct cfs_rq *cfs_rq) 885/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
886#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
887/*
888 * We choose a half-life close to 1 scheduling period.
889 * Note: The tables below are dependent on this value.
890 */
891#define LOAD_AVG_PERIOD 32
892#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
893#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
894
895/* Precomputed fixed inverse multiplies for multiplication by y^n */
896static const u32 runnable_avg_yN_inv[] = {
897 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
898 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
899 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
900 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
901 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
902 0x85aac367, 0x82cd8698,
903};
904
905/*
906 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
907 * over-estimates when re-combining.
908 */
909static const u32 runnable_avg_yN_sum[] = {
910 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
911 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
912 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
913};
914
915/*
916 * Approximate:
917 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
918 */
919static __always_inline u64 decay_load(u64 val, u64 n)
966{ 920{
921 unsigned int local_n;
922
923 if (!n)
924 return val;
925 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
926 return 0;
927
928 /* after bounds checking we can collapse to 32-bit */
929 local_n = n;
930
931 /*
932 * As y^PERIOD = 1/2, we can combine
933 * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
934 * With a look-up table which covers k^n (n<PERIOD)
935 *
936 * To achieve constant time decay_load.
937 */
938 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
939 val >>= local_n / LOAD_AVG_PERIOD;
940 local_n %= LOAD_AVG_PERIOD;
941 }
942
943 val *= runnable_avg_yN_inv[local_n];
944 /* We don't use SRR here since we always want to round down. */
945 return val >> 32;
967} 946}
968 947
969static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) 948/*
949 * For updates fully spanning n periods, the contribution to runnable
950 * average will be: \Sum 1024*y^n
951 *
952 * We can compute this reasonably efficiently by combining:
953 * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
954 */
955static u32 __compute_runnable_contrib(u64 n)
970{ 956{
957 u32 contrib = 0;
958
959 if (likely(n <= LOAD_AVG_PERIOD))
960 return runnable_avg_yN_sum[n];
961 else if (unlikely(n >= LOAD_AVG_MAX_N))
962 return LOAD_AVG_MAX;
963
964 /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
965 do {
966 contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
967 contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
968
969 n -= LOAD_AVG_PERIOD;
970 } while (n > LOAD_AVG_PERIOD);
971
972 contrib = decay_load(contrib, n);
973 return contrib + runnable_avg_yN_sum[n];
971} 974}
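As a numeric cross-check of the constants used by __compute_runnable_contrib() (a standalone floating-point sketch, not kernel code): summing 1024*y^k over one full LOAD_AVG_PERIOD lands close to runnable_avg_yN_sum[32] = 23371 (the kernel table is rounded down), and the infinite sum 1024/(1-y) is an upper bound on LOAD_AVG_MAX = 47742.

#include <math.h>
#include <stdio.h>

int main(void)
{
	const double y = pow(0.5, 1.0 / 32.0);
	double sum = 0.0;
	int k;

	for (k = 1; k <= 32; k++)
		sum += 1024.0 * pow(y, k);

	printf("1024 * sum(y^1..y^32) ~= %.0f\n", sum);
	printf("1024 / (1 - y)        ~= %.0f\n", 1024.0 / (1.0 - y));
	return 0;
}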
972#endif /* CONFIG_FAIR_GROUP_SCHED */ 975
976/*
977 * We can represent the historical contribution to runnable average as the
978 * coefficients of a geometric series. To do this we sub-divide our runnable
979 * history into segments of approximately 1ms (1024us); label the segment that
980 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
981 *
982 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
983 * p0 p1 p2
984 * (now) (~1ms ago) (~2ms ago)
985 *
986 * Let u_i denote the fraction of p_i that the entity was runnable.
987 *
988 * We then designate the fractions u_i as our co-efficients, yielding the
989 * following representation of historical load:
990 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
991 *
992 * We choose y based on the with of a reasonably scheduling period, fixing:
993 * y^32 = 0.5
994 *
995 * This means that the contribution to load ~32ms ago (u_32) will be weighted
996 * approximately half as much as the contribution to load within the last ms
997 * (u_0).
998 *
999 * When a period "rolls over" and we have new u_0`, multiplying the previous
1000 * sum again by y is sufficient to update:
1001 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
1002 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
1003 */
1004static __always_inline int __update_entity_runnable_avg(u64 now,
1005 struct sched_avg *sa,
1006 int runnable)
1007{
1008 u64 delta, periods;
1009 u32 runnable_contrib;
1010 int delta_w, decayed = 0;
1011
1012 delta = now - sa->last_runnable_update;
1013 /*
1014 * This should only happen when time goes backwards, which it
1015 * unfortunately does during sched clock init when we swap over to TSC.
1016 */
1017 if ((s64)delta < 0) {
1018 sa->last_runnable_update = now;
1019 return 0;
1020 }
1021
1022 /*
1023 * Use 1024ns as the unit of measurement since it's a reasonable
1024 * approximation of 1us and fast to compute.
1025 */
1026 delta >>= 10;
1027 if (!delta)
1028 return 0;
1029 sa->last_runnable_update = now;
1030
1031 /* delta_w is the amount already accumulated against our next period */
1032 delta_w = sa->runnable_avg_period % 1024;
1033 if (delta + delta_w >= 1024) {
1034 /* period roll-over */
1035 decayed = 1;
1036
1037 /*
1038 * Now that we know we're crossing a period boundary, figure
1039 * out how much from delta we need to complete the current
1040 * period and accrue it.
1041 */
1042 delta_w = 1024 - delta_w;
1043 if (runnable)
1044 sa->runnable_avg_sum += delta_w;
1045 sa->runnable_avg_period += delta_w;
1046
1047 delta -= delta_w;
1048
1049 /* Figure out how many additional periods this update spans */
1050 periods = delta / 1024;
1051 delta %= 1024;
1052
1053 sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
1054 periods + 1);
1055 sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
1056 periods + 1);
1057
1058 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
1059 runnable_contrib = __compute_runnable_contrib(periods);
1060 if (runnable)
1061 sa->runnable_avg_sum += runnable_contrib;
1062 sa->runnable_avg_period += runnable_contrib;
1063 }
1064
1065 /* Remainder of delta accrued against u_0` */
1066 if (runnable)
1067 sa->runnable_avg_sum += delta;
1068 sa->runnable_avg_period += delta;
1069
1070 return decayed;
1071}
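A worked example of the period bookkeeping in __update_entity_runnable_avg() above (the numbers are made up for illustration): suppose 300us were already accrued against the current 1024us period and 2500us of fully runnable time have passed since the last update.

#include <stdio.h>

int main(void)
{
	unsigned int delta = 2500;	/* us since the last update, all runnable */
	unsigned int delta_w = 300;	/* us already accrued in the current period */
	unsigned int periods;

	delta_w = 1024 - delta_w;	/* 724us close out the current period */
	delta -= delta_w;		/* 1776us remain */
	periods = delta / 1024;		/* 1 full period elapsed in between */
	delta %= 1024;			/* 752us start the new current period */

	printf("close=%uus, full periods=%u, remainder=%uus\n",
	       delta_w, periods, delta);
	return 0;
}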
1072
1073/* Synchronize an entity's decay with its parenting cfs_rq.*/
1074static inline u64 __synchronize_entity_decay(struct sched_entity *se)
1075{
1076 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1077 u64 decays = atomic64_read(&cfs_rq->decay_counter);
1078
1079 decays -= se->avg.decay_count;
1080 if (!decays)
1081 return 0;
1082
1083 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
1084 se->avg.decay_count = 0;
1085
1086 return decays;
1087}
1088
1089#ifdef CONFIG_FAIR_GROUP_SCHED
1090static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1091 int force_update)
1092{
1093 struct task_group *tg = cfs_rq->tg;
1094 s64 tg_contrib;
1095
1096 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
1097 tg_contrib -= cfs_rq->tg_load_contrib;
1098
1099 if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
1100 atomic64_add(tg_contrib, &tg->load_avg);
1101 cfs_rq->tg_load_contrib += tg_contrib;
1102 }
1103}
1104
1105/*
1106 * Aggregate cfs_rq runnable averages into an equivalent task_group
1107 * representation for computing load contributions.
1108 */
1109static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1110 struct cfs_rq *cfs_rq)
1111{
1112 struct task_group *tg = cfs_rq->tg;
1113 long contrib;
1114
1115 /* The fraction of a cpu used by this cfs_rq */
1116 contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
1117 sa->runnable_avg_period + 1);
1118 contrib -= cfs_rq->tg_runnable_contrib;
1119
1120 if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
1121 atomic_add(contrib, &tg->runnable_avg);
1122 cfs_rq->tg_runnable_contrib += contrib;
1123 }
1124}
1125
1126static inline void __update_group_entity_contrib(struct sched_entity *se)
1127{
1128 struct cfs_rq *cfs_rq = group_cfs_rq(se);
1129 struct task_group *tg = cfs_rq->tg;
1130 int runnable_avg;
1131
1132 u64 contrib;
1133
1134 contrib = cfs_rq->tg_load_contrib * tg->shares;
1135 se->avg.load_avg_contrib = div64_u64(contrib,
1136 atomic64_read(&tg->load_avg) + 1);
1137
1138 /*
1139 * For group entities we need to compute a correction term in the case
1140 * that they are consuming <1 cpu so that we would contribute the same
1141 * load as a task of equal weight.
1142 *
1143 * Explicitly co-ordinating this measurement would be expensive, but
1144 * fortunately the sum of each cpus contribution forms a usable
1145 * lower-bound on the true value.
1146 *
1147 * Consider the aggregate of 2 contributions. Either they are disjoint
1148 * (and the sum represents true value) or they are disjoint and we are
1149 * understating by the aggregate of their overlap.
1150 *
1151 * Extending this to N cpus, for a given overlap, the maximum amount we
1152 * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
1153 * cpus that overlap for this interval and w_i is the interval width.
1154 *
1155 * On a small machine; the first term is well-bounded which bounds the
1156 * total error since w_i is a subset of the period. Whereas on a
1157 * larger machine, while this first term can be larger, if w_i is the
1158 * of consequential size guaranteed to see n_i*w_i quickly converge to
1159 * our upper bound of 1-cpu.
1160 */
1161 runnable_avg = atomic_read(&tg->runnable_avg);
1162 if (runnable_avg < NICE_0_LOAD) {
1163 se->avg.load_avg_contrib *= runnable_avg;
1164 se->avg.load_avg_contrib >>= NICE_0_SHIFT;
1165 }
1166}
1167#else
1168static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1169 int force_update) {}
1170static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1171 struct cfs_rq *cfs_rq) {}
1172static inline void __update_group_entity_contrib(struct sched_entity *se) {}
1173#endif
1174
1175static inline void __update_task_entity_contrib(struct sched_entity *se)
1176{
1177 u32 contrib;
1178
1179 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
1180 contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
1181 contrib /= (se->avg.runnable_avg_period + 1);
1182 se->avg.load_avg_contrib = scale_load(contrib);
1183}
1184
1185/* Compute the current contribution to load_avg by se, return any delta */
1186static long __update_entity_load_avg_contrib(struct sched_entity *se)
1187{
1188 long old_contrib = se->avg.load_avg_contrib;
1189
1190 if (entity_is_task(se)) {
1191 __update_task_entity_contrib(se);
1192 } else {
1193 __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
1194 __update_group_entity_contrib(se);
1195 }
1196
1197 return se->avg.load_avg_contrib - old_contrib;
1198}
1199
1200static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
1201 long load_contrib)
1202{
1203 if (likely(load_contrib < cfs_rq->blocked_load_avg))
1204 cfs_rq->blocked_load_avg -= load_contrib;
1205 else
1206 cfs_rq->blocked_load_avg = 0;
1207}
1208
1209static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
1210
1211/* Update a sched_entity's runnable average */
1212static inline void update_entity_load_avg(struct sched_entity *se,
1213 int update_cfs_rq)
1214{
1215 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1216 long contrib_delta;
1217 u64 now;
1218
1219 /*
1220 * For a group entity we need to use their owned cfs_rq_clock_task() in
1221 * case they are the parent of a throttled hierarchy.
1222 */
1223 if (entity_is_task(se))
1224 now = cfs_rq_clock_task(cfs_rq);
1225 else
1226 now = cfs_rq_clock_task(group_cfs_rq(se));
1227
1228 if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
1229 return;
1230
1231 contrib_delta = __update_entity_load_avg_contrib(se);
1232
1233 if (!update_cfs_rq)
1234 return;
1235
1236 if (se->on_rq)
1237 cfs_rq->runnable_load_avg += contrib_delta;
1238 else
1239 subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
1240}
1241
1242/*
1243 * Decay the load contributed by all blocked children and account this so that
1244 * their contribution may appropriately discounted when they wake up.
1245 */
1246static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
1247{
1248 u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
1249 u64 decays;
1250
1251 decays = now - cfs_rq->last_decay;
1252 if (!decays && !force_update)
1253 return;
1254
1255 if (atomic64_read(&cfs_rq->removed_load)) {
1256 u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
1257 subtract_blocked_load_contrib(cfs_rq, removed_load);
1258 }
1259
1260 if (decays) {
1261 cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
1262 decays);
1263 atomic64_add(decays, &cfs_rq->decay_counter);
1264 cfs_rq->last_decay = now;
1265 }
1266
1267 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
1268 update_cfs_shares(cfs_rq);
1269}
1270
1271static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
1272{
1273 __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
1274 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
1275}
1276
1277/* Add the load generated by se into cfs_rq's child load-average */
1278static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1279 struct sched_entity *se,
1280 int wakeup)
1281{
1282 /*
1283 * We track migrations using entity decay_count <= 0, on a wake-up
1284 * migration we use a negative decay count to track the remote decays
1285 * accumulated while sleeping.
1286 */
1287 if (unlikely(se->avg.decay_count <= 0)) {
1288 se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
1289 if (se->avg.decay_count) {
1290 /*
1291 * In a wake-up migration we have to approximate the
1292 * time sleeping. This is because we can't synchronize
1293 * clock_task between the two cpus, and it is not
1294 * guaranteed to be read-safe. Instead, we can
1295 * approximate this using our carried decays, which are
1296 * explicitly atomically readable.
1297 */
1298 se->avg.last_runnable_update -= (-se->avg.decay_count)
1299 << 20;
1300 update_entity_load_avg(se, 0);
1301 /* Indicate that we're now synchronized and on-rq */
1302 se->avg.decay_count = 0;
1303 }
1304 wakeup = 0;
1305 } else {
1306 __synchronize_entity_decay(se);
1307 }
1308
1309 /* migrated tasks did not contribute to our blocked load */
1310 if (wakeup) {
1311 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
1312 update_entity_load_avg(se, 0);
1313 }
1314
1315 cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
1316 /* we force update consideration on load-balancer moves */
1317 update_cfs_rq_blocked_load(cfs_rq, !wakeup);
1318}
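The wake-up migration path above cannot compare clock_task across CPUs, so it approximates the time slept from the decays carried in avg.decay_count: each decay corresponds to one ~1ms (1 << 20 ns) period. Illustration only, with invented values:

#include <stdio.h>

int main(void)
{
	long long decay_count = -3;	/* periods decayed remotely while sleeping */
	long long rewind_ns = (-decay_count) << 20;

	printf("rewind last_runnable_update by %lld ns (~%lld ms)\n",
	       rewind_ns, rewind_ns / 1000000);
	return 0;
}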
1319
1320/*
1321 * Remove se's load from this cfs_rq child load-average, if the entity is
1322 * transitioning to a blocked state we track its projected decay using
1323 * blocked_load_avg.
1324 */
1325static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1326 struct sched_entity *se,
1327 int sleep)
1328{
1329 update_entity_load_avg(se, 1);
1330 /* we force update consideration on load-balancer moves */
1331 update_cfs_rq_blocked_load(cfs_rq, !sleep);
1332
1333 cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
1334 if (sleep) {
1335 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
1336 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
1337 } /* migrations, e.g. sleep=0 leave decay_count == 0 */
1338}
1339#else
1340static inline void update_entity_load_avg(struct sched_entity *se,
1341 int update_cfs_rq) {}
1342static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
1343static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1344 struct sched_entity *se,
1345 int wakeup) {}
1346static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1347 struct sched_entity *se,
1348 int sleep) {}
1349static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
1350 int force_update) {}
1351#endif
973 1352
974static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 1353static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
975{ 1354{
@@ -1096,9 +1475,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1096 * Update run-time statistics of the 'current'. 1475 * Update run-time statistics of the 'current'.
1097 */ 1476 */
1098 update_curr(cfs_rq); 1477 update_curr(cfs_rq);
1099 update_cfs_load(cfs_rq, 0);
1100 account_entity_enqueue(cfs_rq, se); 1478 account_entity_enqueue(cfs_rq, se);
1101 update_cfs_shares(cfs_rq); 1479 enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
1102 1480
1103 if (flags & ENQUEUE_WAKEUP) { 1481 if (flags & ENQUEUE_WAKEUP) {
1104 place_entity(cfs_rq, se, 0); 1482 place_entity(cfs_rq, se, 0);
@@ -1190,9 +1568,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1190 1568
1191 if (se != cfs_rq->curr) 1569 if (se != cfs_rq->curr)
1192 __dequeue_entity(cfs_rq, se); 1570 __dequeue_entity(cfs_rq, se);
1193 se->on_rq = 0;
1194 update_cfs_load(cfs_rq, 0);
1195 account_entity_dequeue(cfs_rq, se); 1571 account_entity_dequeue(cfs_rq, se);
1572 dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
1196 1573
1197 /* 1574 /*
1198 * Normalize the entity after updating the min_vruntime because the 1575 * Normalize the entity after updating the min_vruntime because the
@@ -1206,7 +1583,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1206 return_cfs_rq_runtime(cfs_rq); 1583 return_cfs_rq_runtime(cfs_rq);
1207 1584
1208 update_min_vruntime(cfs_rq); 1585 update_min_vruntime(cfs_rq);
1209 update_cfs_shares(cfs_rq); 1586 se->on_rq = 0;
1210} 1587}
1211 1588
1212/* 1589/*
@@ -1340,6 +1717,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1340 update_stats_wait_start(cfs_rq, prev); 1717 update_stats_wait_start(cfs_rq, prev);
1341 /* Put 'current' back into the tree. */ 1718 /* Put 'current' back into the tree. */
1342 __enqueue_entity(cfs_rq, prev); 1719 __enqueue_entity(cfs_rq, prev);
1720 /* in !on_rq case, update occurred at dequeue */
1721 update_entity_load_avg(prev, 1);
1343 } 1722 }
1344 cfs_rq->curr = NULL; 1723 cfs_rq->curr = NULL;
1345} 1724}
@@ -1353,9 +1732,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1353 update_curr(cfs_rq); 1732 update_curr(cfs_rq);
1354 1733
1355 /* 1734 /*
1356 * Update share accounting for long-running entities. 1735 * Ensure that runnable average is periodically updated.
1357 */ 1736 */
1358 update_entity_shares_tick(cfs_rq); 1737 update_entity_load_avg(curr, 1);
1738 update_cfs_rq_blocked_load(cfs_rq, 1);
1359 1739
1360#ifdef CONFIG_SCHED_HRTICK 1740#ifdef CONFIG_SCHED_HRTICK
1361 /* 1741 /*
@@ -1448,6 +1828,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
1448 return &tg->cfs_bandwidth; 1828 return &tg->cfs_bandwidth;
1449} 1829}
1450 1830
1831/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
1832static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
1833{
1834 if (unlikely(cfs_rq->throttle_count))
1835 return cfs_rq->throttled_clock_task;
1836
1837 return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
1838}
1839
1451/* returns 0 on failure to allocate runtime */ 1840/* returns 0 on failure to allocate runtime */
1452static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) 1841static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1453{ 1842{
@@ -1592,14 +1981,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
1592 cfs_rq->throttle_count--; 1981 cfs_rq->throttle_count--;
1593#ifdef CONFIG_SMP 1982#ifdef CONFIG_SMP
1594 if (!cfs_rq->throttle_count) { 1983 if (!cfs_rq->throttle_count) {
1595 u64 delta = rq->clock_task - cfs_rq->load_stamp; 1984 /* adjust cfs_rq_clock_task() */
1596 1985 cfs_rq->throttled_clock_task_time += rq->clock_task -
1597 /* leaving throttled state, advance shares averaging windows */ 1986 cfs_rq->throttled_clock_task;
1598 cfs_rq->load_stamp += delta;
1599 cfs_rq->load_last += delta;
1600
1601 /* update entity weight now that we are on_rq again */
1602 update_cfs_shares(cfs_rq);
1603 } 1987 }
1604#endif 1988#endif
1605 1989
@@ -1611,9 +1995,9 @@ static int tg_throttle_down(struct task_group *tg, void *data)
1611 struct rq *rq = data; 1995 struct rq *rq = data;
1612 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; 1996 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1613 1997
1614 /* group is entering throttled state, record last load */ 1998 /* group is entering throttled state, stop time */
1615 if (!cfs_rq->throttle_count) 1999 if (!cfs_rq->throttle_count)
1616 update_cfs_load(cfs_rq, 0); 2000 cfs_rq->throttled_clock_task = rq->clock_task;
1617 cfs_rq->throttle_count++; 2001 cfs_rq->throttle_count++;
1618 2002
1619 return 0; 2003 return 0;
@@ -1628,7 +2012,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1628 2012
1629 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; 2013 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1630 2014
1631 /* account load preceding throttle */ 2015 /* freeze hierarchy runnable averages while throttled */
1632 rcu_read_lock(); 2016 rcu_read_lock();
1633 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); 2017 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
1634 rcu_read_unlock(); 2018 rcu_read_unlock();
@@ -1652,7 +2036,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1652 rq->nr_running -= task_delta; 2036 rq->nr_running -= task_delta;
1653 2037
1654 cfs_rq->throttled = 1; 2038 cfs_rq->throttled = 1;
1655 cfs_rq->throttled_timestamp = rq->clock; 2039 cfs_rq->throttled_clock = rq->clock;
1656 raw_spin_lock(&cfs_b->lock); 2040 raw_spin_lock(&cfs_b->lock);
1657 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 2041 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
1658 raw_spin_unlock(&cfs_b->lock); 2042 raw_spin_unlock(&cfs_b->lock);
@@ -1670,10 +2054,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1670 2054
1671 cfs_rq->throttled = 0; 2055 cfs_rq->throttled = 0;
1672 raw_spin_lock(&cfs_b->lock); 2056 raw_spin_lock(&cfs_b->lock);
1673 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; 2057 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
1674 list_del_rcu(&cfs_rq->throttled_list); 2058 list_del_rcu(&cfs_rq->throttled_list);
1675 raw_spin_unlock(&cfs_b->lock); 2059 raw_spin_unlock(&cfs_b->lock);
1676 cfs_rq->throttled_timestamp = 0;
1677 2060
1678 update_rq_clock(rq); 2061 update_rq_clock(rq);
1679 /* update hierarchical throttle state */ 2062 /* update hierarchical throttle state */
@@ -2073,8 +2456,13 @@ static void unthrottle_offline_cfs_rqs(struct rq *rq)
2073} 2456}
2074 2457
2075#else /* CONFIG_CFS_BANDWIDTH */ 2458#else /* CONFIG_CFS_BANDWIDTH */
2076static __always_inline 2459static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2077void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} 2460{
2461 return rq_of(cfs_rq)->clock_task;
2462}
2463
2464static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2465 unsigned long delta_exec) {}
2078static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2466static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2079static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 2467static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2080static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2468static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -2207,12 +2595,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2207 if (cfs_rq_throttled(cfs_rq)) 2595 if (cfs_rq_throttled(cfs_rq))
2208 break; 2596 break;
2209 2597
2210 update_cfs_load(cfs_rq, 0); 2598 update_entity_load_avg(se, 1);
2211 update_cfs_shares(cfs_rq); 2599 update_cfs_rq_blocked_load(cfs_rq, 0);
2212 } 2600 }
2213 2601
2214 if (!se) 2602 if (!se) {
2603 update_rq_runnable_avg(rq, rq->nr_running);
2215 inc_nr_running(rq); 2604 inc_nr_running(rq);
2605 }
2216 hrtick_update(rq); 2606 hrtick_update(rq);
2217} 2607}
2218 2608
@@ -2266,12 +2656,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2266 if (cfs_rq_throttled(cfs_rq)) 2656 if (cfs_rq_throttled(cfs_rq))
2267 break; 2657 break;
2268 2658
2269 update_cfs_load(cfs_rq, 0); 2659 update_entity_load_avg(se, 1);
2270 update_cfs_shares(cfs_rq); 2660 update_cfs_rq_blocked_load(cfs_rq, 0);
2271 } 2661 }
2272 2662
2273 if (!se) 2663 if (!se) {
2274 dec_nr_running(rq); 2664 dec_nr_running(rq);
2665 update_rq_runnable_avg(rq, 1);
2666 }
2275 hrtick_update(rq); 2667 hrtick_update(rq);
2276} 2668}
2277 2669
@@ -2781,6 +3173,37 @@ unlock:
2781 3173
2782 return new_cpu; 3174 return new_cpu;
2783} 3175}
3176
3177/*
3178 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
3179 * removed when useful for applications beyond shares distribution (e.g.
3180 * load-balance).
3181 */
3182#ifdef CONFIG_FAIR_GROUP_SCHED
3183/*
3184 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
3185 * cfs_rq_of(p) references at time of call are still valid and identify the
3186 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
3187 * other assumptions, including the state of rq->lock, should be made.
3188 */
3189static void
3190migrate_task_rq_fair(struct task_struct *p, int next_cpu)
3191{
3192 struct sched_entity *se = &p->se;
3193 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3194
3195 /*
3196 * Load tracking: accumulate removed load so that it can be processed
3197 * when we next update owning cfs_rq under rq->lock. Tasks contribute
3198 * to blocked load iff they have a positive decay-count. It can never
3199 * be negative here since on-rq tasks have decay-count == 0.
3200 */
3201 if (se->avg.decay_count) {
3202 se->avg.decay_count = -__synchronize_entity_decay(se);
3203 atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
3204 }
3205}
3206#endif
2784#endif /* CONFIG_SMP */ 3207#endif /* CONFIG_SMP */
2785 3208
2786static unsigned long 3209static unsigned long
@@ -2907,7 +3330,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
2907 * Batch and idle tasks do not preempt non-idle tasks (their preemption 3330 * Batch and idle tasks do not preempt non-idle tasks (their preemption
2908 * is driven by the tick): 3331 * is driven by the tick):
2909 */ 3332 */
2910 if (unlikely(p->policy != SCHED_NORMAL)) 3333 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
2911 return; 3334 return;
2912 3335
2913 find_matching_se(&se, &pse); 3336 find_matching_se(&se, &pse);
@@ -3033,8 +3456,122 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
3033 3456
3034#ifdef CONFIG_SMP 3457#ifdef CONFIG_SMP
3035/************************************************** 3458/**************************************************
3036 * Fair scheduling class load-balancing methods: 3459 * Fair scheduling class load-balancing methods.
3037 */ 3460 *
3461 * BASICS
3462 *
3463 * The purpose of load-balancing is to achieve the same basic fairness the
3464 * per-cpu scheduler provides, namely provide a proportional amount of compute
3465 * time to each task. This is expressed in the following equation:
3466 *
3467 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
3468 *
3469 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
3470 * W_i,0 is defined as:
3471 *
3472 * W_i,0 = \Sum_j w_i,j (2)
3473 *
3474 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
3475 * is derived from the nice value as per prio_to_weight[].
3476 *
3477 * The weight average is an exponential decay average of the instantaneous
3478 * weight:
3479 *
3480 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
3481 *
3482 * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
3483 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
3484 * can also include other factors [XXX].
3485 *
3486 * To achieve this balance we define a measure of imbalance which follows
3487 * directly from (1):
3488 *
3489 * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
3490 *
3491 * We them move tasks around to minimize the imbalance. In the continuous
3492 * function space it is obvious this converges, in the discrete case we get
3493 * a few fun cases generally called infeasible weight scenarios.
3494 *
3495 * [XXX expand on:
3496 * - infeasible weights;
3497 * - local vs global optima in the discrete case. ]
3498 *
3499 *
3500 * SCHED DOMAINS
3501 *
3502 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
3503 * for all i,j solution, we create a tree of cpus that follows the hardware
3504 * topology where each level pairs two lower groups (or better). This results
3505 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
3506 * tree to only the first of the previous level and we decrease the frequency
3507 * of load-balance at each level inv. proportional to the number of cpus in
3508 * the groups.
3509 *
3510 * This yields:
3511 *
3512 * log_2 n 1 n
3513 * \Sum { --- * --- * 2^i } = O(n) (5)
3514 * i = 0 2^i 2^i
3515 * `- size of each group
3516 * | | `- number of cpus doing load-balance
3517 * | `- freq
3518 * `- sum over all levels
3519 *
3520 * Coupled with a limit on how many tasks we can migrate every balance pass,
3521 * this makes (5) the runtime complexity of the balancer.
3522 *
3523 * An important property here is that each CPU is still (indirectly) connected
3524 * to every other cpu in at most O(log n) steps:
3525 *
3526 * The adjacency matrix of the resulting graph is given by:
3527 *
3528 * log_2 n
3529 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
3530 * k = 0
3531 *
3532 * And you'll find that:
3533 *
3534 * A^(log_2 n)_i,j != 0 for all i,j (7)
3535 *
3536 * Showing there's indeed a path between every cpu in at most O(log n) steps.
3537 * The task movement gives a factor of O(m), giving a convergence complexity
3538 * of:
3539 *
3540 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
3541 *
3542 *
3543 * WORK CONSERVING
3544 *
3545 * In order to avoid CPUs going idle while there's still work to do, new idle
3546 * balancing is more aggressive and has the newly idle cpu iterate up the domain
3547 * tree itself instead of relying on other CPUs to bring it work.
3548 *
3549 * This adds some complexity to both (5) and (8) but it reduces the total idle
3550 * time.
3551 *
3552 * [XXX more?]
3553 *
3554 *
3555 * CGROUPS
3556 *
3557 * Cgroups make a horror show out of (2), instead of a simple sum we get:
3558 *
3559 * s_k,i
3560 * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
3561 * S_k
3562 *
3563 * Where
3564 *
3565 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
3566 *
3567 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
3568 *
3569 * The big problem is S_k, its a global sum needed to compute a local (W_i)
3570 * property.
3571 *
3572 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
3573 * rewrite all of this once again.]
3574 */
3038 3575
3039static unsigned long __read_mostly max_load_balance_interval = HZ/10; 3576static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3040 3577
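As a quick check of equation (5) in the load-balancing overview above (a standalone sketch, not part of the patch): level i is balanced at frequency 1/2^i by n/2^i cpus over groups of size 2^i, so the per-level cost is n/2^i and the total stays below 2n, i.e. O(n).

#include <stdio.h>

int main(void)
{
	unsigned int n = 1024;		/* number of cpus, for illustration */
	unsigned int i, work = 0;

	for (i = 0; (1u << i) <= n; i++)
		work += n >> i;		/* (1/2^i) * (n/2^i) * 2^i */

	printf("n=%u: total balance work ~ %u (< 2n = %u)\n", n, work, 2 * n);
	return 0;
}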
@@ -3300,52 +3837,58 @@ next:
 /*
  * update tg->load_weight by folding this cpu's load_avg
  */
-static int update_shares_cpu(struct task_group *tg, int cpu)
+static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
 {
-	struct cfs_rq *cfs_rq;
-	unsigned long flags;
-	struct rq *rq;
-
-	if (!tg->se[cpu])
-		return 0;
-
-	rq = cpu_rq(cpu);
-	cfs_rq = tg->cfs_rq[cpu];
-
-	raw_spin_lock_irqsave(&rq->lock, flags);
+	struct sched_entity *se = tg->se[cpu];
+	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
 
-	update_rq_clock(rq);
-	update_cfs_load(cfs_rq, 1);
+	/* throttled entities do not contribute to load */
+	if (throttled_hierarchy(cfs_rq))
+		return;
 
-	/*
-	 * We need to update shares after updating tg->load_weight in
-	 * order to adjust the weight of groups with long running tasks.
-	 */
-	update_cfs_shares(cfs_rq);
+	update_cfs_rq_blocked_load(cfs_rq, 1);
 
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
-
-	return 0;
+	if (se) {
+		update_entity_load_avg(se, 1);
+		/*
+		 * We pivot on our runnable average having decayed to zero for
+		 * list removal. This generally implies that all our children
+		 * have also been removed (modulo rounding error or bandwidth
+		 * control); however, such cases are rare and we can fix these
+		 * at enqueue.
+		 *
+		 * TODO: fix up out-of-order children on enqueue.
+		 */
+		if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
+			list_del_leaf_cfs_rq(cfs_rq);
+	} else {
+		struct rq *rq = rq_of(cfs_rq);
+		update_rq_runnable_avg(rq, rq->nr_running);
+	}
 }
 
-static void update_shares(int cpu)
+static void update_blocked_averages(int cpu)
 {
-	struct cfs_rq *cfs_rq;
 	struct rq *rq = cpu_rq(cpu);
+	struct cfs_rq *cfs_rq;
+	unsigned long flags;
 
-	rcu_read_lock();
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	update_rq_clock(rq);
 	/*
 	 * Iterates the task_group tree in a bottom up fashion, see
 	 * list_add_leaf_cfs_rq() for details.
 	 */
 	for_each_leaf_cfs_rq(rq, cfs_rq) {
-		/* throttled entities do not contribute to load */
-		if (throttled_hierarchy(cfs_rq))
-			continue;
-
-		update_shares_cpu(cfs_rq->tg, cpu);
+		/*
+		 * Note: We may want to consider periodically releasing
+		 * rq->lock about these updates so that creating many task
+		 * groups does not result in continually extending hold time.
+		 */
+		__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
 	}
-	rcu_read_unlock();
+
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
 /*
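For context on the update_cfs_rq_blocked_load() call above: the per-entity load-tracking series decays blocked load geometrically, using a factor y per roughly 1ms period chosen so that y^32 = 1/2, i.e. a ~32ms half-life. The constants are the series' choice and are not visible in this hunk; the kernel also works in precomputed fixed-point tables rather than floating point. A user-space sketch of the decay, for illustration only:

#include <math.h>
#include <stdio.h>

int main(void)
{
	double y = pow(0.5, 1.0 / 32.0);	/* assumed constant: y^32 == 1/2 */
	double contrib = 1024.0;		/* a blocked entity's load contribution */
	int ms;

	/* Each elapsed ~1ms period multiplies the remembered contribution by y,
	 * so half of it is gone after ~32ms, three quarters after ~64ms. */
	for (ms = 0; ms <= 128; ms += 32)
		printf("after %3d ms: %6.1f\n", ms, contrib * pow(y, ms));

	return 0;
}

Build with -lm; the output (1024, 512, 256, 128, 64) is what lets a cfs_rq's blocked sum die away smoothly instead of dropping to zero the moment its tasks sleep.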
@@ -3397,7 +3940,7 @@ static unsigned long task_h_load(struct task_struct *p)
 	return load;
 }
 #else
-static inline void update_shares(int cpu)
+static inline void update_blocked_averages(int cpu)
 {
 }
 
@@ -4457,12 +5000,14 @@ void idle_balance(int this_cpu, struct rq *this_rq)
 	if (this_rq->avg_idle < sysctl_sched_migration_cost)
 		return;
 
+	update_rq_runnable_avg(this_rq, 1);
+
 	/*
 	 * Drop the rq->lock, but keep IRQ/preempt disabled.
 	 */
 	raw_spin_unlock(&this_rq->lock);
 
-	update_shares(this_cpu);
+	update_blocked_averages(this_cpu);
 	rcu_read_lock();
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
@@ -4717,7 +5262,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 	int update_next_balance = 0;
 	int need_serialize;
 
-	update_shares(cpu);
+	update_blocked_averages(cpu);
 
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
@@ -4954,6 +5499,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		cfs_rq = cfs_rq_of(se);
 		entity_tick(cfs_rq, se, queued);
 	}
+
+	update_rq_runnable_avg(rq, 1);
 }
 
 /*
@@ -5046,6 +5593,20 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
 		place_entity(cfs_rq, se, 0);
 		se->vruntime -= cfs_rq->min_vruntime;
 	}
+
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+	/*
+	 * Remove our load from contribution when we leave sched_fair
+	 * and ensure we don't carry in an old decay_count if we
+	 * switch back.
+	 */
+	if (p->se.avg.decay_count) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+		__synchronize_entity_decay(&p->se);
+		subtract_blocked_load_contrib(cfs_rq,
+				p->se.avg.load_avg_contrib);
+	}
+#endif
 }
 
 /*
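A toy model of the bookkeeping switched_from_fair() performs above may help: a sleeping entity records the decay generation it last saw in decay_count; when it leaves the fair class, __synchronize_entity_decay() charges it for the generations it missed, and the now-stale contribution is subtracted from the cfs_rq's blocked sum. The sketch below is not kernel code; the toy_* names, the generation-per-millisecond assumption, and the coarse halving-per-32-generations decay are my own simplifications of the surrounding series.

#include <stdio.h>

struct toy_cfs_rq {
	unsigned long long decay_counter;	/* assumed to advance ~once per ms */
	long blocked_load_avg;
};

struct toy_se {
	unsigned long long decay_count;		/* generation last synced against */
	long load_avg_contrib;
};

/* Coarse model of decay: halve once per 32 missed generations (the real
 * code also applies the fractional remainder via fixed-point tables). */
static long toy_decay(long contrib, unsigned long long missed)
{
	while (missed >= 32) {
		contrib >>= 1;
		missed -= 32;
	}
	return contrib;
}

static void toy_synchronize_entity_decay(struct toy_se *se, struct toy_cfs_rq *cfs_rq)
{
	unsigned long long missed = cfs_rq->decay_counter - se->decay_count;

	se->load_avg_contrib = toy_decay(se->load_avg_contrib, missed);
	se->decay_count = cfs_rq->decay_counter;
}

int main(void)
{
	struct toy_cfs_rq cfs_rq = { .decay_counter = 96, .blocked_load_avg = 1000 };
	struct toy_se se = { .decay_count = 0, .load_avg_contrib = 800 };

	/* Leaving the fair class: sync decay first, then drop our blocked share. */
	toy_synchronize_entity_decay(&se, &cfs_rq);
	cfs_rq.blocked_load_avg -= se.load_avg_contrib;
	if (cfs_rq.blocked_load_avg < 0)
		cfs_rq.blocked_load_avg = 0;

	printf("contrib after 96 missed generations: %ld\n", se.load_avg_contrib); /* 100 */
	printf("blocked load left behind:            %ld\n", cfs_rq.blocked_load_avg); /* 900 */
	return 0;
}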
@@ -5092,11 +5653,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 #ifndef CONFIG_64BIT
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 #endif
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+	atomic64_set(&cfs_rq->decay_counter, 1);
+	atomic64_set(&cfs_rq->removed_load, 0);
+#endif
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void task_move_group_fair(struct task_struct *p, int on_rq)
 {
+	struct cfs_rq *cfs_rq;
 	/*
 	 * If the task was not on the rq at the time of this cgroup movement
 	 * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -5128,8 +5694,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
 	if (!on_rq)
 		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
 	set_task_rq(p, task_cpu(p));
-	if (!on_rq)
-		p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
+	if (!on_rq) {
+		cfs_rq = cfs_rq_of(&p->se);
+		p->se.vruntime += cfs_rq->min_vruntime;
+#ifdef CONFIG_SMP
+		/*
+		 * migrate_task_rq_fair() will have removed our previous
+		 * contribution, but we must synchronize for ongoing future
+		 * decay.
+		 */
+		p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+		cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
+#endif
+	}
 }
 
 void free_fair_sched_group(struct task_group *tg)
@@ -5214,10 +5791,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 
 	cfs_rq->tg = tg;
 	cfs_rq->rq = rq;
-#ifdef CONFIG_SMP
-	/* allow initial update_cfs_load() to truncate */
-	cfs_rq->load_stamp = 1;
-#endif
 	init_cfs_rq_runtime(cfs_rq);
 
 	tg->cfs_rq[cpu] = cfs_rq;
@@ -5264,8 +5837,11 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 		se = tg->se[i];
 		/* Propagate contribution to hierarchy */
 		raw_spin_lock_irqsave(&rq->lock, flags);
-		for_each_sched_entity(se)
+		for_each_sched_entity(se) {
 			update_cfs_shares(group_cfs_rq(se));
+			/* update contribution to parent */
+			update_entity_load_avg(se, 1);
+		}
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 	}
 
@@ -5319,7 +5895,9 @@ const struct sched_class fair_sched_class = {
 
 #ifdef CONFIG_SMP
 	.select_task_rq		= select_task_rq_fair,
-
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	.migrate_task_rq	= migrate_task_rq_fair,
+#endif
 	.rq_online		= rq_online_fair,
 	.rq_offline		= rq_offline_fair,
 
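The newly wired-up .migrate_task_rq callback is what makes the atomic64_t removed_load field (added to struct cfs_rq below) necessary: the cpu a task is leaving may not hold the remote rq->lock, so it publishes the departing contribution in removed_load and the owning cpu folds it out of blocked_load_avg later, under its own lock. A minimal user-space sketch of that handoff follows; the toy_* names are invented, and the real kernel paths are migrate_task_rq_fair() and update_cfs_rq_blocked_load().

#include <stdatomic.h>
#include <stdio.h>

struct toy_cfs_rq {
	_Atomic long removed_load;	/* written lock-free by remote cpus */
	long blocked_load_avg;		/* only touched under the owner's rq->lock */
};

/* Remote side: a task migrates away, park its contribution for later removal. */
static void toy_migrate_out(struct toy_cfs_rq *cfs_rq, long contrib)
{
	atomic_fetch_add(&cfs_rq->removed_load, contrib);
}

/* Owner side: fold any parked removals out of the blocked sum. */
static void toy_fold_removed_load(struct toy_cfs_rq *cfs_rq)
{
	long removed = atomic_exchange(&cfs_rq->removed_load, 0);

	cfs_rq->blocked_load_avg -= removed;
	if (cfs_rq->blocked_load_avg < 0)
		cfs_rq->blocked_load_avg = 0;	/* guard against stale rounding */
}

int main(void)
{
	static struct toy_cfs_rq cfs_rq;	/* zero-initialized */

	cfs_rq.blocked_load_avg = 2048;
	toy_migrate_out(&cfs_rq, 512);		/* e.g. from another cpu */
	toy_fold_removed_load(&cfs_rq);		/* later, under the owner's lock */

	printf("blocked_load_avg = %ld\n", cfs_rq.blocked_load_avg);	/* 1536 */
	return 0;
}

The design point is that the expensive, racy part (the remote write) is a single atomic add, while all the arithmetic on blocked_load_avg stays serialized on the cpu that owns the runqueue.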
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index eebefcad7027..e68e69ab917d 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -32,6 +32,11 @@ SCHED_FEAT(LAST_BUDDY, true)
 SCHED_FEAT(CACHE_HOT_BUDDY, true)
 
 /*
+ * Allow wakeup-time preemption of the current task:
+ */
+SCHED_FEAT(WAKEUP_PREEMPTION, true)
+
+/*
  * Use arch dependent cpu power functions
  */
 SCHED_FEAT(ARCH_POWER, true)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7a7db09cfabc..5eca173b563f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -112,6 +112,8 @@ struct task_group {
 	unsigned long shares;
 
 	atomic_t load_weight;
+	atomic64_t load_avg;
+	atomic_t runnable_avg;
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -222,22 +224,29 @@ struct cfs_rq {
 	unsigned int nr_spread_over;
 #endif
 
+#ifdef CONFIG_SMP
+/*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
-
 	/*
-	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
-	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
-	 * (like users, containers etc.)
-	 *
-	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
-	 * list is used during load balance.
+	 * CFS Load tracking
+	 * Under CFS, load is tracked on a per-entity basis and aggregated up.
+	 * This allows for the description of both thread and group usage (in
+	 * the FAIR_GROUP_SCHED case).
 	 */
-	int on_list;
-	struct list_head leaf_cfs_rq_list;
-	struct task_group *tg;	/* group that "owns" this runqueue */
+	u64 runnable_load_avg, blocked_load_avg;
+	atomic64_t decay_counter, removed_load;
+	u64 last_decay;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+/* These always depend on CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	u32 tg_runnable_contrib;
+	u64 tg_load_contrib;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 
-#ifdef CONFIG_SMP
 	/*
 	 * h_load = weight * f(tg)
 	 *
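A rough picture of how the two new sums relate may be useful here: runnable_load_avg aggregates the load_avg_contrib of entities currently on the runqueue, while blocked_load_avg keeps the (decaying) contributions of entities that went to sleep, so together they account for everything charged to this cfs_rq. The sketch below uses my own toy_* names and ignores decay entirely; it is not the kernel's enqueue/dequeue code, just the bookkeeping invariant.

#include <stdio.h>

struct toy_se  { long load_avg_contrib; };
struct toy_cfs { long runnable_load_avg, blocked_load_avg; };

/* Wake up / enqueue: the contribution stops being counted as "blocked". */
static void toy_enqueue(struct toy_cfs *cfs, struct toy_se *se, int was_sleeping)
{
	if (was_sleeping) {
		cfs->blocked_load_avg -= se->load_avg_contrib;
		if (cfs->blocked_load_avg < 0)
			cfs->blocked_load_avg = 0;
	}
	cfs->runnable_load_avg += se->load_avg_contrib;
}

/* Go to sleep / dequeue: keep the contribution around so it can decay. */
static void toy_dequeue_sleep(struct toy_cfs *cfs, struct toy_se *se)
{
	cfs->runnable_load_avg -= se->load_avg_contrib;
	cfs->blocked_load_avg  += se->load_avg_contrib;
}

int main(void)
{
	struct toy_cfs cfs = { 0, 0 };
	struct toy_se a = { 700 }, b = { 300 };

	toy_enqueue(&cfs, &a, 0);
	toy_enqueue(&cfs, &b, 0);
	toy_dequeue_sleep(&cfs, &b);
	printf("runnable=%ld blocked=%ld\n",
	       cfs.runnable_load_avg, cfs.blocked_load_avg);	/* 700 300 */
	return 0;
}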
@@ -245,26 +254,30 @@ struct cfs_rq {
 	 * this group.
 	 */
 	unsigned long h_load;
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
 
 	/*
-	 * Maintaining per-cpu shares distribution for group scheduling
+	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
+	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
+	 * (like users, containers etc.)
 	 *
-	 * load_stamp is the last time we updated the load average
-	 * load_last is the last time we updated the load average and saw load
-	 * load_unacc_exec_time is currently unaccounted execution time
+	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
+	 * list is used during load balance.
 	 */
-	u64 load_avg;
-	u64 load_period;
-	u64 load_stamp, load_last, load_unacc_exec_time;
+	int on_list;
+	struct list_head leaf_cfs_rq_list;
+	struct task_group *tg;	/* group that "owns" this runqueue */
 
-	unsigned long load_contribution;
-#endif /* CONFIG_SMP */
 #ifdef CONFIG_CFS_BANDWIDTH
 	int runtime_enabled;
 	u64 runtime_expires;
 	s64 runtime_remaining;
 
-	u64 throttled_timestamp;
+	u64 throttled_clock, throttled_clock_task;
+	u64 throttled_clock_task_time;
 	int throttled, throttle_count;
 	struct list_head throttled_list;
 #endif /* CONFIG_CFS_BANDWIDTH */
@@ -467,6 +480,8 @@ struct rq {
 #ifdef CONFIG_SMP
 	struct llist_head wake_list;
 #endif
+
+	struct sched_avg avg;
 };
 
 static inline int cpu_of(struct rq *rq)
@@ -1212,4 +1227,3 @@ static inline u64 irq_time_read(int cpu)
 }
 #endif /* CONFIG_64BIT */
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-