diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-06-12 22:42:15 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-06-12 22:42:15 -0400 |
commit | b2e09f633a3994ee97fa6bc734b533d9c8e6ea0f (patch) | |
tree | 8f398d3f7ac19a4f4d64862086597f335d977203 /kernel | |
parent | 3737a12761636ebde0f09ef49daebb8eed18cc8a (diff) | |
parent | 535560d841b2d54f31280e05e9c6ffd19da0c4e7 (diff) |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull more scheduler updates from Ingo Molnar:
"Second round of scheduler changes:
- try-to-wakeup and IPI reduction speedups, from Andy Lutomirski
- continued power scheduling cleanups and refactorings, from Nicolas Pitre
- misc fixes and enhancements"
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/deadline: Delete extraneous extern for to_ratio()
  sched/idle: Optimize try-to-wake-up IPI
  sched/idle: Simplify wake_up_idle_cpu()
  sched/idle: Clear polling before descheduling the idle thread
  sched, trace: Add a tracepoint for IPI-less remote wakeups
  cpuidle: Set polling in poll_idle
  sched: Remove redundant assignment to "rt_rq" in update_curr_rt(...)
  sched: Rename capacity related flags
  sched: Final power vs. capacity cleanups
  sched: Remove remaining dubious usage of "power"
  sched: Let 'struct sched_group_power' care about CPU capacity
  sched/fair: Disambiguate existing/remaining "capacity" usage
  sched/fair: Change "has_capacity" to "has_free_capacity"
  sched/fair: Remove "power" from 'struct numa_stats'
  sched: Fix signedness bug in yield_to()
  sched/fair: Use time_after() in record_wakee()
  sched/balancing: Reduce the rate of needless idle load balancing
  sched/fair: Fix unlocked reads of some cfs_b->quota/period
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/sched/core.c | 182 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 2 | ||||
-rw-r--r-- | kernel/sched/fair.c | 390 | ||||
-rw-r--r-- | kernel/sched/features.h | 8 | ||||
-rw-r--r-- | kernel/sched/idle.c | 30 | ||||
-rw-r--r-- | kernel/sched/rt.c | 3 | ||||
-rw-r--r-- | kernel/sched/sched.h | 24 |
7 files changed, 353 insertions, 286 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4f611561ba4c..3bdf01b494fe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -535,7 +535,7 @@ static inline void init_hrtick(void) | |||
535 | __old; \ | 535 | __old; \ |
536 | }) | 536 | }) |
537 | 537 | ||
538 | #ifdef TIF_POLLING_NRFLAG | 538 | #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) |
539 | /* | 539 | /* |
540 | * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, | 540 | * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, |
541 | * this avoids any races wrt polling state changes and thereby avoids | 541 | * this avoids any races wrt polling state changes and thereby avoids |
@@ -546,12 +546,44 @@ static bool set_nr_and_not_polling(struct task_struct *p) | |||
546 | struct thread_info *ti = task_thread_info(p); | 546 | struct thread_info *ti = task_thread_info(p); |
547 | return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); | 547 | return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); |
548 | } | 548 | } |
549 | |||
550 | /* | ||
551 | * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. | ||
552 | * | ||
553 | * If this returns true, then the idle task promises to call | ||
554 | * sched_ttwu_pending() and reschedule soon. | ||
555 | */ | ||
556 | static bool set_nr_if_polling(struct task_struct *p) | ||
557 | { | ||
558 | struct thread_info *ti = task_thread_info(p); | ||
559 | typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags); | ||
560 | |||
561 | for (;;) { | ||
562 | if (!(val & _TIF_POLLING_NRFLAG)) | ||
563 | return false; | ||
564 | if (val & _TIF_NEED_RESCHED) | ||
565 | return true; | ||
566 | old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); | ||
567 | if (old == val) | ||
568 | break; | ||
569 | val = old; | ||
570 | } | ||
571 | return true; | ||
572 | } | ||
573 | |||
549 | #else | 574 | #else |
550 | static bool set_nr_and_not_polling(struct task_struct *p) | 575 | static bool set_nr_and_not_polling(struct task_struct *p) |
551 | { | 576 | { |
552 | set_tsk_need_resched(p); | 577 | set_tsk_need_resched(p); |
553 | return true; | 578 | return true; |
554 | } | 579 | } |
580 | |||
581 | #ifdef CONFIG_SMP | ||
582 | static bool set_nr_if_polling(struct task_struct *p) | ||
583 | { | ||
584 | return false; | ||
585 | } | ||
586 | #endif | ||
555 | #endif | 587 | #endif |
556 | 588 | ||
557 | /* | 589 | /* |
@@ -580,6 +612,8 @@ void resched_task(struct task_struct *p) | |||
580 | 612 | ||
581 | if (set_nr_and_not_polling(p)) | 613 | if (set_nr_and_not_polling(p)) |
582 | smp_send_reschedule(cpu); | 614 | smp_send_reschedule(cpu); |
615 | else | ||
616 | trace_sched_wake_idle_without_ipi(cpu); | ||
583 | } | 617 | } |
584 | 618 | ||
585 | void resched_cpu(int cpu) | 619 | void resched_cpu(int cpu) |
@@ -642,27 +676,10 @@ static void wake_up_idle_cpu(int cpu) | |||
642 | if (cpu == smp_processor_id()) | 676 | if (cpu == smp_processor_id()) |
643 | return; | 677 | return; |
644 | 678 | ||
645 | /* | 679 | if (set_nr_and_not_polling(rq->idle)) |
646 | * This is safe, as this function is called with the timer | ||
647 | * wheel base lock of (cpu) held. When the CPU is on the way | ||
648 | * to idle and has not yet set rq->curr to idle then it will | ||
649 | * be serialized on the timer wheel base lock and take the new | ||
650 | * timer into account automatically. | ||
651 | */ | ||
652 | if (rq->curr != rq->idle) | ||
653 | return; | ||
654 | |||
655 | /* | ||
656 | * We can set TIF_RESCHED on the idle task of the other CPU | ||
657 | * lockless. The worst case is that the other CPU runs the | ||
658 | * idle task through an additional NOOP schedule() | ||
659 | */ | ||
660 | set_tsk_need_resched(rq->idle); | ||
661 | |||
662 | /* NEED_RESCHED must be visible before we test polling */ | ||
663 | smp_mb(); | ||
664 | if (!tsk_is_polling(rq->idle)) | ||
665 | smp_send_reschedule(cpu); | 680 | smp_send_reschedule(cpu); |
681 | else | ||
682 | trace_sched_wake_idle_without_ipi(cpu); | ||
666 | } | 683 | } |
667 | 684 | ||
668 | static bool wake_up_full_nohz_cpu(int cpu) | 685 | static bool wake_up_full_nohz_cpu(int cpu) |
@@ -888,7 +905,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) | |||
888 | rq->clock_task += delta; | 905 | rq->clock_task += delta; |
889 | 906 | ||
890 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) | 907 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) |
891 | if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) | 908 | if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) |
892 | sched_rt_avg_update(rq, irq_delta + steal); | 909 | sched_rt_avg_update(rq, irq_delta + steal); |
893 | #endif | 910 | #endif |
894 | } | 911 | } |
@@ -1521,13 +1538,17 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) | |||
1521 | } | 1538 | } |
1522 | 1539 | ||
1523 | #ifdef CONFIG_SMP | 1540 | #ifdef CONFIG_SMP |
1524 | static void sched_ttwu_pending(void) | 1541 | void sched_ttwu_pending(void) |
1525 | { | 1542 | { |
1526 | struct rq *rq = this_rq(); | 1543 | struct rq *rq = this_rq(); |
1527 | struct llist_node *llist = llist_del_all(&rq->wake_list); | 1544 | struct llist_node *llist = llist_del_all(&rq->wake_list); |
1528 | struct task_struct *p; | 1545 | struct task_struct *p; |
1546 | unsigned long flags; | ||
1529 | 1547 | ||
1530 | raw_spin_lock(&rq->lock); | 1548 | if (!llist) |
1549 | return; | ||
1550 | |||
1551 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
1531 | 1552 | ||
1532 | while (llist) { | 1553 | while (llist) { |
1533 | p = llist_entry(llist, struct task_struct, wake_entry); | 1554 | p = llist_entry(llist, struct task_struct, wake_entry); |
@@ -1535,7 +1556,7 @@ static void sched_ttwu_pending(void) | |||
1535 | ttwu_do_activate(rq, p, 0); | 1556 | ttwu_do_activate(rq, p, 0); |
1536 | } | 1557 | } |
1537 | 1558 | ||
1538 | raw_spin_unlock(&rq->lock); | 1559 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
1539 | } | 1560 | } |
1540 | 1561 | ||
1541 | void scheduler_ipi(void) | 1562 | void scheduler_ipi(void) |
@@ -1581,8 +1602,14 @@ void scheduler_ipi(void) | |||
1581 | 1602 | ||
1582 | static void ttwu_queue_remote(struct task_struct *p, int cpu) | 1603 | static void ttwu_queue_remote(struct task_struct *p, int cpu) |
1583 | { | 1604 | { |
1584 | if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) | 1605 | struct rq *rq = cpu_rq(cpu); |
1585 | smp_send_reschedule(cpu); | 1606 | |
1607 | if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { | ||
1608 | if (!set_nr_if_polling(rq->idle)) | ||
1609 | smp_send_reschedule(cpu); | ||
1610 | else | ||
1611 | trace_sched_wake_idle_without_ipi(cpu); | ||
1612 | } | ||
1586 | } | 1613 | } |
1587 | 1614 | ||
1588 | bool cpus_share_cache(int this_cpu, int that_cpu) | 1615 | bool cpus_share_cache(int this_cpu, int that_cpu) |
@@ -4219,7 +4246,7 @@ EXPORT_SYMBOL(yield); | |||
4219 | * false (0) if we failed to boost the target. | 4246 | * false (0) if we failed to boost the target. |
4220 | * -ESRCH if there's no task to yield to. | 4247 | * -ESRCH if there's no task to yield to. |
4221 | */ | 4248 | */ |
4222 | bool __sched yield_to(struct task_struct *p, bool preempt) | 4249 | int __sched yield_to(struct task_struct *p, bool preempt) |
4223 | { | 4250 | { |
4224 | struct task_struct *curr = current; | 4251 | struct task_struct *curr = current; |
4225 | struct rq *rq, *p_rq; | 4252 | struct rq *rq, *p_rq; |
@@ -5245,14 +5272,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
5245 | } | 5272 | } |
5246 | 5273 | ||
5247 | /* | 5274 | /* |
5248 | * Even though we initialize ->power to something semi-sane, | 5275 | * Even though we initialize ->capacity to something semi-sane, |
5249 | * we leave power_orig unset. This allows us to detect if | 5276 | * we leave capacity_orig unset. This allows us to detect if |
5250 | * domain iteration is still funny without causing /0 traps. | 5277 | * domain iteration is still funny without causing /0 traps. |
5251 | */ | 5278 | */ |
5252 | if (!group->sgp->power_orig) { | 5279 | if (!group->sgc->capacity_orig) { |
5253 | printk(KERN_CONT "\n"); | 5280 | printk(KERN_CONT "\n"); |
5254 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 5281 | printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n"); |
5255 | "set\n"); | ||
5256 | break; | 5282 | break; |
5257 | } | 5283 | } |
5258 | 5284 | ||
@@ -5274,9 +5300,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
5274 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); | 5300 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); |
5275 | 5301 | ||
5276 | printk(KERN_CONT " %s", str); | 5302 | printk(KERN_CONT " %s", str); |
5277 | if (group->sgp->power != SCHED_POWER_SCALE) { | 5303 | if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { |
5278 | printk(KERN_CONT " (cpu_power = %d)", | 5304 | printk(KERN_CONT " (cpu_capacity = %d)", |
5279 | group->sgp->power); | 5305 | group->sgc->capacity); |
5280 | } | 5306 | } |
5281 | 5307 | ||
5282 | group = group->next; | 5308 | group = group->next; |
@@ -5334,7 +5360,7 @@ static int sd_degenerate(struct sched_domain *sd) | |||
5334 | SD_BALANCE_NEWIDLE | | 5360 | SD_BALANCE_NEWIDLE | |
5335 | SD_BALANCE_FORK | | 5361 | SD_BALANCE_FORK | |
5336 | SD_BALANCE_EXEC | | 5362 | SD_BALANCE_EXEC | |
5337 | SD_SHARE_CPUPOWER | | 5363 | SD_SHARE_CPUCAPACITY | |
5338 | SD_SHARE_PKG_RESOURCES | | 5364 | SD_SHARE_PKG_RESOURCES | |
5339 | SD_SHARE_POWERDOMAIN)) { | 5365 | SD_SHARE_POWERDOMAIN)) { |
5340 | if (sd->groups != sd->groups->next) | 5366 | if (sd->groups != sd->groups->next) |
@@ -5365,7 +5391,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
5365 | SD_BALANCE_NEWIDLE | | 5391 | SD_BALANCE_NEWIDLE | |
5366 | SD_BALANCE_FORK | | 5392 | SD_BALANCE_FORK | |
5367 | SD_BALANCE_EXEC | | 5393 | SD_BALANCE_EXEC | |
5368 | SD_SHARE_CPUPOWER | | 5394 | SD_SHARE_CPUCAPACITY | |
5369 | SD_SHARE_PKG_RESOURCES | | 5395 | SD_SHARE_PKG_RESOURCES | |
5370 | SD_PREFER_SIBLING | | 5396 | SD_PREFER_SIBLING | |
5371 | SD_SHARE_POWERDOMAIN); | 5397 | SD_SHARE_POWERDOMAIN); |
@@ -5490,7 +5516,7 @@ static struct root_domain *alloc_rootdomain(void) | |||
5490 | return rd; | 5516 | return rd; |
5491 | } | 5517 | } |
5492 | 5518 | ||
5493 | static void free_sched_groups(struct sched_group *sg, int free_sgp) | 5519 | static void free_sched_groups(struct sched_group *sg, int free_sgc) |
5494 | { | 5520 | { |
5495 | struct sched_group *tmp, *first; | 5521 | struct sched_group *tmp, *first; |
5496 | 5522 | ||
@@ -5501,8 +5527,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgp) | |||
5501 | do { | 5527 | do { |
5502 | tmp = sg->next; | 5528 | tmp = sg->next; |
5503 | 5529 | ||
5504 | if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) | 5530 | if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) |
5505 | kfree(sg->sgp); | 5531 | kfree(sg->sgc); |
5506 | 5532 | ||
5507 | kfree(sg); | 5533 | kfree(sg); |
5508 | sg = tmp; | 5534 | sg = tmp; |
@@ -5520,7 +5546,7 @@ static void free_sched_domain(struct rcu_head *rcu) | |||
5520 | if (sd->flags & SD_OVERLAP) { | 5546 | if (sd->flags & SD_OVERLAP) { |
5521 | free_sched_groups(sd->groups, 1); | 5547 | free_sched_groups(sd->groups, 1); |
5522 | } else if (atomic_dec_and_test(&sd->groups->ref)) { | 5548 | } else if (atomic_dec_and_test(&sd->groups->ref)) { |
5523 | kfree(sd->groups->sgp); | 5549 | kfree(sd->groups->sgc); |
5524 | kfree(sd->groups); | 5550 | kfree(sd->groups); |
5525 | } | 5551 | } |
5526 | kfree(sd); | 5552 | kfree(sd); |
@@ -5731,17 +5757,17 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
5731 | 5757 | ||
5732 | cpumask_or(covered, covered, sg_span); | 5758 | cpumask_or(covered, covered, sg_span); |
5733 | 5759 | ||
5734 | sg->sgp = *per_cpu_ptr(sdd->sgp, i); | 5760 | sg->sgc = *per_cpu_ptr(sdd->sgc, i); |
5735 | if (atomic_inc_return(&sg->sgp->ref) == 1) | 5761 | if (atomic_inc_return(&sg->sgc->ref) == 1) |
5736 | build_group_mask(sd, sg); | 5762 | build_group_mask(sd, sg); |
5737 | 5763 | ||
5738 | /* | 5764 | /* |
5739 | * Initialize sgp->power such that even if we mess up the | 5765 | * Initialize sgc->capacity such that even if we mess up the |
5740 | * domains and no possible iteration will get us here, we won't | 5766 | * domains and no possible iteration will get us here, we won't |
5741 | * die on a /0 trap. | 5767 | * die on a /0 trap. |
5742 | */ | 5768 | */ |
5743 | sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); | 5769 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); |
5744 | sg->sgp->power_orig = sg->sgp->power; | 5770 | sg->sgc->capacity_orig = sg->sgc->capacity; |
5745 | 5771 | ||
5746 | /* | 5772 | /* |
5747 | * Make sure the first group of this domain contains the | 5773 | * Make sure the first group of this domain contains the |
@@ -5779,8 +5805,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) | |||
5779 | 5805 | ||
5780 | if (sg) { | 5806 | if (sg) { |
5781 | *sg = *per_cpu_ptr(sdd->sg, cpu); | 5807 | *sg = *per_cpu_ptr(sdd->sg, cpu); |
5782 | (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); | 5808 | (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); |
5783 | atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ | 5809 | atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */ |
5784 | } | 5810 | } |
5785 | 5811 | ||
5786 | return cpu; | 5812 | return cpu; |
@@ -5789,7 +5815,7 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) | |||
5789 | /* | 5815 | /* |
5790 | * build_sched_groups will build a circular linked list of the groups | 5816 | * build_sched_groups will build a circular linked list of the groups |
5791 | * covered by the given span, and will set each group's ->cpumask correctly, | 5817 | * covered by the given span, and will set each group's ->cpumask correctly, |
5792 | * and ->cpu_power to 0. | 5818 | * and ->cpu_capacity to 0. |
5793 | * | 5819 | * |
5794 | * Assumes the sched_domain tree is fully constructed | 5820 | * Assumes the sched_domain tree is fully constructed |
5795 | */ | 5821 | */ |
@@ -5843,16 +5869,16 @@ build_sched_groups(struct sched_domain *sd, int cpu) | |||
5843 | } | 5869 | } |
5844 | 5870 | ||
5845 | /* | 5871 | /* |
5846 | * Initialize sched groups cpu_power. | 5872 | * Initialize sched groups cpu_capacity. |
5847 | * | 5873 | * |
5848 | * cpu_power indicates the capacity of sched group, which is used while | 5874 | * cpu_capacity indicates the capacity of sched group, which is used while |
5849 | * distributing the load between different sched groups in a sched domain. | 5875 | * distributing the load between different sched groups in a sched domain. |
5850 | * Typically cpu_power for all the groups in a sched domain will be same unless | 5876 | * Typically cpu_capacity for all the groups in a sched domain will be same |
5851 | * there are asymmetries in the topology. If there are asymmetries, group | 5877 | * unless there are asymmetries in the topology. If there are asymmetries, |
5852 | * having more cpu_power will pickup more load compared to the group having | 5878 | * group having more cpu_capacity will pickup more load compared to the |
5853 | * less cpu_power. | 5879 | * group having less cpu_capacity. |
5854 | */ | 5880 | */ |
5855 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) | 5881 | static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) |
5856 | { | 5882 | { |
5857 | struct sched_group *sg = sd->groups; | 5883 | struct sched_group *sg = sd->groups; |
5858 | 5884 | ||
@@ -5866,8 +5892,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
5866 | if (cpu != group_balance_cpu(sg)) | 5892 | if (cpu != group_balance_cpu(sg)) |
5867 | return; | 5893 | return; |
5868 | 5894 | ||
5869 | update_group_power(sd, cpu); | 5895 | update_group_capacity(sd, cpu); |
5870 | atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); | 5896 | atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); |
5871 | } | 5897 | } |
5872 | 5898 | ||
5873 | /* | 5899 | /* |
@@ -5958,8 +5984,8 @@ static void claim_allocations(int cpu, struct sched_domain *sd) | |||
5958 | if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) | 5984 | if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) |
5959 | *per_cpu_ptr(sdd->sg, cpu) = NULL; | 5985 | *per_cpu_ptr(sdd->sg, cpu) = NULL; |
5960 | 5986 | ||
5961 | if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) | 5987 | if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) |
5962 | *per_cpu_ptr(sdd->sgp, cpu) = NULL; | 5988 | *per_cpu_ptr(sdd->sgc, cpu) = NULL; |
5963 | } | 5989 | } |
5964 | 5990 | ||
5965 | #ifdef CONFIG_NUMA | 5991 | #ifdef CONFIG_NUMA |
@@ -5972,7 +5998,7 @@ static int sched_domains_curr_level; | |||
5972 | /* | 5998 | /* |
5973 | * SD_flags allowed in topology descriptions. | 5999 | * SD_flags allowed in topology descriptions. |
5974 | * | 6000 | * |
5975 | * SD_SHARE_CPUPOWER - describes SMT topologies | 6001 | * SD_SHARE_CPUCAPACITY - describes SMT topologies |
5976 | * SD_SHARE_PKG_RESOURCES - describes shared caches | 6002 | * SD_SHARE_PKG_RESOURCES - describes shared caches |
5977 | * SD_NUMA - describes NUMA topologies | 6003 | * SD_NUMA - describes NUMA topologies |
5978 | * SD_SHARE_POWERDOMAIN - describes shared power domain | 6004 | * SD_SHARE_POWERDOMAIN - describes shared power domain |
@@ -5981,7 +6007,7 @@ static int sched_domains_curr_level; | |||
5981 | * SD_ASYM_PACKING - describes SMT quirks | 6007 | * SD_ASYM_PACKING - describes SMT quirks |
5982 | */ | 6008 | */ |
5983 | #define TOPOLOGY_SD_FLAGS \ | 6009 | #define TOPOLOGY_SD_FLAGS \ |
5984 | (SD_SHARE_CPUPOWER | \ | 6010 | (SD_SHARE_CPUCAPACITY | \ |
5985 | SD_SHARE_PKG_RESOURCES | \ | 6011 | SD_SHARE_PKG_RESOURCES | \ |
5986 | SD_NUMA | \ | 6012 | SD_NUMA | \ |
5987 | SD_ASYM_PACKING | \ | 6013 | SD_ASYM_PACKING | \ |
@@ -6027,7 +6053,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) | |||
6027 | | 1*SD_BALANCE_FORK | 6053 | | 1*SD_BALANCE_FORK |
6028 | | 0*SD_BALANCE_WAKE | 6054 | | 0*SD_BALANCE_WAKE |
6029 | | 1*SD_WAKE_AFFINE | 6055 | | 1*SD_WAKE_AFFINE |
6030 | | 0*SD_SHARE_CPUPOWER | 6056 | | 0*SD_SHARE_CPUCAPACITY |
6031 | | 0*SD_SHARE_PKG_RESOURCES | 6057 | | 0*SD_SHARE_PKG_RESOURCES |
6032 | | 0*SD_SERIALIZE | 6058 | | 0*SD_SERIALIZE |
6033 | | 0*SD_PREFER_SIBLING | 6059 | | 0*SD_PREFER_SIBLING |
@@ -6049,7 +6075,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) | |||
6049 | * Convert topological properties into behaviour. | 6075 | * Convert topological properties into behaviour. |
6050 | */ | 6076 | */ |
6051 | 6077 | ||
6052 | if (sd->flags & SD_SHARE_CPUPOWER) { | 6078 | if (sd->flags & SD_SHARE_CPUCAPACITY) { |
6053 | sd->imbalance_pct = 110; | 6079 | sd->imbalance_pct = 110; |
6054 | sd->smt_gain = 1178; /* ~15% */ | 6080 | sd->smt_gain = 1178; /* ~15% */ |
6055 | 6081 | ||
@@ -6361,14 +6387,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
6361 | if (!sdd->sg) | 6387 | if (!sdd->sg) |
6362 | return -ENOMEM; | 6388 | return -ENOMEM; |
6363 | 6389 | ||
6364 | sdd->sgp = alloc_percpu(struct sched_group_power *); | 6390 | sdd->sgc = alloc_percpu(struct sched_group_capacity *); |
6365 | if (!sdd->sgp) | 6391 | if (!sdd->sgc) |
6366 | return -ENOMEM; | 6392 | return -ENOMEM; |
6367 | 6393 | ||
6368 | for_each_cpu(j, cpu_map) { | 6394 | for_each_cpu(j, cpu_map) { |
6369 | struct sched_domain *sd; | 6395 | struct sched_domain *sd; |
6370 | struct sched_group *sg; | 6396 | struct sched_group *sg; |
6371 | struct sched_group_power *sgp; | 6397 | struct sched_group_capacity *sgc; |
6372 | 6398 | ||
6373 | sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), | 6399 | sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), |
6374 | GFP_KERNEL, cpu_to_node(j)); | 6400 | GFP_KERNEL, cpu_to_node(j)); |
@@ -6386,12 +6412,12 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
6386 | 6412 | ||
6387 | *per_cpu_ptr(sdd->sg, j) = sg; | 6413 | *per_cpu_ptr(sdd->sg, j) = sg; |
6388 | 6414 | ||
6389 | sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), | 6415 | sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), |
6390 | GFP_KERNEL, cpu_to_node(j)); | 6416 | GFP_KERNEL, cpu_to_node(j)); |
6391 | if (!sgp) | 6417 | if (!sgc) |
6392 | return -ENOMEM; | 6418 | return -ENOMEM; |
6393 | 6419 | ||
6394 | *per_cpu_ptr(sdd->sgp, j) = sgp; | 6420 | *per_cpu_ptr(sdd->sgc, j) = sgc; |
6395 | } | 6421 | } |
6396 | } | 6422 | } |
6397 | 6423 | ||
@@ -6418,15 +6444,15 @@ static void __sdt_free(const struct cpumask *cpu_map) | |||
6418 | 6444 | ||
6419 | if (sdd->sg) | 6445 | if (sdd->sg) |
6420 | kfree(*per_cpu_ptr(sdd->sg, j)); | 6446 | kfree(*per_cpu_ptr(sdd->sg, j)); |
6421 | if (sdd->sgp) | 6447 | if (sdd->sgc) |
6422 | kfree(*per_cpu_ptr(sdd->sgp, j)); | 6448 | kfree(*per_cpu_ptr(sdd->sgc, j)); |
6423 | } | 6449 | } |
6424 | free_percpu(sdd->sd); | 6450 | free_percpu(sdd->sd); |
6425 | sdd->sd = NULL; | 6451 | sdd->sd = NULL; |
6426 | free_percpu(sdd->sg); | 6452 | free_percpu(sdd->sg); |
6427 | sdd->sg = NULL; | 6453 | sdd->sg = NULL; |
6428 | free_percpu(sdd->sgp); | 6454 | free_percpu(sdd->sgc); |
6429 | sdd->sgp = NULL; | 6455 | sdd->sgc = NULL; |
6430 | } | 6456 | } |
6431 | } | 6457 | } |
6432 | 6458 | ||
@@ -6496,14 +6522,14 @@ static int build_sched_domains(const struct cpumask *cpu_map, | |||
6496 | } | 6522 | } |
6497 | } | 6523 | } |
6498 | 6524 | ||
6499 | /* Calculate CPU power for physical packages and nodes */ | 6525 | /* Calculate CPU capacity for physical packages and nodes */ |
6500 | for (i = nr_cpumask_bits-1; i >= 0; i--) { | 6526 | for (i = nr_cpumask_bits-1; i >= 0; i--) { |
6501 | if (!cpumask_test_cpu(i, cpu_map)) | 6527 | if (!cpumask_test_cpu(i, cpu_map)) |
6502 | continue; | 6528 | continue; |
6503 | 6529 | ||
6504 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { | 6530 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { |
6505 | claim_allocations(i, sd); | 6531 | claim_allocations(i, sd); |
6506 | init_sched_groups_power(i, sd); | 6532 | init_sched_groups_capacity(i, sd); |
6507 | } | 6533 | } |
6508 | } | 6534 | } |
6509 | 6535 | ||
@@ -6946,7 +6972,7 @@ void __init sched_init(void) | |||
6946 | #ifdef CONFIG_SMP | 6972 | #ifdef CONFIG_SMP |
6947 | rq->sd = NULL; | 6973 | rq->sd = NULL; |
6948 | rq->rd = NULL; | 6974 | rq->rd = NULL; |
6949 | rq->cpu_power = SCHED_POWER_SCALE; | 6975 | rq->cpu_capacity = SCHED_CAPACITY_SCALE; |
6950 | rq->post_schedule = 0; | 6976 | rq->post_schedule = 0; |
6951 | rq->active_balance = 0; | 6977 | rq->active_balance = 0; |
6952 | rq->next_balance = jiffies; | 6978 | rq->next_balance = jiffies; |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 2b8cbf09d1a4..fc4f98b1258f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
@@ -57,8 +57,6 @@ void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) | |||
57 | dl_b->dl_runtime = runtime; | 57 | dl_b->dl_runtime = runtime; |
58 | } | 58 | } |
59 | 59 | ||
60 | extern unsigned long to_ratio(u64 period, u64 runtime); | ||
61 | |||
62 | void init_dl_bw(struct dl_bw *dl_b) | 60 | void init_dl_bw(struct dl_bw *dl_b) |
63 | { | 61 | { |
64 | raw_spin_lock_init(&dl_b->lock); | 62 | raw_spin_lock_init(&dl_b->lock); |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9855e87d671a..fea7d3335e1f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -1017,7 +1017,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, | |||
1017 | static unsigned long weighted_cpuload(const int cpu); | 1017 | static unsigned long weighted_cpuload(const int cpu); |
1018 | static unsigned long source_load(int cpu, int type); | 1018 | static unsigned long source_load(int cpu, int type); |
1019 | static unsigned long target_load(int cpu, int type); | 1019 | static unsigned long target_load(int cpu, int type); |
1020 | static unsigned long power_of(int cpu); | 1020 | static unsigned long capacity_of(int cpu); |
1021 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg); | 1021 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg); |
1022 | 1022 | ||
1023 | /* Cached statistics for all CPUs within a node */ | 1023 | /* Cached statistics for all CPUs within a node */ |
@@ -1026,11 +1026,11 @@ struct numa_stats { | |||
1026 | unsigned long load; | 1026 | unsigned long load; |
1027 | 1027 | ||
1028 | /* Total compute capacity of CPUs on a node */ | 1028 | /* Total compute capacity of CPUs on a node */ |
1029 | unsigned long power; | 1029 | unsigned long compute_capacity; |
1030 | 1030 | ||
1031 | /* Approximate capacity in terms of runnable tasks on a node */ | 1031 | /* Approximate capacity in terms of runnable tasks on a node */ |
1032 | unsigned long capacity; | 1032 | unsigned long task_capacity; |
1033 | int has_capacity; | 1033 | int has_free_capacity; |
1034 | }; | 1034 | }; |
1035 | 1035 | ||
1036 | /* | 1036 | /* |
@@ -1046,7 +1046,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) | |||
1046 | 1046 | ||
1047 | ns->nr_running += rq->nr_running; | 1047 | ns->nr_running += rq->nr_running; |
1048 | ns->load += weighted_cpuload(cpu); | 1048 | ns->load += weighted_cpuload(cpu); |
1049 | ns->power += power_of(cpu); | 1049 | ns->compute_capacity += capacity_of(cpu); |
1050 | 1050 | ||
1051 | cpus++; | 1051 | cpus++; |
1052 | } | 1052 | } |
@@ -1056,15 +1056,16 @@ static void update_numa_stats(struct numa_stats *ns, int nid) | |||
1056 | * the @ns structure is NULL'ed and task_numa_compare() will | 1056 | * the @ns structure is NULL'ed and task_numa_compare() will |
1057 | * not find this node attractive. | 1057 | * not find this node attractive. |
1058 | * | 1058 | * |
1059 | * We'll either bail at !has_capacity, or we'll detect a huge imbalance | 1059 | * We'll either bail at !has_free_capacity, or we'll detect a huge |
1060 | * and bail there. | 1060 | * imbalance and bail there. |
1061 | */ | 1061 | */ |
1062 | if (!cpus) | 1062 | if (!cpus) |
1063 | return; | 1063 | return; |
1064 | 1064 | ||
1065 | ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power; | 1065 | ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity; |
1066 | ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE); | 1066 | ns->task_capacity = |
1067 | ns->has_capacity = (ns->nr_running < ns->capacity); | 1067 | DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); |
1068 | ns->has_free_capacity = (ns->nr_running < ns->task_capacity); | ||
1068 | } | 1069 | } |
1069 | 1070 | ||
1070 | struct task_numa_env { | 1071 | struct task_numa_env { |
@@ -1195,8 +1196,8 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1195 | 1196 | ||
1196 | if (!cur) { | 1197 | if (!cur) { |
1197 | /* Is there capacity at our destination? */ | 1198 | /* Is there capacity at our destination? */ |
1198 | if (env->src_stats.has_capacity && | 1199 | if (env->src_stats.has_free_capacity && |
1199 | !env->dst_stats.has_capacity) | 1200 | !env->dst_stats.has_free_capacity) |
1200 | goto unlock; | 1201 | goto unlock; |
1201 | 1202 | ||
1202 | goto balance; | 1203 | goto balance; |
@@ -1213,7 +1214,7 @@ balance: | |||
1213 | orig_dst_load = env->dst_stats.load; | 1214 | orig_dst_load = env->dst_stats.load; |
1214 | orig_src_load = env->src_stats.load; | 1215 | orig_src_load = env->src_stats.load; |
1215 | 1216 | ||
1216 | /* XXX missing power terms */ | 1217 | /* XXX missing capacity terms */ |
1217 | load = task_h_load(env->p); | 1218 | load = task_h_load(env->p); |
1218 | dst_load = orig_dst_load + load; | 1219 | dst_load = orig_dst_load + load; |
1219 | src_load = orig_src_load - load; | 1220 | src_load = orig_src_load - load; |
@@ -1301,8 +1302,8 @@ static int task_numa_migrate(struct task_struct *p) | |||
1301 | groupimp = group_weight(p, env.dst_nid) - groupweight; | 1302 | groupimp = group_weight(p, env.dst_nid) - groupweight; |
1302 | update_numa_stats(&env.dst_stats, env.dst_nid); | 1303 | update_numa_stats(&env.dst_stats, env.dst_nid); |
1303 | 1304 | ||
1304 | /* If the preferred nid has capacity, try to use it. */ | 1305 | /* If the preferred nid has free capacity, try to use it. */ |
1305 | if (env.dst_stats.has_capacity) | 1306 | if (env.dst_stats.has_free_capacity) |
1306 | task_numa_find_cpu(&env, taskimp, groupimp); | 1307 | task_numa_find_cpu(&env, taskimp, groupimp); |
1307 | 1308 | ||
1308 | /* No space available on the preferred nid. Look elsewhere. */ | 1309 | /* No space available on the preferred nid. Look elsewhere. */ |
@@ -3225,10 +3226,12 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) | |||
3225 | * has not truly expired. | 3226 | * has not truly expired. |
3226 | * | 3227 | * |
3227 | * Fortunately we can check determine whether this the case by checking | 3228 | * Fortunately we can check determine whether this the case by checking |
3228 | * whether the global deadline has advanced. | 3229 | * whether the global deadline has advanced. It is valid to compare |
3230 | * cfs_b->runtime_expires without any locks since we only care about | ||
3231 | * exact equality, so a partial write will still work. | ||
3229 | */ | 3232 | */ |
3230 | 3233 | ||
3231 | if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) { | 3234 | if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { |
3232 | /* extend local deadline, drift is bounded above by 2 ticks */ | 3235 | /* extend local deadline, drift is bounded above by 2 ticks */ |
3233 | cfs_rq->runtime_expires += TICK_NSEC; | 3236 | cfs_rq->runtime_expires += TICK_NSEC; |
3234 | } else { | 3237 | } else { |
@@ -3457,21 +3460,21 @@ next: | |||
3457 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) | 3460 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) |
3458 | { | 3461 | { |
3459 | u64 runtime, runtime_expires; | 3462 | u64 runtime, runtime_expires; |
3460 | int idle = 1, throttled; | 3463 | int throttled; |
3461 | 3464 | ||
3462 | raw_spin_lock(&cfs_b->lock); | ||
3463 | /* no need to continue the timer with no bandwidth constraint */ | 3465 | /* no need to continue the timer with no bandwidth constraint */ |
3464 | if (cfs_b->quota == RUNTIME_INF) | 3466 | if (cfs_b->quota == RUNTIME_INF) |
3465 | goto out_unlock; | 3467 | goto out_deactivate; |
3466 | 3468 | ||
3467 | throttled = !list_empty(&cfs_b->throttled_cfs_rq); | 3469 | throttled = !list_empty(&cfs_b->throttled_cfs_rq); |
3468 | /* idle depends on !throttled (for the case of a large deficit) */ | ||
3469 | idle = cfs_b->idle && !throttled; | ||
3470 | cfs_b->nr_periods += overrun; | 3470 | cfs_b->nr_periods += overrun; |
3471 | 3471 | ||
3472 | /* if we're going inactive then everything else can be deferred */ | 3472 | /* |
3473 | if (idle) | 3473 | * idle depends on !throttled (for the case of a large deficit), and if |
3474 | goto out_unlock; | 3474 | * we're going inactive then everything else can be deferred |
3475 | */ | ||
3476 | if (cfs_b->idle && !throttled) | ||
3477 | goto out_deactivate; | ||
3475 | 3478 | ||
3476 | /* | 3479 | /* |
3477 | * if we have relooped after returning idle once, we need to update our | 3480 | * if we have relooped after returning idle once, we need to update our |
@@ -3485,7 +3488,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) | |||
3485 | if (!throttled) { | 3488 | if (!throttled) { |
3486 | /* mark as potentially idle for the upcoming period */ | 3489 | /* mark as potentially idle for the upcoming period */ |
3487 | cfs_b->idle = 1; | 3490 | cfs_b->idle = 1; |
3488 | goto out_unlock; | 3491 | return 0; |
3489 | } | 3492 | } |
3490 | 3493 | ||
3491 | /* account preceding periods in which throttling occurred */ | 3494 | /* account preceding periods in which throttling occurred */ |
@@ -3525,12 +3528,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) | |||
3525 | * timer to remain active while there are any throttled entities.) | 3528 | * timer to remain active while there are any throttled entities.) |
3526 | */ | 3529 | */ |
3527 | cfs_b->idle = 0; | 3530 | cfs_b->idle = 0; |
3528 | out_unlock: | ||
3529 | if (idle) | ||
3530 | cfs_b->timer_active = 0; | ||
3531 | raw_spin_unlock(&cfs_b->lock); | ||
3532 | 3531 | ||
3533 | return idle; | 3532 | return 0; |
3533 | |||
3534 | out_deactivate: | ||
3535 | cfs_b->timer_active = 0; | ||
3536 | return 1; | ||
3534 | } | 3537 | } |
3535 | 3538 | ||
3536 | /* a cfs_rq won't donate quota below this amount */ | 3539 | /* a cfs_rq won't donate quota below this amount */ |
@@ -3707,6 +3710,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) | |||
3707 | int overrun; | 3710 | int overrun; |
3708 | int idle = 0; | 3711 | int idle = 0; |
3709 | 3712 | ||
3713 | raw_spin_lock(&cfs_b->lock); | ||
3710 | for (;;) { | 3714 | for (;;) { |
3711 | now = hrtimer_cb_get_time(timer); | 3715 | now = hrtimer_cb_get_time(timer); |
3712 | overrun = hrtimer_forward(timer, now, cfs_b->period); | 3716 | overrun = hrtimer_forward(timer, now, cfs_b->period); |
@@ -3716,6 +3720,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) | |||
3716 | 3720 | ||
3717 | idle = do_sched_cfs_period_timer(cfs_b, overrun); | 3721 | idle = do_sched_cfs_period_timer(cfs_b, overrun); |
3718 | } | 3722 | } |
3723 | raw_spin_unlock(&cfs_b->lock); | ||
3719 | 3724 | ||
3720 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | 3725 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; |
3721 | } | 3726 | } |
@@ -3775,8 +3780,6 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) | |||
3775 | struct cfs_rq *cfs_rq; | 3780 | struct cfs_rq *cfs_rq; |
3776 | 3781 | ||
3777 | for_each_leaf_cfs_rq(rq, cfs_rq) { | 3782 | for_each_leaf_cfs_rq(rq, cfs_rq) { |
3778 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
3779 | |||
3780 | if (!cfs_rq->runtime_enabled) | 3783 | if (!cfs_rq->runtime_enabled) |
3781 | continue; | 3784 | continue; |
3782 | 3785 | ||
@@ -3784,7 +3787,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) | |||
3784 | * clock_task is not advancing so we just need to make sure | 3787 | * clock_task is not advancing so we just need to make sure |
3785 | * there's some valid quota amount | 3788 | * there's some valid quota amount |
3786 | */ | 3789 | */ |
3787 | cfs_rq->runtime_remaining = cfs_b->quota; | 3790 | cfs_rq->runtime_remaining = 1; |
3788 | if (cfs_rq_throttled(cfs_rq)) | 3791 | if (cfs_rq_throttled(cfs_rq)) |
3789 | unthrottle_cfs_rq(cfs_rq); | 3792 | unthrottle_cfs_rq(cfs_rq); |
3790 | } | 3793 | } |
@@ -4041,9 +4044,9 @@ static unsigned long target_load(int cpu, int type) | |||
4041 | return max(rq->cpu_load[type-1], total); | 4044 | return max(rq->cpu_load[type-1], total); |
4042 | } | 4045 | } |
4043 | 4046 | ||
4044 | static unsigned long power_of(int cpu) | 4047 | static unsigned long capacity_of(int cpu) |
4045 | { | 4048 | { |
4046 | return cpu_rq(cpu)->cpu_power; | 4049 | return cpu_rq(cpu)->cpu_capacity; |
4047 | } | 4050 | } |
4048 | 4051 | ||
4049 | static unsigned long cpu_avg_load_per_task(int cpu) | 4052 | static unsigned long cpu_avg_load_per_task(int cpu) |
@@ -4065,7 +4068,7 @@ static void record_wakee(struct task_struct *p) | |||
4065 | * about the boundary, really active task won't care | 4068 | * about the boundary, really active task won't care |
4066 | * about the loss. | 4069 | * about the loss. |
4067 | */ | 4070 | */ |
4068 | if (jiffies > current->wakee_flip_decay_ts + HZ) { | 4071 | if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { |
4069 | current->wakee_flips >>= 1; | 4072 | current->wakee_flips >>= 1; |
4070 | current->wakee_flip_decay_ts = jiffies; | 4073 | current->wakee_flip_decay_ts = jiffies; |
4071 | } | 4074 | } |
@@ -4286,12 +4289,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
4286 | s64 this_eff_load, prev_eff_load; | 4289 | s64 this_eff_load, prev_eff_load; |
4287 | 4290 | ||
4288 | this_eff_load = 100; | 4291 | this_eff_load = 100; |
4289 | this_eff_load *= power_of(prev_cpu); | 4292 | this_eff_load *= capacity_of(prev_cpu); |
4290 | this_eff_load *= this_load + | 4293 | this_eff_load *= this_load + |
4291 | effective_load(tg, this_cpu, weight, weight); | 4294 | effective_load(tg, this_cpu, weight, weight); |
4292 | 4295 | ||
4293 | prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; | 4296 | prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; |
4294 | prev_eff_load *= power_of(this_cpu); | 4297 | prev_eff_load *= capacity_of(this_cpu); |
4295 | prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); | 4298 | prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); |
4296 | 4299 | ||
4297 | balanced = this_eff_load <= prev_eff_load; | 4300 | balanced = this_eff_load <= prev_eff_load; |
@@ -4367,8 +4370,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
4367 | avg_load += load; | 4370 | avg_load += load; |
4368 | } | 4371 | } |
4369 | 4372 | ||
4370 | /* Adjust by relative CPU power of the group */ | 4373 | /* Adjust by relative CPU capacity of the group */ |
4371 | avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; | 4374 | avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity; |
4372 | 4375 | ||
4373 | if (local_group) { | 4376 | if (local_group) { |
4374 | this_load = avg_load; | 4377 | this_load = avg_load; |
@@ -4948,14 +4951,14 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
4948 | * | 4951 | * |
4949 | * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) | 4952 | * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) |
4950 | * | 4953 | * |
4951 | * P_i is the cpu power (or compute capacity) of cpu i, typically it is the | 4954 | * C_i is the compute capacity of cpu i, typically it is the |
4952 | * fraction of 'recent' time available for SCHED_OTHER task execution. But it | 4955 | * fraction of 'recent' time available for SCHED_OTHER task execution. But it |
4953 | * can also include other factors [XXX]. | 4956 | * can also include other factors [XXX]. |
4954 | * | 4957 | * |
4955 | * To achieve this balance we define a measure of imbalance which follows | 4958 | * To achieve this balance we define a measure of imbalance which follows |
4956 | * directly from (1): | 4959 | * directly from (1): |
4957 | * | 4960 | * |
4958 | * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4) | 4961 | * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4) |
4959 | * | 4962 | * |
4960 | * We them move tasks around to minimize the imbalance. In the continuous | 4963 | * We them move tasks around to minimize the imbalance. In the continuous |
4961 | * function space it is obvious this converges, in the discrete case we get | 4964 | * function space it is obvious this converges, in the discrete case we get |
@@ -5530,13 +5533,13 @@ struct sg_lb_stats { | |||
5530 | unsigned long group_load; /* Total load over the CPUs of the group */ | 5533 | unsigned long group_load; /* Total load over the CPUs of the group */ |
5531 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 5534 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ |
5532 | unsigned long load_per_task; | 5535 | unsigned long load_per_task; |
5533 | unsigned long group_power; | 5536 | unsigned long group_capacity; |
5534 | unsigned int sum_nr_running; /* Nr tasks running in the group */ | 5537 | unsigned int sum_nr_running; /* Nr tasks running in the group */ |
5535 | unsigned int group_capacity; | 5538 | unsigned int group_capacity_factor; |
5536 | unsigned int idle_cpus; | 5539 | unsigned int idle_cpus; |
5537 | unsigned int group_weight; | 5540 | unsigned int group_weight; |
5538 | int group_imb; /* Is there an imbalance in the group ? */ | 5541 | int group_imb; /* Is there an imbalance in the group ? */ |
5539 | int group_has_capacity; /* Is there extra capacity in the group? */ | 5542 | int group_has_free_capacity; |
5540 | #ifdef CONFIG_NUMA_BALANCING | 5543 | #ifdef CONFIG_NUMA_BALANCING |
5541 | unsigned int nr_numa_running; | 5544 | unsigned int nr_numa_running; |
5542 | unsigned int nr_preferred_running; | 5545 | unsigned int nr_preferred_running; |
@@ -5551,7 +5554,7 @@ struct sd_lb_stats { | |||
5551 | struct sched_group *busiest; /* Busiest group in this sd */ | 5554 | struct sched_group *busiest; /* Busiest group in this sd */ |
5552 | struct sched_group *local; /* Local group in this sd */ | 5555 | struct sched_group *local; /* Local group in this sd */ |
5553 | unsigned long total_load; /* Total load of all groups in sd */ | 5556 | unsigned long total_load; /* Total load of all groups in sd */ |
5554 | unsigned long total_pwr; /* Total power of all groups in sd */ | 5557 | unsigned long total_capacity; /* Total capacity of all groups in sd */ |
5555 | unsigned long avg_load; /* Average load across all groups in sd */ | 5558 | unsigned long avg_load; /* Average load across all groups in sd */ |
5556 | 5559 | ||
5557 | struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ | 5560 | struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ |
@@ -5570,7 +5573,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) | |||
5570 | .busiest = NULL, | 5573 | .busiest = NULL, |
5571 | .local = NULL, | 5574 | .local = NULL, |
5572 | .total_load = 0UL, | 5575 | .total_load = 0UL, |
5573 | .total_pwr = 0UL, | 5576 | .total_capacity = 0UL, |
5574 | .busiest_stat = { | 5577 | .busiest_stat = { |
5575 | .avg_load = 0UL, | 5578 | .avg_load = 0UL, |
5576 | }, | 5579 | }, |
@@ -5605,17 +5608,17 @@ static inline int get_sd_load_idx(struct sched_domain *sd, | |||
5605 | return load_idx; | 5608 | return load_idx; |
5606 | } | 5609 | } |
5607 | 5610 | ||
5608 | static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | 5611 | static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) |
5609 | { | 5612 | { |
5610 | return SCHED_POWER_SCALE; | 5613 | return SCHED_CAPACITY_SCALE; |
5611 | } | 5614 | } |
5612 | 5615 | ||
5613 | unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) | 5616 | unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) |
5614 | { | 5617 | { |
5615 | return default_scale_freq_power(sd, cpu); | 5618 | return default_scale_capacity(sd, cpu); |
5616 | } | 5619 | } |
5617 | 5620 | ||
5618 | static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) | 5621 | static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu) |
5619 | { | 5622 | { |
5620 | unsigned long weight = sd->span_weight; | 5623 | unsigned long weight = sd->span_weight; |
5621 | unsigned long smt_gain = sd->smt_gain; | 5624 | unsigned long smt_gain = sd->smt_gain; |
@@ -5625,12 +5628,12 @@ static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) | |||
5625 | return smt_gain; | 5628 | return smt_gain; |
5626 | } | 5629 | } |
5627 | 5630 | ||
5628 | unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | 5631 | unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) |
5629 | { | 5632 | { |
5630 | return default_scale_smt_power(sd, cpu); | 5633 | return default_scale_smt_capacity(sd, cpu); |
5631 | } | 5634 | } |
5632 | 5635 | ||
5633 | static unsigned long scale_rt_power(int cpu) | 5636 | static unsigned long scale_rt_capacity(int cpu) |
5634 | { | 5637 | { |
5635 | struct rq *rq = cpu_rq(cpu); | 5638 | struct rq *rq = cpu_rq(cpu); |
5636 | u64 total, available, age_stamp, avg; | 5639 | u64 total, available, age_stamp, avg; |
@@ -5650,71 +5653,71 @@ static unsigned long scale_rt_power(int cpu) | |||
5650 | total = sched_avg_period() + delta; | 5653 | total = sched_avg_period() + delta; |
5651 | 5654 | ||
5652 | if (unlikely(total < avg)) { | 5655 | if (unlikely(total < avg)) { |
5653 | /* Ensures that power won't end up being negative */ | 5656 | /* Ensures that capacity won't end up being negative */ |
5654 | available = 0; | 5657 | available = 0; |
5655 | } else { | 5658 | } else { |
5656 | available = total - avg; | 5659 | available = total - avg; |
5657 | } | 5660 | } |
5658 | 5661 | ||
5659 | if (unlikely((s64)total < SCHED_POWER_SCALE)) | 5662 | if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) |
5660 | total = SCHED_POWER_SCALE; | 5663 | total = SCHED_CAPACITY_SCALE; |
5661 | 5664 | ||
5662 | total >>= SCHED_POWER_SHIFT; | 5665 | total >>= SCHED_CAPACITY_SHIFT; |
5663 | 5666 | ||
5664 | return div_u64(available, total); | 5667 | return div_u64(available, total); |
5665 | } | 5668 | } |
5666 | 5669 | ||
5667 | static void update_cpu_power(struct sched_domain *sd, int cpu) | 5670 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) |
5668 | { | 5671 | { |
5669 | unsigned long weight = sd->span_weight; | 5672 | unsigned long weight = sd->span_weight; |
5670 | unsigned long power = SCHED_POWER_SCALE; | 5673 | unsigned long capacity = SCHED_CAPACITY_SCALE; |
5671 | struct sched_group *sdg = sd->groups; | 5674 | struct sched_group *sdg = sd->groups; |
5672 | 5675 | ||
5673 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | 5676 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { |
5674 | if (sched_feat(ARCH_POWER)) | 5677 | if (sched_feat(ARCH_CAPACITY)) |
5675 | power *= arch_scale_smt_power(sd, cpu); | 5678 | capacity *= arch_scale_smt_capacity(sd, cpu); |
5676 | else | 5679 | else |
5677 | power *= default_scale_smt_power(sd, cpu); | 5680 | capacity *= default_scale_smt_capacity(sd, cpu); |
5678 | 5681 | ||
5679 | power >>= SCHED_POWER_SHIFT; | 5682 | capacity >>= SCHED_CAPACITY_SHIFT; |
5680 | } | 5683 | } |
5681 | 5684 | ||
5682 | sdg->sgp->power_orig = power; | 5685 | sdg->sgc->capacity_orig = capacity; |
5683 | 5686 | ||
5684 | if (sched_feat(ARCH_POWER)) | 5687 | if (sched_feat(ARCH_CAPACITY)) |
5685 | power *= arch_scale_freq_power(sd, cpu); | 5688 | capacity *= arch_scale_freq_capacity(sd, cpu); |
5686 | else | 5689 | else |
5687 | power *= default_scale_freq_power(sd, cpu); | 5690 | capacity *= default_scale_capacity(sd, cpu); |
5688 | 5691 | ||
5689 | power >>= SCHED_POWER_SHIFT; | 5692 | capacity >>= SCHED_CAPACITY_SHIFT; |
5690 | 5693 | ||
5691 | power *= scale_rt_power(cpu); | 5694 | capacity *= scale_rt_capacity(cpu); |
5692 | power >>= SCHED_POWER_SHIFT; | 5695 | capacity >>= SCHED_CAPACITY_SHIFT; |
5693 | 5696 | ||
5694 | if (!power) | 5697 | if (!capacity) |
5695 | power = 1; | 5698 | capacity = 1; |
5696 | 5699 | ||
5697 | cpu_rq(cpu)->cpu_power = power; | 5700 | cpu_rq(cpu)->cpu_capacity = capacity; |
5698 | sdg->sgp->power = power; | 5701 | sdg->sgc->capacity = capacity; |
5699 | } | 5702 | } |
5700 | 5703 | ||
5701 | void update_group_power(struct sched_domain *sd, int cpu) | 5704 | void update_group_capacity(struct sched_domain *sd, int cpu) |
5702 | { | 5705 | { |
5703 | struct sched_domain *child = sd->child; | 5706 | struct sched_domain *child = sd->child; |
5704 | struct sched_group *group, *sdg = sd->groups; | 5707 | struct sched_group *group, *sdg = sd->groups; |
5705 | unsigned long power, power_orig; | 5708 | unsigned long capacity, capacity_orig; |
5706 | unsigned long interval; | 5709 | unsigned long interval; |
5707 | 5710 | ||
5708 | interval = msecs_to_jiffies(sd->balance_interval); | 5711 | interval = msecs_to_jiffies(sd->balance_interval); |
5709 | interval = clamp(interval, 1UL, max_load_balance_interval); | 5712 | interval = clamp(interval, 1UL, max_load_balance_interval); |
5710 | sdg->sgp->next_update = jiffies + interval; | 5713 | sdg->sgc->next_update = jiffies + interval; |
5711 | 5714 | ||
5712 | if (!child) { | 5715 | if (!child) { |
5713 | update_cpu_power(sd, cpu); | 5716 | update_cpu_capacity(sd, cpu); |
5714 | return; | 5717 | return; |
5715 | } | 5718 | } |
5716 | 5719 | ||
5717 | power_orig = power = 0; | 5720 | capacity_orig = capacity = 0; |
5718 | 5721 | ||
5719 | if (child->flags & SD_OVERLAP) { | 5722 | if (child->flags & SD_OVERLAP) { |
5720 | /* | 5723 | /* |
@@ -5723,31 +5726,31 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
5723 | */ | 5726 | */ |
5724 | 5727 | ||
5725 | for_each_cpu(cpu, sched_group_cpus(sdg)) { | 5728 | for_each_cpu(cpu, sched_group_cpus(sdg)) { |
5726 | struct sched_group_power *sgp; | 5729 | struct sched_group_capacity *sgc; |
5727 | struct rq *rq = cpu_rq(cpu); | 5730 | struct rq *rq = cpu_rq(cpu); |
5728 | 5731 | ||
5729 | /* | 5732 | /* |
5730 | * build_sched_domains() -> init_sched_groups_power() | 5733 | * build_sched_domains() -> init_sched_groups_capacity() |
5731 | * gets here before we've attached the domains to the | 5734 | * gets here before we've attached the domains to the |
5732 | * runqueues. | 5735 | * runqueues. |
5733 | * | 5736 | * |
5734 | * Use power_of(), which is set irrespective of domains | 5737 | * Use capacity_of(), which is set irrespective of domains |
5735 | * in update_cpu_power(). | 5738 | * in update_cpu_capacity(). |
5736 | * | 5739 | * |
5737 | * This avoids power/power_orig from being 0 and | 5740 | * This avoids capacity/capacity_orig from being 0 and |
5738 | * causing divide-by-zero issues on boot. | 5741 | * causing divide-by-zero issues on boot. |
5739 | * | 5742 | * |
5740 | * Runtime updates will correct power_orig. | 5743 | * Runtime updates will correct capacity_orig. |
5741 | */ | 5744 | */ |
5742 | if (unlikely(!rq->sd)) { | 5745 | if (unlikely(!rq->sd)) { |
5743 | power_orig += power_of(cpu); | 5746 | capacity_orig += capacity_of(cpu); |
5744 | power += power_of(cpu); | 5747 | capacity += capacity_of(cpu); |
5745 | continue; | 5748 | continue; |
5746 | } | 5749 | } |
5747 | 5750 | ||
5748 | sgp = rq->sd->groups->sgp; | 5751 | sgc = rq->sd->groups->sgc; |
5749 | power_orig += sgp->power_orig; | 5752 | capacity_orig += sgc->capacity_orig; |
5750 | power += sgp->power; | 5753 | capacity += sgc->capacity; |
5751 | } | 5754 | } |
5752 | } else { | 5755 | } else { |
5753 | /* | 5756 | /* |
@@ -5757,14 +5760,14 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
5757 | 5760 | ||
5758 | group = child->groups; | 5761 | group = child->groups; |
5759 | do { | 5762 | do { |
5760 | power_orig += group->sgp->power_orig; | 5763 | capacity_orig += group->sgc->capacity_orig; |
5761 | power += group->sgp->power; | 5764 | capacity += group->sgc->capacity; |
5762 | group = group->next; | 5765 | group = group->next; |
5763 | } while (group != child->groups); | 5766 | } while (group != child->groups); |
5764 | } | 5767 | } |
5765 | 5768 | ||
5766 | sdg->sgp->power_orig = power_orig; | 5769 | sdg->sgc->capacity_orig = capacity_orig; |
5767 | sdg->sgp->power = power; | 5770 | sdg->sgc->capacity = capacity; |
5768 | } | 5771 | } |
5769 | 5772 | ||
5770 | /* | 5773 | /* |
@@ -5778,15 +5781,15 @@ static inline int | |||
5778 | fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | 5781 | fix_small_capacity(struct sched_domain *sd, struct sched_group *group) |
5779 | { | 5782 | { |
5780 | /* | 5783 | /* |
5781 | * Only siblings can have significantly less than SCHED_POWER_SCALE | 5784 | * Only siblings can have significantly less than SCHED_CAPACITY_SCALE |
5782 | */ | 5785 | */ |
5783 | if (!(sd->flags & SD_SHARE_CPUPOWER)) | 5786 | if (!(sd->flags & SD_SHARE_CPUCAPACITY)) |
5784 | return 0; | 5787 | return 0; |
5785 | 5788 | ||
5786 | /* | 5789 | /* |
5787 | * If ~90% of the cpu_power is still there, we're good. | 5790 | * If ~90% of the cpu_capacity is still there, we're good. |
5788 | */ | 5791 | */ |
5789 | if (group->sgp->power * 32 > group->sgp->power_orig * 29) | 5792 | if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) |
5790 | return 1; | 5793 | return 1; |
5791 | 5794 | ||
5792 | return 0; | 5795 | return 0; |
@@ -5823,34 +5826,35 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
5823 | 5826 | ||
5824 | static inline int sg_imbalanced(struct sched_group *group) | 5827 | static inline int sg_imbalanced(struct sched_group *group) |
5825 | { | 5828 | { |
5826 | return group->sgp->imbalance; | 5829 | return group->sgc->imbalance; |
5827 | } | 5830 | } |
5828 | 5831 | ||
5829 | /* | 5832 | /* |
5830 | * Compute the group capacity. | 5833 | * Compute the group capacity factor. |
5831 | * | 5834 | * |
5832 | * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by | 5835 | * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by |
5833 | * first dividing out the smt factor and computing the actual number of cores | 5836 | * first dividing out the smt factor and computing the actual number of cores |
5834 | * and limit power unit capacity with that. | 5837 | * and limit unit capacity with that. |
5835 | */ | 5838 | */ |
5836 | static inline int sg_capacity(struct lb_env *env, struct sched_group *group) | 5839 | static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) |
5837 | { | 5840 | { |
5838 | unsigned int capacity, smt, cpus; | 5841 | unsigned int capacity_factor, smt, cpus; |
5839 | unsigned int power, power_orig; | 5842 | unsigned int capacity, capacity_orig; |
5840 | 5843 | ||
5841 | power = group->sgp->power; | 5844 | capacity = group->sgc->capacity; |
5842 | power_orig = group->sgp->power_orig; | 5845 | capacity_orig = group->sgc->capacity_orig; |
5843 | cpus = group->group_weight; | 5846 | cpus = group->group_weight; |
5844 | 5847 | ||
5845 | /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */ | 5848 | /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ |
5846 | smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig); | 5849 | smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); |
5847 | capacity = cpus / smt; /* cores */ | 5850 | capacity_factor = cpus / smt; /* cores */ |
5848 | 5851 | ||
5849 | capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); | 5852 | capacity_factor = min_t(unsigned, |
5850 | if (!capacity) | 5853 | capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); |
5851 | capacity = fix_small_capacity(env->sd, group); | 5854 | if (!capacity_factor) |
5855 | capacity_factor = fix_small_capacity(env->sd, group); | ||
5852 | 5856 | ||
5853 | return capacity; | 5857 | return capacity_factor; |
5854 | } | 5858 | } |
5855 | 5859 | ||
5856 | /** | 5860 | /** |
@@ -5890,9 +5894,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
5890 | sgs->idle_cpus++; | 5894 | sgs->idle_cpus++; |
5891 | } | 5895 | } |
5892 | 5896 | ||
5893 | /* Adjust by relative CPU power of the group */ | 5897 | /* Adjust by relative CPU capacity of the group */ |
5894 | sgs->group_power = group->sgp->power; | 5898 | sgs->group_capacity = group->sgc->capacity; |
5895 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; | 5899 | sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; |
5896 | 5900 | ||
5897 | if (sgs->sum_nr_running) | 5901 | if (sgs->sum_nr_running) |
5898 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 5902 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
@@ -5900,10 +5904,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
5900 | sgs->group_weight = group->group_weight; | 5904 | sgs->group_weight = group->group_weight; |
5901 | 5905 | ||
5902 | sgs->group_imb = sg_imbalanced(group); | 5906 | sgs->group_imb = sg_imbalanced(group); |
5903 | sgs->group_capacity = sg_capacity(env, group); | 5907 | sgs->group_capacity_factor = sg_capacity_factor(env, group); |
5904 | 5908 | ||
5905 | if (sgs->group_capacity > sgs->sum_nr_running) | 5909 | if (sgs->group_capacity_factor > sgs->sum_nr_running) |
5906 | sgs->group_has_capacity = 1; | 5910 | sgs->group_has_free_capacity = 1; |
5907 | } | 5911 | } |
5908 | 5912 | ||
5909 | /** | 5913 | /** |
@@ -5927,7 +5931,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
5927 | if (sgs->avg_load <= sds->busiest_stat.avg_load) | 5931 | if (sgs->avg_load <= sds->busiest_stat.avg_load) |
5928 | return false; | 5932 | return false; |
5929 | 5933 | ||
5930 | if (sgs->sum_nr_running > sgs->group_capacity) | 5934 | if (sgs->sum_nr_running > sgs->group_capacity_factor) |
5931 | return true; | 5935 | return true; |
5932 | 5936 | ||
5933 | if (sgs->group_imb) | 5937 | if (sgs->group_imb) |
@@ -6007,8 +6011,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
6007 | sgs = &sds->local_stat; | 6011 | sgs = &sds->local_stat; |
6008 | 6012 | ||
6009 | if (env->idle != CPU_NEWLY_IDLE || | 6013 | if (env->idle != CPU_NEWLY_IDLE || |
6010 | time_after_eq(jiffies, sg->sgp->next_update)) | 6014 | time_after_eq(jiffies, sg->sgc->next_update)) |
6011 | update_group_power(env->sd, env->dst_cpu); | 6015 | update_group_capacity(env->sd, env->dst_cpu); |
6012 | } | 6016 | } |
6013 | 6017 | ||
6014 | update_sg_lb_stats(env, sg, load_idx, local_group, sgs); | 6018 | update_sg_lb_stats(env, sg, load_idx, local_group, sgs); |
@@ -6018,17 +6022,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
6018 | 6022 | ||
6019 | /* | 6023 | /* |
6020 | * In case the child domain prefers tasks go to siblings | 6024 | * In case the child domain prefers tasks go to siblings |
6021 | * first, lower the sg capacity to one so that we'll try | 6025 | * first, lower the sg capacity factor to one so that we'll try |
6022 | * and move all the excess tasks away. We lower the capacity | 6026 | * and move all the excess tasks away. We lower the capacity |
6023 | * of a group only if the local group has the capacity to fit | 6027 | * of a group only if the local group has the capacity to fit |
6024 | * these excess tasks, i.e. nr_running < group_capacity. The | 6028 | * these excess tasks, i.e. nr_running < group_capacity_factor. The |
6025 | * extra check prevents the case where you always pull from the | 6029 | * extra check prevents the case where you always pull from the |
6026 | * heaviest group when it is already under-utilized (possible | 6030 | * heaviest group when it is already under-utilized (possible |
6027 | * with a large weight task outweighs the tasks on the system). | 6031 | * with a large weight task outweighs the tasks on the system). |
6028 | */ | 6032 | */ |
6029 | if (prefer_sibling && sds->local && | 6033 | if (prefer_sibling && sds->local && |
6030 | sds->local_stat.group_has_capacity) | 6034 | sds->local_stat.group_has_free_capacity) |
6031 | sgs->group_capacity = min(sgs->group_capacity, 1U); | 6035 | sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); |
6032 | 6036 | ||
6033 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { | 6037 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { |
6034 | sds->busiest = sg; | 6038 | sds->busiest = sg; |
@@ -6038,7 +6042,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
6038 | next_group: | 6042 | next_group: |
6039 | /* Now, start updating sd_lb_stats */ | 6043 | /* Now, start updating sd_lb_stats */ |
6040 | sds->total_load += sgs->group_load; | 6044 | sds->total_load += sgs->group_load; |
6041 | sds->total_pwr += sgs->group_power; | 6045 | sds->total_capacity += sgs->group_capacity; |
6042 | 6046 | ||
6043 | sg = sg->next; | 6047 | sg = sg->next; |
6044 | } while (sg != env->sd->groups); | 6048 | } while (sg != env->sd->groups); |
@@ -6085,8 +6089,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) | |||
6085 | return 0; | 6089 | return 0; |
6086 | 6090 | ||
6087 | env->imbalance = DIV_ROUND_CLOSEST( | 6091 | env->imbalance = DIV_ROUND_CLOSEST( |
6088 | sds->busiest_stat.avg_load * sds->busiest_stat.group_power, | 6092 | sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity, |
6089 | SCHED_POWER_SCALE); | 6093 | SCHED_CAPACITY_SCALE); |
6090 | 6094 | ||
6091 | return 1; | 6095 | return 1; |
6092 | } | 6096 | } |
@@ -6101,7 +6105,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) | |||
6101 | static inline | 6105 | static inline |
6102 | void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | 6106 | void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) |
6103 | { | 6107 | { |
6104 | unsigned long tmp, pwr_now = 0, pwr_move = 0; | 6108 | unsigned long tmp, capa_now = 0, capa_move = 0; |
6105 | unsigned int imbn = 2; | 6109 | unsigned int imbn = 2; |
6106 | unsigned long scaled_busy_load_per_task; | 6110 | unsigned long scaled_busy_load_per_task; |
6107 | struct sg_lb_stats *local, *busiest; | 6111 | struct sg_lb_stats *local, *busiest; |
@@ -6115,8 +6119,8 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | |||
6115 | imbn = 1; | 6119 | imbn = 1; |
6116 | 6120 | ||
6117 | scaled_busy_load_per_task = | 6121 | scaled_busy_load_per_task = |
6118 | (busiest->load_per_task * SCHED_POWER_SCALE) / | 6122 | (busiest->load_per_task * SCHED_CAPACITY_SCALE) / |
6119 | busiest->group_power; | 6123 | busiest->group_capacity; |
6120 | 6124 | ||
6121 | if (busiest->avg_load + scaled_busy_load_per_task >= | 6125 | if (busiest->avg_load + scaled_busy_load_per_task >= |
6122 | local->avg_load + (scaled_busy_load_per_task * imbn)) { | 6126 | local->avg_load + (scaled_busy_load_per_task * imbn)) { |
@@ -6126,38 +6130,38 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | |||
6126 | 6130 | ||
6127 | /* | 6131 | /* |
6128 | * OK, we don't have enough imbalance to justify moving tasks, | 6132 | * OK, we don't have enough imbalance to justify moving tasks, |
6129 | * however we may be able to increase total CPU power used by | 6133 | * however we may be able to increase total CPU capacity used by |
6130 | * moving them. | 6134 | * moving them. |
6131 | */ | 6135 | */ |
6132 | 6136 | ||
6133 | pwr_now += busiest->group_power * | 6137 | capa_now += busiest->group_capacity * |
6134 | min(busiest->load_per_task, busiest->avg_load); | 6138 | min(busiest->load_per_task, busiest->avg_load); |
6135 | pwr_now += local->group_power * | 6139 | capa_now += local->group_capacity * |
6136 | min(local->load_per_task, local->avg_load); | 6140 | min(local->load_per_task, local->avg_load); |
6137 | pwr_now /= SCHED_POWER_SCALE; | 6141 | capa_now /= SCHED_CAPACITY_SCALE; |
6138 | 6142 | ||
6139 | /* Amount of load we'd subtract */ | 6143 | /* Amount of load we'd subtract */ |
6140 | if (busiest->avg_load > scaled_busy_load_per_task) { | 6144 | if (busiest->avg_load > scaled_busy_load_per_task) { |
6141 | pwr_move += busiest->group_power * | 6145 | capa_move += busiest->group_capacity * |
6142 | min(busiest->load_per_task, | 6146 | min(busiest->load_per_task, |
6143 | busiest->avg_load - scaled_busy_load_per_task); | 6147 | busiest->avg_load - scaled_busy_load_per_task); |
6144 | } | 6148 | } |
6145 | 6149 | ||
6146 | /* Amount of load we'd add */ | 6150 | /* Amount of load we'd add */ |
6147 | if (busiest->avg_load * busiest->group_power < | 6151 | if (busiest->avg_load * busiest->group_capacity < |
6148 | busiest->load_per_task * SCHED_POWER_SCALE) { | 6152 | busiest->load_per_task * SCHED_CAPACITY_SCALE) { |
6149 | tmp = (busiest->avg_load * busiest->group_power) / | 6153 | tmp = (busiest->avg_load * busiest->group_capacity) / |
6150 | local->group_power; | 6154 | local->group_capacity; |
6151 | } else { | 6155 | } else { |
6152 | tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / | 6156 | tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) / |
6153 | local->group_power; | 6157 | local->group_capacity; |
6154 | } | 6158 | } |
6155 | pwr_move += local->group_power * | 6159 | capa_move += local->group_capacity * |
6156 | min(local->load_per_task, local->avg_load + tmp); | 6160 | min(local->load_per_task, local->avg_load + tmp); |
6157 | pwr_move /= SCHED_POWER_SCALE; | 6161 | capa_move /= SCHED_CAPACITY_SCALE; |
6158 | 6162 | ||
6159 | /* Move if we gain throughput */ | 6163 | /* Move if we gain throughput */ |
6160 | if (pwr_move > pwr_now) | 6164 | if (capa_move > capa_now) |
6161 | env->imbalance = busiest->load_per_task; | 6165 | env->imbalance = busiest->load_per_task; |
6162 | } | 6166 | } |
6163 | 6167 | ||
@@ -6187,7 +6191,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
6187 | /* | 6191 | /* |
6188 | * In the presence of smp nice balancing, certain scenarios can have | 6192 | * In the presence of smp nice balancing, certain scenarios can have |
6189 | * max load less than avg load(as we skip the groups at or below | 6193 | * max load less than avg load(as we skip the groups at or below |
6190 | * its cpu_power, while calculating max_load..) | 6194 | * its cpu_capacity, while calculating max_load..) |
6191 | */ | 6195 | */ |
6192 | if (busiest->avg_load <= sds->avg_load || | 6196 | if (busiest->avg_load <= sds->avg_load || |
6193 | local->avg_load >= sds->avg_load) { | 6197 | local->avg_load >= sds->avg_load) { |
@@ -6202,10 +6206,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
6202 | * have to drop below capacity to reach cpu-load equilibrium. | 6206 | * have to drop below capacity to reach cpu-load equilibrium. |
6203 | */ | 6207 | */ |
6204 | load_above_capacity = | 6208 | load_above_capacity = |
6205 | (busiest->sum_nr_running - busiest->group_capacity); | 6209 | (busiest->sum_nr_running - busiest->group_capacity_factor); |
6206 | 6210 | ||
6207 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); | 6211 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); |
6208 | load_above_capacity /= busiest->group_power; | 6212 | load_above_capacity /= busiest->group_capacity; |
6209 | } | 6213 | } |
6210 | 6214 | ||
6211 | /* | 6215 | /* |
@@ -6220,9 +6224,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
6220 | 6224 | ||
6221 | /* How much load to actually move to equalise the imbalance */ | 6225 | /* How much load to actually move to equalise the imbalance */ |
6222 | env->imbalance = min( | 6226 | env->imbalance = min( |
6223 | max_pull * busiest->group_power, | 6227 | max_pull * busiest->group_capacity, |
6224 | (sds->avg_load - local->avg_load) * local->group_power | 6228 | (sds->avg_load - local->avg_load) * local->group_capacity |
6225 | ) / SCHED_POWER_SCALE; | 6229 | ) / SCHED_CAPACITY_SCALE; |
6226 | 6230 | ||
6227 | /* | 6231 | /* |
6228 | * if *imbalance is less than the average load per runnable task | 6232 | * if *imbalance is less than the average load per runnable task |
@@ -6276,7 +6280,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
6276 | if (!sds.busiest || busiest->sum_nr_running == 0) | 6280 | if (!sds.busiest || busiest->sum_nr_running == 0) |
6277 | goto out_balanced; | 6281 | goto out_balanced; |
6278 | 6282 | ||
6279 | sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; | 6283 | sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) |
6284 | / sds.total_capacity; | ||
6280 | 6285 | ||
6281 | /* | 6286 | /* |
6282 | * If the busiest group is imbalanced the below checks don't | 6287 | * If the busiest group is imbalanced the below checks don't |
@@ -6287,8 +6292,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
6287 | goto force_balance; | 6292 | goto force_balance; |
6288 | 6293 | ||
6289 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 6294 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ |
6290 | if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity && | 6295 | if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && |
6291 | !busiest->group_has_capacity) | 6296 | !busiest->group_has_free_capacity) |
6292 | goto force_balance; | 6297 | goto force_balance; |
6293 | 6298 | ||
6294 | /* | 6299 | /* |
@@ -6342,11 +6347,11 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
6342 | struct sched_group *group) | 6347 | struct sched_group *group) |
6343 | { | 6348 | { |
6344 | struct rq *busiest = NULL, *rq; | 6349 | struct rq *busiest = NULL, *rq; |
6345 | unsigned long busiest_load = 0, busiest_power = 1; | 6350 | unsigned long busiest_load = 0, busiest_capacity = 1; |
6346 | int i; | 6351 | int i; |
6347 | 6352 | ||
6348 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 6353 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
6349 | unsigned long power, capacity, wl; | 6354 | unsigned long capacity, capacity_factor, wl; |
6350 | enum fbq_type rt; | 6355 | enum fbq_type rt; |
6351 | 6356 | ||
6352 | rq = cpu_rq(i); | 6357 | rq = cpu_rq(i); |
@@ -6374,34 +6379,34 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
6374 | if (rt > env->fbq_type) | 6379 | if (rt > env->fbq_type) |
6375 | continue; | 6380 | continue; |
6376 | 6381 | ||
6377 | power = power_of(i); | 6382 | capacity = capacity_of(i); |
6378 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); | 6383 | capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); |
6379 | if (!capacity) | 6384 | if (!capacity_factor) |
6380 | capacity = fix_small_capacity(env->sd, group); | 6385 | capacity_factor = fix_small_capacity(env->sd, group); |
6381 | 6386 | ||
6382 | wl = weighted_cpuload(i); | 6387 | wl = weighted_cpuload(i); |
6383 | 6388 | ||
6384 | /* | 6389 | /* |
6385 | * When comparing with imbalance, use weighted_cpuload() | 6390 | * When comparing with imbalance, use weighted_cpuload() |
6386 | * which is not scaled with the cpu power. | 6391 | * which is not scaled with the cpu capacity. |
6387 | */ | 6392 | */ |
6388 | if (capacity && rq->nr_running == 1 && wl > env->imbalance) | 6393 | if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) |
6389 | continue; | 6394 | continue; |
6390 | 6395 | ||
6391 | /* | 6396 | /* |
6392 | * For the load comparisons with the other cpu's, consider | 6397 | * For the load comparisons with the other cpu's, consider |
6393 | * the weighted_cpuload() scaled with the cpu power, so that | 6398 | * the weighted_cpuload() scaled with the cpu capacity, so |
6394 | * the load can be moved away from the cpu that is potentially | 6399 | * that the load can be moved away from the cpu that is |
6395 | * running at a lower capacity. | 6400 | * potentially running at a lower capacity. |
6396 | * | 6401 | * |
6397 | * Thus we're looking for max(wl_i / power_i), crosswise | 6402 | * Thus we're looking for max(wl_i / capacity_i), crosswise |
6398 | * multiplication to rid ourselves of the division works out | 6403 | * multiplication to rid ourselves of the division works out |
6399 | * to: wl_i * power_j > wl_j * power_i; where j is our | 6404 | * to: wl_i * capacity_j > wl_j * capacity_i; where j is |
6400 | * previous maximum. | 6405 | * our previous maximum. |
6401 | */ | 6406 | */ |
6402 | if (wl * busiest_power > busiest_load * power) { | 6407 | if (wl * busiest_capacity > busiest_load * capacity) { |
6403 | busiest_load = wl; | 6408 | busiest_load = wl; |
6404 | busiest_power = power; | 6409 | busiest_capacity = capacity; |
6405 | busiest = rq; | 6410 | busiest = rq; |
6406 | } | 6411 | } |
6407 | } | 6412 | } |
@@ -6609,7 +6614,7 @@ more_balance: | |||
6609 | * We failed to reach balance because of affinity. | 6614 | * We failed to reach balance because of affinity. |
6610 | */ | 6615 | */ |
6611 | if (sd_parent) { | 6616 | if (sd_parent) { |
6612 | int *group_imbalance = &sd_parent->groups->sgp->imbalance; | 6617 | int *group_imbalance = &sd_parent->groups->sgc->imbalance; |
6613 | 6618 | ||
6614 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { | 6619 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { |
6615 | *group_imbalance = 1; | 6620 | *group_imbalance = 1; |
@@ -6996,7 +7001,7 @@ static inline void set_cpu_sd_state_busy(void) | |||
6996 | goto unlock; | 7001 | goto unlock; |
6997 | sd->nohz_idle = 0; | 7002 | sd->nohz_idle = 0; |
6998 | 7003 | ||
6999 | atomic_inc(&sd->groups->sgp->nr_busy_cpus); | 7004 | atomic_inc(&sd->groups->sgc->nr_busy_cpus); |
7000 | unlock: | 7005 | unlock: |
7001 | rcu_read_unlock(); | 7006 | rcu_read_unlock(); |
7002 | } | 7007 | } |
@@ -7013,7 +7018,7 @@ void set_cpu_sd_state_idle(void) | |||
7013 | goto unlock; | 7018 | goto unlock; |
7014 | sd->nohz_idle = 1; | 7019 | sd->nohz_idle = 1; |
7015 | 7020 | ||
7016 | atomic_dec(&sd->groups->sgp->nr_busy_cpus); | 7021 | atomic_dec(&sd->groups->sgc->nr_busy_cpus); |
7017 | unlock: | 7022 | unlock: |
7018 | rcu_read_unlock(); | 7023 | rcu_read_unlock(); |
7019 | } | 7024 | } |
@@ -7192,12 +7197,17 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) | |||
7192 | 7197 | ||
7193 | rq = cpu_rq(balance_cpu); | 7198 | rq = cpu_rq(balance_cpu); |
7194 | 7199 | ||
7195 | raw_spin_lock_irq(&rq->lock); | 7200 | /* |
7196 | update_rq_clock(rq); | 7201 | * If time for next balance is due, |
7197 | update_idle_cpu_load(rq); | 7202 | * do the balance. |
7198 | raw_spin_unlock_irq(&rq->lock); | 7203 | */ |
7199 | 7204 | if (time_after_eq(jiffies, rq->next_balance)) { | |
7200 | rebalance_domains(rq, CPU_IDLE); | 7205 | raw_spin_lock_irq(&rq->lock); |
7206 | update_rq_clock(rq); | ||
7207 | update_idle_cpu_load(rq); | ||
7208 | raw_spin_unlock_irq(&rq->lock); | ||
7209 | rebalance_domains(rq, CPU_IDLE); | ||
7210 | } | ||
7201 | 7211 | ||
7202 | if (time_after(this_rq->next_balance, rq->next_balance)) | 7212 | if (time_after(this_rq->next_balance, rq->next_balance)) |
7203 | this_rq->next_balance = rq->next_balance; | 7213 | this_rq->next_balance = rq->next_balance; |
@@ -7212,7 +7222,7 @@ end: | |||
7212 | * of an idle cpu is the system. | 7222 | * of an idle cpu is the system. |
7213 | * - This rq has more than one task. | 7223 | * - This rq has more than one task. |
7214 | * - At any scheduler domain level, this cpu's scheduler group has multiple | 7224 | * - At any scheduler domain level, this cpu's scheduler group has multiple |
7215 | * busy cpu's exceeding the group's power. | 7225 | * busy cpu's exceeding the group's capacity. |
7216 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler | 7226 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler |
7217 | * domain span are idle. | 7227 | * domain span are idle. |
7218 | */ | 7228 | */ |
@@ -7220,7 +7230,7 @@ static inline int nohz_kick_needed(struct rq *rq) | |||
7220 | { | 7230 | { |
7221 | unsigned long now = jiffies; | 7231 | unsigned long now = jiffies; |
7222 | struct sched_domain *sd; | 7232 | struct sched_domain *sd; |
7223 | struct sched_group_power *sgp; | 7233 | struct sched_group_capacity *sgc; |
7224 | int nr_busy, cpu = rq->cpu; | 7234 | int nr_busy, cpu = rq->cpu; |
7225 | 7235 | ||
7226 | if (unlikely(rq->idle_balance)) | 7236 | if (unlikely(rq->idle_balance)) |
@@ -7250,8 +7260,8 @@ static inline int nohz_kick_needed(struct rq *rq) | |||
7250 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 7260 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); |
7251 | 7261 | ||
7252 | if (sd) { | 7262 | if (sd) { |
7253 | sgp = sd->groups->sgp; | 7263 | sgc = sd->groups->sgc; |
7254 | nr_busy = atomic_read(&sgp->nr_busy_cpus); | 7264 | nr_busy = atomic_read(&sgc->nr_busy_cpus); |
7255 | 7265 | ||
7256 | if (nr_busy > 1) | 7266 | if (nr_busy > 1) |
7257 | goto need_kick_unlock; | 7267 | goto need_kick_unlock; |
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 5716929a2e3a..90284d117fe6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
@@ -37,18 +37,18 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) | |||
37 | SCHED_FEAT(WAKEUP_PREEMPTION, true) | 37 | SCHED_FEAT(WAKEUP_PREEMPTION, true) |
38 | 38 | ||
39 | /* | 39 | /* |
40 | * Use arch dependent cpu power functions | 40 | * Use arch dependent cpu capacity functions |
41 | */ | 41 | */ |
42 | SCHED_FEAT(ARCH_POWER, true) | 42 | SCHED_FEAT(ARCH_CAPACITY, true) |
43 | 43 | ||
44 | SCHED_FEAT(HRTICK, false) | 44 | SCHED_FEAT(HRTICK, false) |
45 | SCHED_FEAT(DOUBLE_TICK, false) | 45 | SCHED_FEAT(DOUBLE_TICK, false) |
46 | SCHED_FEAT(LB_BIAS, true) | 46 | SCHED_FEAT(LB_BIAS, true) |
47 | 47 | ||
48 | /* | 48 | /* |
49 | * Decrement CPU power based on time not spent running tasks | 49 | * Decrement CPU capacity based on time not spent running tasks |
50 | */ | 50 | */ |
51 | SCHED_FEAT(NONTASK_POWER, true) | 51 | SCHED_FEAT(NONTASK_CAPACITY, true) |
52 | 52 | ||
53 | /* | 53 | /* |
54 | * Queue remote wakeups on the target CPU and process them | 54 | * Queue remote wakeups on the target CPU and process them |
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 25b9423abce9..cf009fb0bc25 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
@@ -12,6 +12,8 @@ | |||
12 | 12 | ||
13 | #include <trace/events/power.h> | 13 | #include <trace/events/power.h> |
14 | 14 | ||
15 | #include "sched.h" | ||
16 | |||
15 | static int __read_mostly cpu_idle_force_poll; | 17 | static int __read_mostly cpu_idle_force_poll; |
16 | 18 | ||
17 | void cpu_idle_poll_ctrl(bool enable) | 19 | void cpu_idle_poll_ctrl(bool enable) |
@@ -67,6 +69,10 @@ void __weak arch_cpu_idle(void) | |||
67 | * cpuidle_idle_call - the main idle function | 69 | * cpuidle_idle_call - the main idle function |
68 | * | 70 | * |
69 | * NOTE: no locks or semaphores should be used here | 71 | * NOTE: no locks or semaphores should be used here |
72 | * | ||
73 | * On archs that support TIF_POLLING_NRFLAG, is called with polling | ||
74 | * set, and it returns with polling set. If it ever stops polling, it | ||
75 | * must clear the polling bit. | ||
70 | */ | 76 | */ |
71 | static void cpuidle_idle_call(void) | 77 | static void cpuidle_idle_call(void) |
72 | { | 78 | { |
@@ -175,10 +181,22 @@ exit_idle: | |||
175 | 181 | ||
176 | /* | 182 | /* |
177 | * Generic idle loop implementation | 183 | * Generic idle loop implementation |
184 | * | ||
185 | * Called with polling cleared. | ||
178 | */ | 186 | */ |
179 | static void cpu_idle_loop(void) | 187 | static void cpu_idle_loop(void) |
180 | { | 188 | { |
181 | while (1) { | 189 | while (1) { |
190 | /* | ||
191 | * If the arch has a polling bit, we maintain an invariant: | ||
192 | * | ||
193 | * Our polling bit is clear if we're not scheduled (i.e. if | ||
194 | * rq->curr != rq->idle). This means that, if rq->idle has | ||
195 | * the polling bit set, then setting need_resched is | ||
196 | * guaranteed to cause the cpu to reschedule. | ||
197 | */ | ||
198 | |||
199 | __current_set_polling(); | ||
182 | tick_nohz_idle_enter(); | 200 | tick_nohz_idle_enter(); |
183 | 201 | ||
184 | while (!need_resched()) { | 202 | while (!need_resched()) { |
@@ -218,6 +236,17 @@ static void cpu_idle_loop(void) | |||
218 | */ | 236 | */ |
219 | preempt_set_need_resched(); | 237 | preempt_set_need_resched(); |
220 | tick_nohz_idle_exit(); | 238 | tick_nohz_idle_exit(); |
239 | __current_clr_polling(); | ||
240 | |||
241 | /* | ||
242 | * We promise to call sched_ttwu_pending and reschedule | ||
243 | * if need_resched is set while polling is set. That | ||
244 | * means that clearing polling needs to be visible | ||
245 | * before doing these things. | ||
246 | */ | ||
247 | smp_mb__after_atomic(); | ||
248 | |||
249 | sched_ttwu_pending(); | ||
221 | schedule_preempt_disabled(); | 250 | schedule_preempt_disabled(); |
222 | } | 251 | } |
223 | } | 252 | } |
@@ -239,7 +268,6 @@ void cpu_startup_entry(enum cpuhp_state state) | |||
239 | */ | 268 | */ |
240 | boot_init_stack_canary(); | 269 | boot_init_stack_canary(); |
241 | #endif | 270 | #endif |
242 | __current_set_polling(); | ||
243 | arch_cpu_idle_prepare(); | 271 | arch_cpu_idle_prepare(); |
244 | cpu_idle_loop(); | 272 | cpu_idle_loop(); |
245 | } | 273 | } |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index b3512f1afce9..a49083192c64 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -918,7 +918,6 @@ static void update_curr_rt(struct rq *rq) | |||
918 | { | 918 | { |
919 | struct task_struct *curr = rq->curr; | 919 | struct task_struct *curr = rq->curr; |
920 | struct sched_rt_entity *rt_se = &curr->rt; | 920 | struct sched_rt_entity *rt_se = &curr->rt; |
921 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | ||
922 | u64 delta_exec; | 921 | u64 delta_exec; |
923 | 922 | ||
924 | if (curr->sched_class != &rt_sched_class) | 923 | if (curr->sched_class != &rt_sched_class) |
@@ -943,7 +942,7 @@ static void update_curr_rt(struct rq *rq) | |||
943 | return; | 942 | return; |
944 | 943 | ||
945 | for_each_sched_rt_entity(rt_se) { | 944 | for_each_sched_rt_entity(rt_se) { |
946 | rt_rq = rt_rq_of_se(rt_se); | 945 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); |
947 | 946 | ||
948 | if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { | 947 | if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { |
949 | raw_spin_lock(&rt_rq->rt_runtime_lock); | 948 | raw_spin_lock(&rt_rq->rt_runtime_lock); |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e47679b04d16..31cc02ebc54e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -567,7 +567,7 @@ struct rq { | |||
567 | struct root_domain *rd; | 567 | struct root_domain *rd; |
568 | struct sched_domain *sd; | 568 | struct sched_domain *sd; |
569 | 569 | ||
570 | unsigned long cpu_power; | 570 | unsigned long cpu_capacity; |
571 | 571 | ||
572 | unsigned char idle_balance; | 572 | unsigned char idle_balance; |
573 | /* For active balancing */ | 573 | /* For active balancing */ |
@@ -670,6 +670,8 @@ extern int migrate_swap(struct task_struct *, struct task_struct *); | |||
670 | 670 | ||
671 | #ifdef CONFIG_SMP | 671 | #ifdef CONFIG_SMP |
672 | 672 | ||
673 | extern void sched_ttwu_pending(void); | ||
674 | |||
673 | #define rcu_dereference_check_sched_domain(p) \ | 675 | #define rcu_dereference_check_sched_domain(p) \ |
674 | rcu_dereference_check((p), \ | 676 | rcu_dereference_check((p), \ |
675 | lockdep_is_held(&sched_domains_mutex)) | 677 | lockdep_is_held(&sched_domains_mutex)) |
@@ -728,15 +730,15 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa); | |||
728 | DECLARE_PER_CPU(struct sched_domain *, sd_busy); | 730 | DECLARE_PER_CPU(struct sched_domain *, sd_busy); |
729 | DECLARE_PER_CPU(struct sched_domain *, sd_asym); | 731 | DECLARE_PER_CPU(struct sched_domain *, sd_asym); |
730 | 732 | ||
731 | struct sched_group_power { | 733 | struct sched_group_capacity { |
732 | atomic_t ref; | 734 | atomic_t ref; |
733 | /* | 735 | /* |
734 | * CPU power of this group, SCHED_LOAD_SCALE being max power for a | 736 | * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity |
735 | * single CPU. | 737 | * for a single CPU. |
736 | */ | 738 | */ |
737 | unsigned int power, power_orig; | 739 | unsigned int capacity, capacity_orig; |
738 | unsigned long next_update; | 740 | unsigned long next_update; |
739 | int imbalance; /* XXX unrelated to power but shared group state */ | 741 | int imbalance; /* XXX unrelated to capacity but shared group state */ |
740 | /* | 742 | /* |
741 | * Number of busy cpus in this group. | 743 | * Number of busy cpus in this group. |
742 | */ | 744 | */ |
@@ -750,7 +752,7 @@ struct sched_group { | |||
750 | atomic_t ref; | 752 | atomic_t ref; |
751 | 753 | ||
752 | unsigned int group_weight; | 754 | unsigned int group_weight; |
753 | struct sched_group_power *sgp; | 755 | struct sched_group_capacity *sgc; |
754 | 756 | ||
755 | /* | 757 | /* |
756 | * The CPUs this group covers. | 758 | * The CPUs this group covers. |
@@ -773,7 +775,7 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg) | |||
773 | */ | 775 | */ |
774 | static inline struct cpumask *sched_group_mask(struct sched_group *sg) | 776 | static inline struct cpumask *sched_group_mask(struct sched_group *sg) |
775 | { | 777 | { |
776 | return to_cpumask(sg->sgp->cpumask); | 778 | return to_cpumask(sg->sgc->cpumask); |
777 | } | 779 | } |
778 | 780 | ||
779 | /** | 781 | /** |
@@ -787,6 +789,10 @@ static inline unsigned int group_first_cpu(struct sched_group *group) | |||
787 | 789 | ||
788 | extern int group_balance_cpu(struct sched_group *sg); | 790 | extern int group_balance_cpu(struct sched_group *sg); |
789 | 791 | ||
792 | #else | ||
793 | |||
794 | static inline void sched_ttwu_pending(void) { } | ||
795 | |||
790 | #endif /* CONFIG_SMP */ | 796 | #endif /* CONFIG_SMP */ |
791 | 797 | ||
792 | #include "stats.h" | 798 | #include "stats.h" |
@@ -1167,7 +1173,7 @@ extern const struct sched_class idle_sched_class; | |||
1167 | 1173 | ||
1168 | #ifdef CONFIG_SMP | 1174 | #ifdef CONFIG_SMP |
1169 | 1175 | ||
1170 | extern void update_group_power(struct sched_domain *sd, int cpu); | 1176 | extern void update_group_capacity(struct sched_domain *sd, int cpu); |
1171 | 1177 | ||
1172 | extern void trigger_load_balance(struct rq *rq); | 1178 | extern void trigger_load_balance(struct rq *rq); |
1173 | 1179 | ||