diff options
Diffstat (limited to 'kernel/sched.c')
| -rw-r--r-- | kernel/sched.c | 1232 |
1 files changed, 832 insertions, 400 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 1b59e265273b..e27a53685ed9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c | |||
| @@ -64,7 +64,6 @@ | |||
| 64 | #include <linux/tsacct_kern.h> | 64 | #include <linux/tsacct_kern.h> |
| 65 | #include <linux/kprobes.h> | 65 | #include <linux/kprobes.h> |
| 66 | #include <linux/delayacct.h> | 66 | #include <linux/delayacct.h> |
| 67 | #include <linux/reciprocal_div.h> | ||
| 68 | #include <linux/unistd.h> | 67 | #include <linux/unistd.h> |
| 69 | #include <linux/pagemap.h> | 68 | #include <linux/pagemap.h> |
| 70 | #include <linux/hrtimer.h> | 69 | #include <linux/hrtimer.h> |
| @@ -120,30 +119,8 @@ | |||
| 120 | */ | 119 | */ |
| 121 | #define RUNTIME_INF ((u64)~0ULL) | 120 | #define RUNTIME_INF ((u64)~0ULL) |
| 122 | 121 | ||
| 123 | #ifdef CONFIG_SMP | ||
| 124 | |||
| 125 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); | 122 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); |
| 126 | 123 | ||
| 127 | /* | ||
| 128 | * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) | ||
| 129 | * Since cpu_power is a 'constant', we can use a reciprocal divide. | ||
| 130 | */ | ||
| 131 | static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) | ||
| 132 | { | ||
| 133 | return reciprocal_divide(load, sg->reciprocal_cpu_power); | ||
| 134 | } | ||
| 135 | |||
| 136 | /* | ||
| 137 | * Each time a sched group cpu_power is changed, | ||
| 138 | * we must compute its reciprocal value | ||
| 139 | */ | ||
| 140 | static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) | ||
| 141 | { | ||
| 142 | sg->__cpu_power += val; | ||
| 143 | sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); | ||
| 144 | } | ||
| 145 | #endif | ||
| 146 | |||
| 147 | static inline int rt_policy(int policy) | 124 | static inline int rt_policy(int policy) |
| 148 | { | 125 | { |
| 149 | if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) | 126 | if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) |
| @@ -309,8 +286,8 @@ void set_tg_uid(struct user_struct *user) | |||
| 309 | 286 | ||
| 310 | /* | 287 | /* |
| 311 | * Root task group. | 288 | * Root task group. |
| 312 | * Every UID task group (including init_task_group aka UID-0) will | 289 | * Every UID task group (including init_task_group aka UID-0) will |
| 313 | * be a child to this group. | 290 | * be a child to this group. |
| 314 | */ | 291 | */ |
| 315 | struct task_group root_task_group; | 292 | struct task_group root_task_group; |
| 316 | 293 | ||
| @@ -318,7 +295,7 @@ struct task_group root_task_group; | |||
| 318 | /* Default task group's sched entity on each cpu */ | 295 | /* Default task group's sched entity on each cpu */ |
| 319 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | 296 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); |
| 320 | /* Default task group's cfs_rq on each cpu */ | 297 | /* Default task group's cfs_rq on each cpu */ |
| 321 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 298 | static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp; |
| 322 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 299 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 323 | 300 | ||
| 324 | #ifdef CONFIG_RT_GROUP_SCHED | 301 | #ifdef CONFIG_RT_GROUP_SCHED |
| @@ -616,6 +593,7 @@ struct rq { | |||
| 616 | 593 | ||
| 617 | unsigned char idle_at_tick; | 594 | unsigned char idle_at_tick; |
| 618 | /* For active balancing */ | 595 | /* For active balancing */ |
| 596 | int post_schedule; | ||
| 619 | int active_balance; | 597 | int active_balance; |
| 620 | int push_cpu; | 598 | int push_cpu; |
| 621 | /* cpu of this runqueue: */ | 599 | /* cpu of this runqueue: */ |
| @@ -626,6 +604,9 @@ struct rq { | |||
| 626 | 604 | ||
| 627 | struct task_struct *migration_thread; | 605 | struct task_struct *migration_thread; |
| 628 | struct list_head migration_queue; | 606 | struct list_head migration_queue; |
| 607 | |||
| 608 | u64 rt_avg; | ||
| 609 | u64 age_stamp; | ||
| 629 | #endif | 610 | #endif |
| 630 | 611 | ||
| 631 | /* calc_load related fields */ | 612 | /* calc_load related fields */ |
| @@ -693,6 +674,7 @@ static inline int cpu_of(struct rq *rq) | |||
| 693 | #define this_rq() (&__get_cpu_var(runqueues)) | 674 | #define this_rq() (&__get_cpu_var(runqueues)) |
| 694 | #define task_rq(p) cpu_rq(task_cpu(p)) | 675 | #define task_rq(p) cpu_rq(task_cpu(p)) |
| 695 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 676 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
| 677 | #define raw_rq() (&__raw_get_cpu_var(runqueues)) | ||
| 696 | 678 | ||
| 697 | inline void update_rq_clock(struct rq *rq) | 679 | inline void update_rq_clock(struct rq *rq) |
| 698 | { | 680 | { |
| @@ -861,6 +843,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000; | |||
| 861 | unsigned int sysctl_sched_shares_thresh = 4; | 843 | unsigned int sysctl_sched_shares_thresh = 4; |
| 862 | 844 | ||
| 863 | /* | 845 | /* |
| 846 | * period over which we average the RT time consumption, measured | ||
| 847 | * in ms. | ||
| 848 | * | ||
| 849 | * default: 1s | ||
| 850 | */ | ||
| 851 | const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; | ||
| 852 | |||
| 853 | /* | ||
| 864 | * period over which we measure -rt task cpu usage in us. | 854 | * period over which we measure -rt task cpu usage in us. |
| 865 | * default: 1s | 855 | * default: 1s |
| 866 | */ | 856 | */ |
| @@ -1278,12 +1268,37 @@ void wake_up_idle_cpu(int cpu) | |||
| 1278 | } | 1268 | } |
| 1279 | #endif /* CONFIG_NO_HZ */ | 1269 | #endif /* CONFIG_NO_HZ */ |
| 1280 | 1270 | ||
| 1271 | static u64 sched_avg_period(void) | ||
| 1272 | { | ||
| 1273 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; | ||
| 1274 | } | ||
| 1275 | |||
| 1276 | static void sched_avg_update(struct rq *rq) | ||
| 1277 | { | ||
| 1278 | s64 period = sched_avg_period(); | ||
| 1279 | |||
| 1280 | while ((s64)(rq->clock - rq->age_stamp) > period) { | ||
| 1281 | rq->age_stamp += period; | ||
| 1282 | rq->rt_avg /= 2; | ||
| 1283 | } | ||
| 1284 | } | ||
| 1285 | |||
| 1286 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
| 1287 | { | ||
| 1288 | rq->rt_avg += rt_delta; | ||
| 1289 | sched_avg_update(rq); | ||
| 1290 | } | ||
| 1291 | |||
| 1281 | #else /* !CONFIG_SMP */ | 1292 | #else /* !CONFIG_SMP */ |
| 1282 | static void resched_task(struct task_struct *p) | 1293 | static void resched_task(struct task_struct *p) |
| 1283 | { | 1294 | { |
| 1284 | assert_spin_locked(&task_rq(p)->lock); | 1295 | assert_spin_locked(&task_rq(p)->lock); |
| 1285 | set_tsk_need_resched(p); | 1296 | set_tsk_need_resched(p); |
| 1286 | } | 1297 | } |
| 1298 | |||
| 1299 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
| 1300 | { | ||
| 1301 | } | ||
| 1287 | #endif /* CONFIG_SMP */ | 1302 | #endif /* CONFIG_SMP */ |
| 1288 | 1303 | ||
| 1289 | #if BITS_PER_LONG == 32 | 1304 | #if BITS_PER_LONG == 32 |
| @@ -1513,28 +1528,35 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
| 1513 | 1528 | ||
| 1514 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1529 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 1515 | 1530 | ||
| 1531 | struct update_shares_data { | ||
| 1532 | unsigned long rq_weight[NR_CPUS]; | ||
| 1533 | }; | ||
| 1534 | |||
| 1535 | static DEFINE_PER_CPU(struct update_shares_data, update_shares_data); | ||
| 1536 | |||
| 1516 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | 1537 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); |
| 1517 | 1538 | ||
| 1518 | /* | 1539 | /* |
| 1519 | * Calculate and set the cpu's group shares. | 1540 | * Calculate and set the cpu's group shares. |
| 1520 | */ | 1541 | */ |
| 1521 | static void | 1542 | static void update_group_shares_cpu(struct task_group *tg, int cpu, |
| 1522 | update_group_shares_cpu(struct task_group *tg, int cpu, | 1543 | unsigned long sd_shares, |
| 1523 | unsigned long sd_shares, unsigned long sd_rq_weight) | 1544 | unsigned long sd_rq_weight, |
| 1545 | struct update_shares_data *usd) | ||
| 1524 | { | 1546 | { |
| 1525 | unsigned long shares; | 1547 | unsigned long shares, rq_weight; |
| 1526 | unsigned long rq_weight; | 1548 | int boost = 0; |
| 1527 | |||
| 1528 | if (!tg->se[cpu]) | ||
| 1529 | return; | ||
| 1530 | 1549 | ||
| 1531 | rq_weight = tg->cfs_rq[cpu]->rq_weight; | 1550 | rq_weight = usd->rq_weight[cpu]; |
| 1551 | if (!rq_weight) { | ||
| 1552 | boost = 1; | ||
| 1553 | rq_weight = NICE_0_LOAD; | ||
| 1554 | } | ||
| 1532 | 1555 | ||
| 1533 | /* | 1556 | /* |
| 1534 | * \Sum shares * rq_weight | 1557 | * \Sum_j shares_j * rq_weight_i |
| 1535 | * shares = ----------------------- | 1558 | * shares_i = ----------------------------- |
| 1536 | * \Sum rq_weight | 1559 | * \Sum_j rq_weight_j |
| 1537 | * | ||
| 1538 | */ | 1560 | */ |
| 1539 | shares = (sd_shares * rq_weight) / sd_rq_weight; | 1561 | shares = (sd_shares * rq_weight) / sd_rq_weight; |
| 1540 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); | 1562 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); |
| @@ -1545,8 +1567,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu, | |||
| 1545 | unsigned long flags; | 1567 | unsigned long flags; |
| 1546 | 1568 | ||
| 1547 | spin_lock_irqsave(&rq->lock, flags); | 1569 | spin_lock_irqsave(&rq->lock, flags); |
| 1548 | tg->cfs_rq[cpu]->shares = shares; | 1570 | tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; |
| 1549 | 1571 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | |
| 1550 | __set_se_shares(tg->se[cpu], shares); | 1572 | __set_se_shares(tg->se[cpu], shares); |
| 1551 | spin_unlock_irqrestore(&rq->lock, flags); | 1573 | spin_unlock_irqrestore(&rq->lock, flags); |
| 1552 | } | 1574 | } |
| @@ -1559,22 +1581,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu, | |||
| 1559 | */ | 1581 | */ |
| 1560 | static int tg_shares_up(struct task_group *tg, void *data) | 1582 | static int tg_shares_up(struct task_group *tg, void *data) |
| 1561 | { | 1583 | { |
| 1562 | unsigned long weight, rq_weight = 0; | 1584 | unsigned long weight, rq_weight = 0, shares = 0; |
| 1563 | unsigned long shares = 0; | 1585 | struct update_shares_data *usd; |
| 1564 | struct sched_domain *sd = data; | 1586 | struct sched_domain *sd = data; |
| 1587 | unsigned long flags; | ||
| 1565 | int i; | 1588 | int i; |
| 1566 | 1589 | ||
| 1590 | if (!tg->se[0]) | ||
| 1591 | return 0; | ||
| 1592 | |||
| 1593 | local_irq_save(flags); | ||
| 1594 | usd = &__get_cpu_var(update_shares_data); | ||
| 1595 | |||
| 1567 | for_each_cpu(i, sched_domain_span(sd)) { | 1596 | for_each_cpu(i, sched_domain_span(sd)) { |
| 1597 | weight = tg->cfs_rq[i]->load.weight; | ||
| 1598 | usd->rq_weight[i] = weight; | ||
| 1599 | |||
| 1568 | /* | 1600 | /* |
| 1569 | * If there are currently no tasks on the cpu pretend there | 1601 | * If there are currently no tasks on the cpu pretend there |
| 1570 | * is one of average load so that when a new task gets to | 1602 | * is one of average load so that when a new task gets to |
| 1571 | * run here it will not get delayed by group starvation. | 1603 | * run here it will not get delayed by group starvation. |
| 1572 | */ | 1604 | */ |
| 1573 | weight = tg->cfs_rq[i]->load.weight; | ||
| 1574 | if (!weight) | 1605 | if (!weight) |
| 1575 | weight = NICE_0_LOAD; | 1606 | weight = NICE_0_LOAD; |
| 1576 | 1607 | ||
| 1577 | tg->cfs_rq[i]->rq_weight = weight; | ||
| 1578 | rq_weight += weight; | 1608 | rq_weight += weight; |
| 1579 | shares += tg->cfs_rq[i]->shares; | 1609 | shares += tg->cfs_rq[i]->shares; |
| 1580 | } | 1610 | } |
| @@ -1586,7 +1616,9 @@ static int tg_shares_up(struct task_group *tg, void *data) | |||
| 1586 | shares = tg->shares; | 1616 | shares = tg->shares; |
| 1587 | 1617 | ||
| 1588 | for_each_cpu(i, sched_domain_span(sd)) | 1618 | for_each_cpu(i, sched_domain_span(sd)) |
| 1589 | update_group_shares_cpu(tg, i, shares, rq_weight); | 1619 | update_group_shares_cpu(tg, i, shares, rq_weight, usd); |
| 1620 | |||
| 1621 | local_irq_restore(flags); | ||
| 1590 | 1622 | ||
| 1591 | return 0; | 1623 | return 0; |
| 1592 | } | 1624 | } |
| @@ -1616,8 +1648,14 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
| 1616 | 1648 | ||
| 1617 | static void update_shares(struct sched_domain *sd) | 1649 | static void update_shares(struct sched_domain *sd) |
| 1618 | { | 1650 | { |
| 1619 | u64 now = cpu_clock(raw_smp_processor_id()); | 1651 | s64 elapsed; |
| 1620 | s64 elapsed = now - sd->last_update; | 1652 | u64 now; |
| 1653 | |||
| 1654 | if (root_task_group_empty()) | ||
| 1655 | return; | ||
| 1656 | |||
| 1657 | now = cpu_clock(raw_smp_processor_id()); | ||
| 1658 | elapsed = now - sd->last_update; | ||
| 1621 | 1659 | ||
| 1622 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | 1660 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { |
| 1623 | sd->last_update = now; | 1661 | sd->last_update = now; |
| @@ -1627,6 +1665,9 @@ static void update_shares(struct sched_domain *sd) | |||
| 1627 | 1665 | ||
| 1628 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | 1666 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) |
| 1629 | { | 1667 | { |
| 1668 | if (root_task_group_empty()) | ||
| 1669 | return; | ||
| 1670 | |||
| 1630 | spin_unlock(&rq->lock); | 1671 | spin_unlock(&rq->lock); |
| 1631 | update_shares(sd); | 1672 | update_shares(sd); |
| 1632 | spin_lock(&rq->lock); | 1673 | spin_lock(&rq->lock); |
| @@ -1634,6 +1675,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | |||
| 1634 | 1675 | ||
| 1635 | static void update_h_load(long cpu) | 1676 | static void update_h_load(long cpu) |
| 1636 | { | 1677 | { |
| 1678 | if (root_task_group_empty()) | ||
| 1679 | return; | ||
| 1680 | |||
| 1637 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 1681 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
| 1638 | } | 1682 | } |
| 1639 | 1683 | ||
| @@ -2268,8 +2312,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | |||
| 2268 | } | 2312 | } |
| 2269 | 2313 | ||
| 2270 | /* Adjust by relative CPU power of the group */ | 2314 | /* Adjust by relative CPU power of the group */ |
| 2271 | avg_load = sg_div_cpu_power(group, | 2315 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; |
| 2272 | avg_load * SCHED_LOAD_SCALE); | ||
| 2273 | 2316 | ||
| 2274 | if (local_group) { | 2317 | if (local_group) { |
| 2275 | this_load = avg_load; | 2318 | this_load = avg_load; |
| @@ -2637,9 +2680,32 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
| 2637 | set_task_cpu(p, cpu); | 2680 | set_task_cpu(p, cpu); |
| 2638 | 2681 | ||
| 2639 | /* | 2682 | /* |
| 2640 | * Make sure we do not leak PI boosting priority to the child: | 2683 | * Make sure we do not leak PI boosting priority to the child. |
| 2641 | */ | 2684 | */ |
| 2642 | p->prio = current->normal_prio; | 2685 | p->prio = current->normal_prio; |
| 2686 | |||
| 2687 | /* | ||
| 2688 | * Revert to default priority/policy on fork if requested. | ||
| 2689 | */ | ||
| 2690 | if (unlikely(p->sched_reset_on_fork)) { | ||
| 2691 | if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) | ||
| 2692 | p->policy = SCHED_NORMAL; | ||
| 2693 | |||
| 2694 | if (p->normal_prio < DEFAULT_PRIO) | ||
| 2695 | p->prio = DEFAULT_PRIO; | ||
| 2696 | |||
| 2697 | if (PRIO_TO_NICE(p->static_prio) < 0) { | ||
| 2698 | p->static_prio = NICE_TO_PRIO(0); | ||
| 2699 | set_load_weight(p); | ||
| 2700 | } | ||
| 2701 | |||
| 2702 | /* | ||
| 2703 | * We don't need the reset flag anymore after the fork. It has | ||
| 2704 | * fulfilled its duty: | ||
| 2705 | */ | ||
| 2706 | p->sched_reset_on_fork = 0; | ||
| 2707 | } | ||
| 2708 | |||
| 2643 | if (!rt_prio(p->prio)) | 2709 | if (!rt_prio(p->prio)) |
| 2644 | p->sched_class = &fair_sched_class; | 2710 | p->sched_class = &fair_sched_class; |
| 2645 | 2711 | ||
| @@ -2796,12 +2862,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
| 2796 | { | 2862 | { |
| 2797 | struct mm_struct *mm = rq->prev_mm; | 2863 | struct mm_struct *mm = rq->prev_mm; |
| 2798 | long prev_state; | 2864 | long prev_state; |
| 2799 | #ifdef CONFIG_SMP | ||
| 2800 | int post_schedule = 0; | ||
| 2801 | |||
| 2802 | if (current->sched_class->needs_post_schedule) | ||
| 2803 | post_schedule = current->sched_class->needs_post_schedule(rq); | ||
| 2804 | #endif | ||
| 2805 | 2865 | ||
| 2806 | rq->prev_mm = NULL; | 2866 | rq->prev_mm = NULL; |
| 2807 | 2867 | ||
| @@ -2820,10 +2880,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
| 2820 | finish_arch_switch(prev); | 2880 | finish_arch_switch(prev); |
| 2821 | perf_counter_task_sched_in(current, cpu_of(rq)); | 2881 | perf_counter_task_sched_in(current, cpu_of(rq)); |
| 2822 | finish_lock_switch(rq, prev); | 2882 | finish_lock_switch(rq, prev); |
| 2823 | #ifdef CONFIG_SMP | ||
| 2824 | if (post_schedule) | ||
| 2825 | current->sched_class->post_schedule(rq); | ||
| 2826 | #endif | ||
| 2827 | 2883 | ||
| 2828 | fire_sched_in_preempt_notifiers(current); | 2884 | fire_sched_in_preempt_notifiers(current); |
| 2829 | if (mm) | 2885 | if (mm) |
| @@ -2838,6 +2894,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
| 2838 | } | 2894 | } |
| 2839 | } | 2895 | } |
| 2840 | 2896 | ||
| 2897 | #ifdef CONFIG_SMP | ||
| 2898 | |||
| 2899 | /* assumes rq->lock is held */ | ||
| 2900 | static inline void pre_schedule(struct rq *rq, struct task_struct *prev) | ||
| 2901 | { | ||
| 2902 | if (prev->sched_class->pre_schedule) | ||
| 2903 | prev->sched_class->pre_schedule(rq, prev); | ||
| 2904 | } | ||
| 2905 | |||
| 2906 | /* rq->lock is NOT held, but preemption is disabled */ | ||
| 2907 | static inline void post_schedule(struct rq *rq) | ||
| 2908 | { | ||
| 2909 | if (rq->post_schedule) { | ||
| 2910 | unsigned long flags; | ||
| 2911 | |||
| 2912 | spin_lock_irqsave(&rq->lock, flags); | ||
| 2913 | if (rq->curr->sched_class->post_schedule) | ||
| 2914 | rq->curr->sched_class->post_schedule(rq); | ||
| 2915 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 2916 | |||
| 2917 | rq->post_schedule = 0; | ||
| 2918 | } | ||
| 2919 | } | ||
| 2920 | |||
| 2921 | #else | ||
| 2922 | |||
| 2923 | static inline void pre_schedule(struct rq *rq, struct task_struct *p) | ||
| 2924 | { | ||
| 2925 | } | ||
| 2926 | |||
| 2927 | static inline void post_schedule(struct rq *rq) | ||
| 2928 | { | ||
| 2929 | } | ||
| 2930 | |||
| 2931 | #endif | ||
| 2932 | |||
| 2841 | /** | 2933 | /** |
| 2842 | * schedule_tail - first thing a freshly forked thread must call. | 2934 | * schedule_tail - first thing a freshly forked thread must call. |
| 2843 | * @prev: the thread we just switched away from. | 2935 | * @prev: the thread we just switched away from. |
| @@ -2848,6 +2940,13 @@ asmlinkage void schedule_tail(struct task_struct *prev) | |||
| 2848 | struct rq *rq = this_rq(); | 2940 | struct rq *rq = this_rq(); |
| 2849 | 2941 | ||
| 2850 | finish_task_switch(rq, prev); | 2942 | finish_task_switch(rq, prev); |
| 2943 | |||
| 2944 | /* | ||
| 2945 | * FIXME: do we need to worry about rq being invalidated by the | ||
| 2946 | * task_switch? | ||
| 2947 | */ | ||
| 2948 | post_schedule(rq); | ||
| 2949 | |||
| 2851 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | 2950 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW |
| 2852 | /* In this case, finish_task_switch does not reenable preemption */ | 2951 | /* In this case, finish_task_switch does not reenable preemption */ |
| 2853 | preempt_enable(); | 2952 | preempt_enable(); |
| @@ -3379,9 +3478,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 3379 | { | 3478 | { |
| 3380 | const struct sched_class *class; | 3479 | const struct sched_class *class; |
| 3381 | 3480 | ||
| 3382 | for (class = sched_class_highest; class; class = class->next) | 3481 | for_each_class(class) { |
| 3383 | if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) | 3482 | if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) |
| 3384 | return 1; | 3483 | return 1; |
| 3484 | } | ||
| 3385 | 3485 | ||
| 3386 | return 0; | 3486 | return 0; |
| 3387 | } | 3487 | } |
| @@ -3544,7 +3644,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group, | |||
| 3544 | * capacity but still has some space to pick up some load | 3644 | * capacity but still has some space to pick up some load |
| 3545 | * from other group and save more power | 3645 | * from other group and save more power |
| 3546 | */ | 3646 | */ |
| 3547 | if (sgs->sum_nr_running > sgs->group_capacity - 1) | 3647 | if (sgs->sum_nr_running + 1 > sgs->group_capacity) |
| 3548 | return; | 3648 | return; |
| 3549 | 3649 | ||
| 3550 | if (sgs->sum_nr_running > sds->leader_nr_running || | 3650 | if (sgs->sum_nr_running > sds->leader_nr_running || |
| @@ -3611,6 +3711,77 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
| 3611 | } | 3711 | } |
| 3612 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | 3712 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ |
| 3613 | 3713 | ||
| 3714 | unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | ||
| 3715 | { | ||
| 3716 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
| 3717 | unsigned long smt_gain = sd->smt_gain; | ||
| 3718 | |||
| 3719 | smt_gain /= weight; | ||
| 3720 | |||
| 3721 | return smt_gain; | ||
| 3722 | } | ||
| 3723 | |||
| 3724 | unsigned long scale_rt_power(int cpu) | ||
| 3725 | { | ||
| 3726 | struct rq *rq = cpu_rq(cpu); | ||
| 3727 | u64 total, available; | ||
| 3728 | |||
| 3729 | sched_avg_update(rq); | ||
| 3730 | |||
| 3731 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | ||
| 3732 | available = total - rq->rt_avg; | ||
| 3733 | |||
| 3734 | if (unlikely((s64)total < SCHED_LOAD_SCALE)) | ||
| 3735 | total = SCHED_LOAD_SCALE; | ||
| 3736 | |||
| 3737 | total >>= SCHED_LOAD_SHIFT; | ||
| 3738 | |||
| 3739 | return div_u64(available, total); | ||
| 3740 | } | ||
| 3741 | |||
| 3742 | static void update_cpu_power(struct sched_domain *sd, int cpu) | ||
| 3743 | { | ||
| 3744 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
| 3745 | unsigned long power = SCHED_LOAD_SCALE; | ||
| 3746 | struct sched_group *sdg = sd->groups; | ||
| 3747 | |||
| 3748 | /* here we could scale based on cpufreq */ | ||
| 3749 | |||
| 3750 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | ||
| 3751 | power *= arch_scale_smt_power(sd, cpu); | ||
| 3752 | power >>= SCHED_LOAD_SHIFT; | ||
| 3753 | } | ||
| 3754 | |||
| 3755 | power *= scale_rt_power(cpu); | ||
| 3756 | power >>= SCHED_LOAD_SHIFT; | ||
| 3757 | |||
| 3758 | if (!power) | ||
| 3759 | power = 1; | ||
| 3760 | |||
| 3761 | sdg->cpu_power = power; | ||
| 3762 | } | ||
| 3763 | |||
| 3764 | static void update_group_power(struct sched_domain *sd, int cpu) | ||
| 3765 | { | ||
| 3766 | struct sched_domain *child = sd->child; | ||
| 3767 | struct sched_group *group, *sdg = sd->groups; | ||
| 3768 | unsigned long power; | ||
| 3769 | |||
| 3770 | if (!child) { | ||
| 3771 | update_cpu_power(sd, cpu); | ||
| 3772 | return; | ||
| 3773 | } | ||
| 3774 | |||
| 3775 | power = 0; | ||
| 3776 | |||
| 3777 | group = child->groups; | ||
| 3778 | do { | ||
| 3779 | power += group->cpu_power; | ||
| 3780 | group = group->next; | ||
| 3781 | } while (group != child->groups); | ||
| 3782 | |||
| 3783 | sdg->cpu_power = power; | ||
| 3784 | } | ||
| 3614 | 3785 | ||
| 3615 | /** | 3786 | /** |
| 3616 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 3787 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
| @@ -3624,7 +3795,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
| 3624 | * @balance: Should we balance. | 3795 | * @balance: Should we balance. |
| 3625 | * @sgs: variable to hold the statistics for this group. | 3796 | * @sgs: variable to hold the statistics for this group. |
| 3626 | */ | 3797 | */ |
| 3627 | static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | 3798 | static inline void update_sg_lb_stats(struct sched_domain *sd, |
| 3799 | struct sched_group *group, int this_cpu, | ||
| 3628 | enum cpu_idle_type idle, int load_idx, int *sd_idle, | 3800 | enum cpu_idle_type idle, int load_idx, int *sd_idle, |
| 3629 | int local_group, const struct cpumask *cpus, | 3801 | int local_group, const struct cpumask *cpus, |
| 3630 | int *balance, struct sg_lb_stats *sgs) | 3802 | int *balance, struct sg_lb_stats *sgs) |
| @@ -3635,8 +3807,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | |||
| 3635 | unsigned long sum_avg_load_per_task; | 3807 | unsigned long sum_avg_load_per_task; |
| 3636 | unsigned long avg_load_per_task; | 3808 | unsigned long avg_load_per_task; |
| 3637 | 3809 | ||
| 3638 | if (local_group) | 3810 | if (local_group) { |
| 3639 | balance_cpu = group_first_cpu(group); | 3811 | balance_cpu = group_first_cpu(group); |
| 3812 | if (balance_cpu == this_cpu) | ||
| 3813 | update_group_power(sd, this_cpu); | ||
| 3814 | } | ||
| 3640 | 3815 | ||
| 3641 | /* Tally up the load of all CPUs in the group */ | 3816 | /* Tally up the load of all CPUs in the group */ |
| 3642 | sum_avg_load_per_task = avg_load_per_task = 0; | 3817 | sum_avg_load_per_task = avg_load_per_task = 0; |
| @@ -3685,8 +3860,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | |||
| 3685 | } | 3860 | } |
| 3686 | 3861 | ||
| 3687 | /* Adjust by relative CPU power of the group */ | 3862 | /* Adjust by relative CPU power of the group */ |
| 3688 | sgs->avg_load = sg_div_cpu_power(group, | 3863 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; |
| 3689 | sgs->group_load * SCHED_LOAD_SCALE); | ||
| 3690 | 3864 | ||
| 3691 | 3865 | ||
| 3692 | /* | 3866 | /* |
| @@ -3698,14 +3872,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | |||
| 3698 | * normalized nr_running number somewhere that negates | 3872 | * normalized nr_running number somewhere that negates |
| 3699 | * the hierarchy? | 3873 | * the hierarchy? |
| 3700 | */ | 3874 | */ |
| 3701 | avg_load_per_task = sg_div_cpu_power(group, | 3875 | avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) / |
| 3702 | sum_avg_load_per_task * SCHED_LOAD_SCALE); | 3876 | group->cpu_power; |
| 3703 | 3877 | ||
| 3704 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | 3878 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) |
| 3705 | sgs->group_imb = 1; | 3879 | sgs->group_imb = 1; |
| 3706 | 3880 | ||
| 3707 | sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; | 3881 | sgs->group_capacity = |
| 3708 | 3882 | DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | |
| 3709 | } | 3883 | } |
| 3710 | 3884 | ||
| 3711 | /** | 3885 | /** |
| @@ -3723,9 +3897,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
| 3723 | const struct cpumask *cpus, int *balance, | 3897 | const struct cpumask *cpus, int *balance, |
| 3724 | struct sd_lb_stats *sds) | 3898 | struct sd_lb_stats *sds) |
| 3725 | { | 3899 | { |
| 3900 | struct sched_domain *child = sd->child; | ||
| 3726 | struct sched_group *group = sd->groups; | 3901 | struct sched_group *group = sd->groups; |
| 3727 | struct sg_lb_stats sgs; | 3902 | struct sg_lb_stats sgs; |
| 3728 | int load_idx; | 3903 | int load_idx, prefer_sibling = 0; |
| 3904 | |||
| 3905 | if (child && child->flags & SD_PREFER_SIBLING) | ||
| 3906 | prefer_sibling = 1; | ||
| 3729 | 3907 | ||
| 3730 | init_sd_power_savings_stats(sd, sds, idle); | 3908 | init_sd_power_savings_stats(sd, sds, idle); |
| 3731 | load_idx = get_sd_load_idx(sd, idle); | 3909 | load_idx = get_sd_load_idx(sd, idle); |
| @@ -3736,14 +3914,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
| 3736 | local_group = cpumask_test_cpu(this_cpu, | 3914 | local_group = cpumask_test_cpu(this_cpu, |
| 3737 | sched_group_cpus(group)); | 3915 | sched_group_cpus(group)); |
| 3738 | memset(&sgs, 0, sizeof(sgs)); | 3916 | memset(&sgs, 0, sizeof(sgs)); |
| 3739 | update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, | 3917 | update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, |
| 3740 | local_group, cpus, balance, &sgs); | 3918 | local_group, cpus, balance, &sgs); |
| 3741 | 3919 | ||
| 3742 | if (local_group && balance && !(*balance)) | 3920 | if (local_group && balance && !(*balance)) |
| 3743 | return; | 3921 | return; |
| 3744 | 3922 | ||
| 3745 | sds->total_load += sgs.group_load; | 3923 | sds->total_load += sgs.group_load; |
| 3746 | sds->total_pwr += group->__cpu_power; | 3924 | sds->total_pwr += group->cpu_power; |
| 3925 | |||
| 3926 | /* | ||
| 3927 | * In case the child domain prefers tasks go to siblings | ||
| 3928 | * first, lower the group capacity to one so that we'll try | ||
| 3929 | * and move all the excess tasks away. | ||
| 3930 | */ | ||
| 3931 | if (prefer_sibling) | ||
| 3932 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | ||
| 3747 | 3933 | ||
| 3748 | if (local_group) { | 3934 | if (local_group) { |
| 3749 | sds->this_load = sgs.avg_load; | 3935 | sds->this_load = sgs.avg_load; |
| @@ -3763,7 +3949,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
| 3763 | update_sd_power_savings_stats(group, sds, local_group, &sgs); | 3949 | update_sd_power_savings_stats(group, sds, local_group, &sgs); |
| 3764 | group = group->next; | 3950 | group = group->next; |
| 3765 | } while (group != sd->groups); | 3951 | } while (group != sd->groups); |
| 3766 | |||
| 3767 | } | 3952 | } |
| 3768 | 3953 | ||
| 3769 | /** | 3954 | /** |
| @@ -3801,28 +3986,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
| 3801 | * moving them. | 3986 | * moving them. |
| 3802 | */ | 3987 | */ |
| 3803 | 3988 | ||
| 3804 | pwr_now += sds->busiest->__cpu_power * | 3989 | pwr_now += sds->busiest->cpu_power * |
| 3805 | min(sds->busiest_load_per_task, sds->max_load); | 3990 | min(sds->busiest_load_per_task, sds->max_load); |
| 3806 | pwr_now += sds->this->__cpu_power * | 3991 | pwr_now += sds->this->cpu_power * |
| 3807 | min(sds->this_load_per_task, sds->this_load); | 3992 | min(sds->this_load_per_task, sds->this_load); |
| 3808 | pwr_now /= SCHED_LOAD_SCALE; | 3993 | pwr_now /= SCHED_LOAD_SCALE; |
| 3809 | 3994 | ||
| 3810 | /* Amount of load we'd subtract */ | 3995 | /* Amount of load we'd subtract */ |
| 3811 | tmp = sg_div_cpu_power(sds->busiest, | 3996 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / |
| 3812 | sds->busiest_load_per_task * SCHED_LOAD_SCALE); | 3997 | sds->busiest->cpu_power; |
| 3813 | if (sds->max_load > tmp) | 3998 | if (sds->max_load > tmp) |
| 3814 | pwr_move += sds->busiest->__cpu_power * | 3999 | pwr_move += sds->busiest->cpu_power * |
| 3815 | min(sds->busiest_load_per_task, sds->max_load - tmp); | 4000 | min(sds->busiest_load_per_task, sds->max_load - tmp); |
| 3816 | 4001 | ||
| 3817 | /* Amount of load we'd add */ | 4002 | /* Amount of load we'd add */ |
| 3818 | if (sds->max_load * sds->busiest->__cpu_power < | 4003 | if (sds->max_load * sds->busiest->cpu_power < |
| 3819 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) | 4004 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) |
| 3820 | tmp = sg_div_cpu_power(sds->this, | 4005 | tmp = (sds->max_load * sds->busiest->cpu_power) / |
| 3821 | sds->max_load * sds->busiest->__cpu_power); | 4006 | sds->this->cpu_power; |
| 3822 | else | 4007 | else |
| 3823 | tmp = sg_div_cpu_power(sds->this, | 4008 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / |
| 3824 | sds->busiest_load_per_task * SCHED_LOAD_SCALE); | 4009 | sds->this->cpu_power; |
| 3825 | pwr_move += sds->this->__cpu_power * | 4010 | pwr_move += sds->this->cpu_power * |
| 3826 | min(sds->this_load_per_task, sds->this_load + tmp); | 4011 | min(sds->this_load_per_task, sds->this_load + tmp); |
| 3827 | pwr_move /= SCHED_LOAD_SCALE; | 4012 | pwr_move /= SCHED_LOAD_SCALE; |
| 3828 | 4013 | ||
| @@ -3857,8 +4042,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
| 3857 | sds->max_load - sds->busiest_load_per_task); | 4042 | sds->max_load - sds->busiest_load_per_task); |
| 3858 | 4043 | ||
| 3859 | /* How much load to actually move to equalise the imbalance */ | 4044 | /* How much load to actually move to equalise the imbalance */ |
| 3860 | *imbalance = min(max_pull * sds->busiest->__cpu_power, | 4045 | *imbalance = min(max_pull * sds->busiest->cpu_power, |
| 3861 | (sds->avg_load - sds->this_load) * sds->this->__cpu_power) | 4046 | (sds->avg_load - sds->this_load) * sds->this->cpu_power) |
| 3862 | / SCHED_LOAD_SCALE; | 4047 | / SCHED_LOAD_SCALE; |
| 3863 | 4048 | ||
| 3864 | /* | 4049 | /* |
| @@ -3976,6 +4161,26 @@ ret: | |||
| 3976 | return NULL; | 4161 | return NULL; |
| 3977 | } | 4162 | } |
| 3978 | 4163 | ||
| 4164 | static struct sched_group *group_of(int cpu) | ||
| 4165 | { | ||
| 4166 | struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); | ||
| 4167 | |||
| 4168 | if (!sd) | ||
| 4169 | return NULL; | ||
| 4170 | |||
| 4171 | return sd->groups; | ||
| 4172 | } | ||
| 4173 | |||
| 4174 | static unsigned long power_of(int cpu) | ||
| 4175 | { | ||
| 4176 | struct sched_group *group = group_of(cpu); | ||
| 4177 | |||
| 4178 | if (!group) | ||
| 4179 | return SCHED_LOAD_SCALE; | ||
| 4180 | |||
| 4181 | return group->cpu_power; | ||
| 4182 | } | ||
| 4183 | |||
| 3979 | /* | 4184 | /* |
| 3980 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 4185 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
| 3981 | */ | 4186 | */ |
| @@ -3988,15 +4193,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | |||
| 3988 | int i; | 4193 | int i; |
| 3989 | 4194 | ||
| 3990 | for_each_cpu(i, sched_group_cpus(group)) { | 4195 | for_each_cpu(i, sched_group_cpus(group)) { |
| 4196 | unsigned long power = power_of(i); | ||
| 4197 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | ||
| 3991 | unsigned long wl; | 4198 | unsigned long wl; |
| 3992 | 4199 | ||
| 3993 | if (!cpumask_test_cpu(i, cpus)) | 4200 | if (!cpumask_test_cpu(i, cpus)) |
| 3994 | continue; | 4201 | continue; |
| 3995 | 4202 | ||
| 3996 | rq = cpu_rq(i); | 4203 | rq = cpu_rq(i); |
| 3997 | wl = weighted_cpuload(i); | 4204 | wl = weighted_cpuload(i) * SCHED_LOAD_SCALE; |
| 4205 | wl /= power; | ||
| 3998 | 4206 | ||
| 3999 | if (rq->nr_running == 1 && wl > imbalance) | 4207 | if (capacity && rq->nr_running == 1 && wl > imbalance) |
| 4000 | continue; | 4208 | continue; |
| 4001 | 4209 | ||
| 4002 | if (wl > max_load) { | 4210 | if (wl > max_load) { |
| @@ -5325,7 +5533,7 @@ need_resched: | |||
| 5325 | preempt_disable(); | 5533 | preempt_disable(); |
| 5326 | cpu = smp_processor_id(); | 5534 | cpu = smp_processor_id(); |
| 5327 | rq = cpu_rq(cpu); | 5535 | rq = cpu_rq(cpu); |
| 5328 | rcu_qsctr_inc(cpu); | 5536 | rcu_sched_qs(cpu); |
| 5329 | prev = rq->curr; | 5537 | prev = rq->curr; |
| 5330 | switch_count = &prev->nivcsw; | 5538 | switch_count = &prev->nivcsw; |
| 5331 | 5539 | ||
| @@ -5349,10 +5557,7 @@ need_resched_nonpreemptible: | |||
| 5349 | switch_count = &prev->nvcsw; | 5557 | switch_count = &prev->nvcsw; |
| 5350 | } | 5558 | } |
| 5351 | 5559 | ||
| 5352 | #ifdef CONFIG_SMP | 5560 | pre_schedule(rq, prev); |
| 5353 | if (prev->sched_class->pre_schedule) | ||
| 5354 | prev->sched_class->pre_schedule(rq, prev); | ||
| 5355 | #endif | ||
| 5356 | 5561 | ||
| 5357 | if (unlikely(!rq->nr_running)) | 5562 | if (unlikely(!rq->nr_running)) |
| 5358 | idle_balance(cpu, rq); | 5563 | idle_balance(cpu, rq); |
| @@ -5378,6 +5583,8 @@ need_resched_nonpreemptible: | |||
| 5378 | } else | 5583 | } else |
| 5379 | spin_unlock_irq(&rq->lock); | 5584 | spin_unlock_irq(&rq->lock); |
| 5380 | 5585 | ||
| 5586 | post_schedule(rq); | ||
| 5587 | |||
| 5381 | if (unlikely(reacquire_kernel_lock(current) < 0)) | 5588 | if (unlikely(reacquire_kernel_lock(current) < 0)) |
| 5382 | goto need_resched_nonpreemptible; | 5589 | goto need_resched_nonpreemptible; |
| 5383 | 5590 | ||
| @@ -6123,17 +6330,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy, | |||
| 6123 | unsigned long flags; | 6330 | unsigned long flags; |
| 6124 | const struct sched_class *prev_class = p->sched_class; | 6331 | const struct sched_class *prev_class = p->sched_class; |
| 6125 | struct rq *rq; | 6332 | struct rq *rq; |
| 6333 | int reset_on_fork; | ||
| 6126 | 6334 | ||
| 6127 | /* may grab non-irq protected spin_locks */ | 6335 | /* may grab non-irq protected spin_locks */ |
| 6128 | BUG_ON(in_interrupt()); | 6336 | BUG_ON(in_interrupt()); |
| 6129 | recheck: | 6337 | recheck: |
| 6130 | /* double check policy once rq lock held */ | 6338 | /* double check policy once rq lock held */ |
| 6131 | if (policy < 0) | 6339 | if (policy < 0) { |
| 6340 | reset_on_fork = p->sched_reset_on_fork; | ||
| 6132 | policy = oldpolicy = p->policy; | 6341 | policy = oldpolicy = p->policy; |
| 6133 | else if (policy != SCHED_FIFO && policy != SCHED_RR && | 6342 | } else { |
| 6134 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | 6343 | reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); |
| 6135 | policy != SCHED_IDLE) | 6344 | policy &= ~SCHED_RESET_ON_FORK; |
| 6136 | return -EINVAL; | 6345 | |
| 6346 | if (policy != SCHED_FIFO && policy != SCHED_RR && | ||
| 6347 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | ||
| 6348 | policy != SCHED_IDLE) | ||
| 6349 | return -EINVAL; | ||
| 6350 | } | ||
| 6351 | |||
| 6137 | /* | 6352 | /* |
| 6138 | * Valid priorities for SCHED_FIFO and SCHED_RR are | 6353 | * Valid priorities for SCHED_FIFO and SCHED_RR are |
| 6139 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, | 6354 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, |
| @@ -6177,6 +6392,10 @@ recheck: | |||
| 6177 | /* can't change other user's priorities */ | 6392 | /* can't change other user's priorities */ |
| 6178 | if (!check_same_owner(p)) | 6393 | if (!check_same_owner(p)) |
| 6179 | return -EPERM; | 6394 | return -EPERM; |
| 6395 | |||
| 6396 | /* Normal users shall not reset the sched_reset_on_fork flag */ | ||
| 6397 | if (p->sched_reset_on_fork && !reset_on_fork) | ||
| 6398 | return -EPERM; | ||
| 6180 | } | 6399 | } |
| 6181 | 6400 | ||
| 6182 | if (user) { | 6401 | if (user) { |
| @@ -6220,6 +6439,8 @@ recheck: | |||
| 6220 | if (running) | 6439 | if (running) |
| 6221 | p->sched_class->put_prev_task(rq, p); | 6440 | p->sched_class->put_prev_task(rq, p); |
| 6222 | 6441 | ||
| 6442 | p->sched_reset_on_fork = reset_on_fork; | ||
| 6443 | |||
| 6223 | oldprio = p->prio; | 6444 | oldprio = p->prio; |
| 6224 | __setscheduler(rq, p, policy, param->sched_priority); | 6445 | __setscheduler(rq, p, policy, param->sched_priority); |
| 6225 | 6446 | ||
| @@ -6336,14 +6557,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) | |||
| 6336 | if (p) { | 6557 | if (p) { |
| 6337 | retval = security_task_getscheduler(p); | 6558 | retval = security_task_getscheduler(p); |
| 6338 | if (!retval) | 6559 | if (!retval) |
| 6339 | retval = p->policy; | 6560 | retval = p->policy |
| 6561 | | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); | ||
| 6340 | } | 6562 | } |
| 6341 | read_unlock(&tasklist_lock); | 6563 | read_unlock(&tasklist_lock); |
| 6342 | return retval; | 6564 | return retval; |
| 6343 | } | 6565 | } |
| 6344 | 6566 | ||
| 6345 | /** | 6567 | /** |
| 6346 | * sys_sched_getscheduler - get the RT priority of a thread | 6568 | * sys_sched_getparam - get the RT priority of a thread |
| 6347 | * @pid: the pid in question. | 6569 | * @pid: the pid in question. |
| 6348 | * @param: structure containing the RT priority. | 6570 | * @param: structure containing the RT priority. |
| 6349 | */ | 6571 | */ |
| @@ -6571,19 +6793,9 @@ static inline int should_resched(void) | |||
| 6571 | 6793 | ||
| 6572 | static void __cond_resched(void) | 6794 | static void __cond_resched(void) |
| 6573 | { | 6795 | { |
| 6574 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 6796 | add_preempt_count(PREEMPT_ACTIVE); |
| 6575 | __might_sleep(__FILE__, __LINE__); | 6797 | schedule(); |
| 6576 | #endif | 6798 | sub_preempt_count(PREEMPT_ACTIVE); |
| 6577 | /* | ||
| 6578 | * The BKS might be reacquired before we have dropped | ||
| 6579 | * PREEMPT_ACTIVE, which could trigger a second | ||
| 6580 | * cond_resched() call. | ||
| 6581 | */ | ||
| 6582 | do { | ||
| 6583 | add_preempt_count(PREEMPT_ACTIVE); | ||
| 6584 | schedule(); | ||
| 6585 | sub_preempt_count(PREEMPT_ACTIVE); | ||
| 6586 | } while (need_resched()); | ||
| 6587 | } | 6799 | } |
| 6588 | 6800 | ||
| 6589 | int __sched _cond_resched(void) | 6801 | int __sched _cond_resched(void) |
| @@ -6597,18 +6809,20 @@ int __sched _cond_resched(void) | |||
| 6597 | EXPORT_SYMBOL(_cond_resched); | 6809 | EXPORT_SYMBOL(_cond_resched); |
| 6598 | 6810 | ||
| 6599 | /* | 6811 | /* |
| 6600 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, | 6812 | * __cond_resched_lock() - if a reschedule is pending, drop the given lock, |
| 6601 | * call schedule, and on return reacquire the lock. | 6813 | * call schedule, and on return reacquire the lock. |
| 6602 | * | 6814 | * |
| 6603 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level | 6815 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level |
| 6604 | * operations here to prevent schedule() from being called twice (once via | 6816 | * operations here to prevent schedule() from being called twice (once via |
| 6605 | * spin_unlock(), once by hand). | 6817 | * spin_unlock(), once by hand). |
| 6606 | */ | 6818 | */ |
| 6607 | int cond_resched_lock(spinlock_t *lock) | 6819 | int __cond_resched_lock(spinlock_t *lock) |
| 6608 | { | 6820 | { |
| 6609 | int resched = should_resched(); | 6821 | int resched = should_resched(); |
| 6610 | int ret = 0; | 6822 | int ret = 0; |
| 6611 | 6823 | ||
| 6824 | lockdep_assert_held(lock); | ||
| 6825 | |||
| 6612 | if (spin_needbreak(lock) || resched) { | 6826 | if (spin_needbreak(lock) || resched) { |
| 6613 | spin_unlock(lock); | 6827 | spin_unlock(lock); |
| 6614 | if (resched) | 6828 | if (resched) |
| @@ -6620,9 +6834,9 @@ int cond_resched_lock(spinlock_t *lock) | |||
| 6620 | } | 6834 | } |
| 6621 | return ret; | 6835 | return ret; |
| 6622 | } | 6836 | } |
| 6623 | EXPORT_SYMBOL(cond_resched_lock); | 6837 | EXPORT_SYMBOL(__cond_resched_lock); |
| 6624 | 6838 | ||
| 6625 | int __sched cond_resched_softirq(void) | 6839 | int __sched __cond_resched_softirq(void) |
| 6626 | { | 6840 | { |
| 6627 | BUG_ON(!in_softirq()); | 6841 | BUG_ON(!in_softirq()); |
| 6628 | 6842 | ||
| @@ -6634,7 +6848,7 @@ int __sched cond_resched_softirq(void) | |||
| 6634 | } | 6848 | } |
| 6635 | return 0; | 6849 | return 0; |
| 6636 | } | 6850 | } |
| 6637 | EXPORT_SYMBOL(cond_resched_softirq); | 6851 | EXPORT_SYMBOL(__cond_resched_softirq); |
| 6638 | 6852 | ||
| 6639 | /** | 6853 | /** |
| 6640 | * yield - yield the current processor to other threads. | 6854 | * yield - yield the current processor to other threads. |
| @@ -6658,11 +6872,13 @@ EXPORT_SYMBOL(yield); | |||
| 6658 | */ | 6872 | */ |
| 6659 | void __sched io_schedule(void) | 6873 | void __sched io_schedule(void) |
| 6660 | { | 6874 | { |
| 6661 | struct rq *rq = &__raw_get_cpu_var(runqueues); | 6875 | struct rq *rq = raw_rq(); |
| 6662 | 6876 | ||
| 6663 | delayacct_blkio_start(); | 6877 | delayacct_blkio_start(); |
| 6664 | atomic_inc(&rq->nr_iowait); | 6878 | atomic_inc(&rq->nr_iowait); |
| 6879 | current->in_iowait = 1; | ||
| 6665 | schedule(); | 6880 | schedule(); |
| 6881 | current->in_iowait = 0; | ||
| 6666 | atomic_dec(&rq->nr_iowait); | 6882 | atomic_dec(&rq->nr_iowait); |
| 6667 | delayacct_blkio_end(); | 6883 | delayacct_blkio_end(); |
| 6668 | } | 6884 | } |
| @@ -6670,12 +6886,14 @@ EXPORT_SYMBOL(io_schedule); | |||
| 6670 | 6886 | ||
| 6671 | long __sched io_schedule_timeout(long timeout) | 6887 | long __sched io_schedule_timeout(long timeout) |
| 6672 | { | 6888 | { |
| 6673 | struct rq *rq = &__raw_get_cpu_var(runqueues); | 6889 | struct rq *rq = raw_rq(); |
| 6674 | long ret; | 6890 | long ret; |
| 6675 | 6891 | ||
| 6676 | delayacct_blkio_start(); | 6892 | delayacct_blkio_start(); |
| 6677 | atomic_inc(&rq->nr_iowait); | 6893 | atomic_inc(&rq->nr_iowait); |
| 6894 | current->in_iowait = 1; | ||
| 6678 | ret = schedule_timeout(timeout); | 6895 | ret = schedule_timeout(timeout); |
| 6896 | current->in_iowait = 0; | ||
| 6679 | atomic_dec(&rq->nr_iowait); | 6897 | atomic_dec(&rq->nr_iowait); |
| 6680 | delayacct_blkio_end(); | 6898 | delayacct_blkio_end(); |
| 6681 | return ret; | 6899 | return ret; |
| @@ -6992,8 +7210,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
| 6992 | 7210 | ||
| 6993 | if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { | 7211 | if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { |
| 6994 | /* Need help from migration thread: drop lock and wait. */ | 7212 | /* Need help from migration thread: drop lock and wait. */ |
| 7213 | struct task_struct *mt = rq->migration_thread; | ||
| 7214 | |||
| 7215 | get_task_struct(mt); | ||
| 6995 | task_rq_unlock(rq, &flags); | 7216 | task_rq_unlock(rq, &flags); |
| 6996 | wake_up_process(rq->migration_thread); | 7217 | wake_up_process(rq->migration_thread); |
| 7218 | put_task_struct(mt); | ||
| 6997 | wait_for_completion(&req.done); | 7219 | wait_for_completion(&req.done); |
| 6998 | tlb_migrate_finish(p->mm); | 7220 | tlb_migrate_finish(p->mm); |
| 6999 | return 0; | 7221 | return 0; |
| @@ -7051,6 +7273,11 @@ fail: | |||
| 7051 | return ret; | 7273 | return ret; |
| 7052 | } | 7274 | } |
| 7053 | 7275 | ||
| 7276 | #define RCU_MIGRATION_IDLE 0 | ||
| 7277 | #define RCU_MIGRATION_NEED_QS 1 | ||
| 7278 | #define RCU_MIGRATION_GOT_QS 2 | ||
| 7279 | #define RCU_MIGRATION_MUST_SYNC 3 | ||
| 7280 | |||
| 7054 | /* | 7281 | /* |
| 7055 | * migration_thread - this is a highprio system thread that performs | 7282 | * migration_thread - this is a highprio system thread that performs |
| 7056 | * thread migration by bumping thread off CPU then 'pushing' onto | 7283 | * thread migration by bumping thread off CPU then 'pushing' onto |
| @@ -7058,6 +7285,7 @@ fail: | |||
| 7058 | */ | 7285 | */ |
| 7059 | static int migration_thread(void *data) | 7286 | static int migration_thread(void *data) |
| 7060 | { | 7287 | { |
| 7288 | int badcpu; | ||
| 7061 | int cpu = (long)data; | 7289 | int cpu = (long)data; |
| 7062 | struct rq *rq; | 7290 | struct rq *rq; |
| 7063 | 7291 | ||
| @@ -7092,8 +7320,17 @@ static int migration_thread(void *data) | |||
| 7092 | req = list_entry(head->next, struct migration_req, list); | 7320 | req = list_entry(head->next, struct migration_req, list); |
| 7093 | list_del_init(head->next); | 7321 | list_del_init(head->next); |
| 7094 | 7322 | ||
| 7095 | spin_unlock(&rq->lock); | 7323 | if (req->task != NULL) { |
| 7096 | __migrate_task(req->task, cpu, req->dest_cpu); | 7324 | spin_unlock(&rq->lock); |
| 7325 | __migrate_task(req->task, cpu, req->dest_cpu); | ||
| 7326 | } else if (likely(cpu == (badcpu = smp_processor_id()))) { | ||
| 7327 | req->dest_cpu = RCU_MIGRATION_GOT_QS; | ||
| 7328 | spin_unlock(&rq->lock); | ||
| 7329 | } else { | ||
| 7330 | req->dest_cpu = RCU_MIGRATION_MUST_SYNC; | ||
| 7331 | spin_unlock(&rq->lock); | ||
| 7332 | WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); | ||
| 7333 | } | ||
| 7097 | local_irq_enable(); | 7334 | local_irq_enable(); |
| 7098 | 7335 | ||
| 7099 | complete(&req->done); | 7336 | complete(&req->done); |
| @@ -7625,7 +7862,7 @@ static int __init migration_init(void) | |||
| 7625 | migration_call(&migration_notifier, CPU_ONLINE, cpu); | 7862 | migration_call(&migration_notifier, CPU_ONLINE, cpu); |
| 7626 | register_cpu_notifier(&migration_notifier); | 7863 | register_cpu_notifier(&migration_notifier); |
| 7627 | 7864 | ||
| 7628 | return err; | 7865 | return 0; |
| 7629 | } | 7866 | } |
| 7630 | early_initcall(migration_init); | 7867 | early_initcall(migration_init); |
| 7631 | #endif | 7868 | #endif |
| @@ -7672,7 +7909,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
| 7672 | break; | 7909 | break; |
| 7673 | } | 7910 | } |
| 7674 | 7911 | ||
| 7675 | if (!group->__cpu_power) { | 7912 | if (!group->cpu_power) { |
| 7676 | printk(KERN_CONT "\n"); | 7913 | printk(KERN_CONT "\n"); |
| 7677 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 7914 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
| 7678 | "set\n"); | 7915 | "set\n"); |
| @@ -7696,9 +7933,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
| 7696 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); | 7933 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); |
| 7697 | 7934 | ||
| 7698 | printk(KERN_CONT " %s", str); | 7935 | printk(KERN_CONT " %s", str); |
| 7699 | if (group->__cpu_power != SCHED_LOAD_SCALE) { | 7936 | if (group->cpu_power != SCHED_LOAD_SCALE) { |
| 7700 | printk(KERN_CONT " (__cpu_power = %d)", | 7937 | printk(KERN_CONT " (cpu_power = %d)", |
| 7701 | group->__cpu_power); | 7938 | group->cpu_power); |
| 7702 | } | 7939 | } |
| 7703 | 7940 | ||
| 7704 | group = group->next; | 7941 | group = group->next; |
| @@ -7841,7 +8078,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
| 7841 | rq->rd = rd; | 8078 | rq->rd = rd; |
| 7842 | 8079 | ||
| 7843 | cpumask_set_cpu(rq->cpu, rd->span); | 8080 | cpumask_set_cpu(rq->cpu, rd->span); |
| 7844 | if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) | 8081 | if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) |
| 7845 | set_rq_online(rq); | 8082 | set_rq_online(rq); |
| 7846 | 8083 | ||
| 7847 | spin_unlock_irqrestore(&rq->lock, flags); | 8084 | spin_unlock_irqrestore(&rq->lock, flags); |
| @@ -7983,7 +8220,7 @@ init_sched_build_groups(const struct cpumask *span, | |||
| 7983 | continue; | 8220 | continue; |
| 7984 | 8221 | ||
| 7985 | cpumask_clear(sched_group_cpus(sg)); | 8222 | cpumask_clear(sched_group_cpus(sg)); |
| 7986 | sg->__cpu_power = 0; | 8223 | sg->cpu_power = 0; |
| 7987 | 8224 | ||
| 7988 | for_each_cpu(j, span) { | 8225 | for_each_cpu(j, span) { |
| 7989 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) | 8226 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) |
| @@ -8091,6 +8328,39 @@ struct static_sched_domain { | |||
| 8091 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); | 8328 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); |
| 8092 | }; | 8329 | }; |
| 8093 | 8330 | ||
| 8331 | struct s_data { | ||
| 8332 | #ifdef CONFIG_NUMA | ||
| 8333 | int sd_allnodes; | ||
| 8334 | cpumask_var_t domainspan; | ||
| 8335 | cpumask_var_t covered; | ||
| 8336 | cpumask_var_t notcovered; | ||
| 8337 | #endif | ||
| 8338 | cpumask_var_t nodemask; | ||
| 8339 | cpumask_var_t this_sibling_map; | ||
| 8340 | cpumask_var_t this_core_map; | ||
| 8341 | cpumask_var_t send_covered; | ||
| 8342 | cpumask_var_t tmpmask; | ||
| 8343 | struct sched_group **sched_group_nodes; | ||
| 8344 | struct root_domain *rd; | ||
| 8345 | }; | ||
| 8346 | |||
| 8347 | enum s_alloc { | ||
| 8348 | sa_sched_groups = 0, | ||
| 8349 | sa_rootdomain, | ||
| 8350 | sa_tmpmask, | ||
| 8351 | sa_send_covered, | ||
| 8352 | sa_this_core_map, | ||
| 8353 | sa_this_sibling_map, | ||
| 8354 | sa_nodemask, | ||
| 8355 | sa_sched_group_nodes, | ||
| 8356 | #ifdef CONFIG_NUMA | ||
| 8357 | sa_notcovered, | ||
| 8358 | sa_covered, | ||
| 8359 | sa_domainspan, | ||
| 8360 | #endif | ||
| 8361 | sa_none, | ||
| 8362 | }; | ||
| 8363 | |||
| 8094 | /* | 8364 | /* |
| 8095 | * SMT sched-domains: | 8365 | * SMT sched-domains: |
| 8096 | */ | 8366 | */ |
| @@ -8208,11 +8478,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) | |||
| 8208 | continue; | 8478 | continue; |
| 8209 | } | 8479 | } |
| 8210 | 8480 | ||
| 8211 | sg_inc_cpu_power(sg, sd->groups->__cpu_power); | 8481 | sg->cpu_power += sd->groups->cpu_power; |
| 8212 | } | 8482 | } |
| 8213 | sg = sg->next; | 8483 | sg = sg->next; |
| 8214 | } while (sg != group_head); | 8484 | } while (sg != group_head); |
| 8215 | } | 8485 | } |
| 8486 | |||
| 8487 | static int build_numa_sched_groups(struct s_data *d, | ||
| 8488 | const struct cpumask *cpu_map, int num) | ||
| 8489 | { | ||
| 8490 | struct sched_domain *sd; | ||
| 8491 | struct sched_group *sg, *prev; | ||
| 8492 | int n, j; | ||
| 8493 | |||
| 8494 | cpumask_clear(d->covered); | ||
| 8495 | cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); | ||
| 8496 | if (cpumask_empty(d->nodemask)) { | ||
| 8497 | d->sched_group_nodes[num] = NULL; | ||
| 8498 | goto out; | ||
| 8499 | } | ||
| 8500 | |||
| 8501 | sched_domain_node_span(num, d->domainspan); | ||
| 8502 | cpumask_and(d->domainspan, d->domainspan, cpu_map); | ||
| 8503 | |||
| 8504 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
| 8505 | GFP_KERNEL, num); | ||
| 8506 | if (!sg) { | ||
| 8507 | printk(KERN_WARNING "Can not alloc domain group for node %d\n", | ||
| 8508 | num); | ||
| 8509 | return -ENOMEM; | ||
| 8510 | } | ||
| 8511 | d->sched_group_nodes[num] = sg; | ||
| 8512 | |||
| 8513 | for_each_cpu(j, d->nodemask) { | ||
| 8514 | sd = &per_cpu(node_domains, j).sd; | ||
| 8515 | sd->groups = sg; | ||
| 8516 | } | ||
| 8517 | |||
| 8518 | sg->cpu_power = 0; | ||
| 8519 | cpumask_copy(sched_group_cpus(sg), d->nodemask); | ||
| 8520 | sg->next = sg; | ||
| 8521 | cpumask_or(d->covered, d->covered, d->nodemask); | ||
| 8522 | |||
| 8523 | prev = sg; | ||
| 8524 | for (j = 0; j < nr_node_ids; j++) { | ||
| 8525 | n = (num + j) % nr_node_ids; | ||
| 8526 | cpumask_complement(d->notcovered, d->covered); | ||
| 8527 | cpumask_and(d->tmpmask, d->notcovered, cpu_map); | ||
| 8528 | cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); | ||
| 8529 | if (cpumask_empty(d->tmpmask)) | ||
| 8530 | break; | ||
| 8531 | cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); | ||
| 8532 | if (cpumask_empty(d->tmpmask)) | ||
| 8533 | continue; | ||
| 8534 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
| 8535 | GFP_KERNEL, num); | ||
| 8536 | if (!sg) { | ||
| 8537 | printk(KERN_WARNING | ||
| 8538 | "Can not alloc domain group for node %d\n", j); | ||
| 8539 | return -ENOMEM; | ||
| 8540 | } | ||
| 8541 | sg->cpu_power = 0; | ||
| 8542 | cpumask_copy(sched_group_cpus(sg), d->tmpmask); | ||
| 8543 | sg->next = prev->next; | ||
| 8544 | cpumask_or(d->covered, d->covered, d->tmpmask); | ||
| 8545 | prev->next = sg; | ||
| 8546 | prev = sg; | ||
| 8547 | } | ||
| 8548 | out: | ||
| 8549 | return 0; | ||
| 8550 | } | ||
| 8216 | #endif /* CONFIG_NUMA */ | 8551 | #endif /* CONFIG_NUMA */ |
| 8217 | 8552 | ||
| 8218 | #ifdef CONFIG_NUMA | 8553 | #ifdef CONFIG_NUMA |
| @@ -8266,15 +8601,13 @@ static void free_sched_groups(const struct cpumask *cpu_map, | |||
| 8266 | * there are asymmetries in the topology. If there are asymmetries, group | 8601 | * there are asymmetries in the topology. If there are asymmetries, group |
| 8267 | * having more cpu_power will pickup more load compared to the group having | 8602 | * having more cpu_power will pickup more load compared to the group having |
| 8268 | * less cpu_power. | 8603 | * less cpu_power. |
| 8269 | * | ||
| 8270 | * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents | ||
| 8271 | * the maximum number of tasks a group can handle in the presence of other idle | ||
| 8272 | * or lightly loaded groups in the same sched domain. | ||
| 8273 | */ | 8604 | */ |
| 8274 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) | 8605 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) |
| 8275 | { | 8606 | { |
| 8276 | struct sched_domain *child; | 8607 | struct sched_domain *child; |
| 8277 | struct sched_group *group; | 8608 | struct sched_group *group; |
| 8609 | long power; | ||
| 8610 | int weight; | ||
| 8278 | 8611 | ||
| 8279 | WARN_ON(!sd || !sd->groups); | 8612 | WARN_ON(!sd || !sd->groups); |
| 8280 | 8613 | ||
| @@ -8283,28 +8616,32 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
| 8283 | 8616 | ||
| 8284 | child = sd->child; | 8617 | child = sd->child; |
| 8285 | 8618 | ||
| 8286 | sd->groups->__cpu_power = 0; | 8619 | sd->groups->cpu_power = 0; |
| 8287 | 8620 | ||
| 8288 | /* | 8621 | if (!child) { |
| 8289 | * For perf policy, if the groups in child domain share resources | 8622 | power = SCHED_LOAD_SCALE; |
| 8290 | * (for example cores sharing some portions of the cache hierarchy | 8623 | weight = cpumask_weight(sched_domain_span(sd)); |
| 8291 | * or SMT), then set this domain groups cpu_power such that each group | 8624 | /* |
| 8292 | * can handle only one task, when there are other idle groups in the | 8625 | * SMT siblings share the power of a single core. |
| 8293 | * same sched domain. | 8626 | * Usually multiple threads get a better yield out of |
| 8294 | */ | 8627 | * that one core than a single thread would have, |
| 8295 | if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && | 8628 | * reflect that in sd->smt_gain. |
| 8296 | (child->flags & | 8629 | */ |
| 8297 | (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { | 8630 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { |
| 8298 | sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); | 8631 | power *= sd->smt_gain; |
| 8632 | power /= weight; | ||
| 8633 | power >>= SCHED_LOAD_SHIFT; | ||
| 8634 | } | ||
| 8635 | sd->groups->cpu_power += power; | ||
| 8299 | return; | 8636 | return; |
| 8300 | } | 8637 | } |
| 8301 | 8638 | ||
| 8302 | /* | 8639 | /* |
| 8303 | * add cpu_power of each child group to this groups cpu_power | 8640 | * Add cpu_power of each child group to this groups cpu_power. |
| 8304 | */ | 8641 | */ |
| 8305 | group = child->groups; | 8642 | group = child->groups; |
| 8306 | do { | 8643 | do { |
| 8307 | sg_inc_cpu_power(sd->groups, group->__cpu_power); | 8644 | sd->groups->cpu_power += group->cpu_power; |
| 8308 | group = group->next; | 8645 | group = group->next; |
| 8309 | } while (group != child->groups); | 8646 | } while (group != child->groups); |
| 8310 | } | 8647 | } |
| @@ -8378,280 +8715,285 @@ static void set_domain_attribute(struct sched_domain *sd, | |||
| 8378 | } | 8715 | } |
| 8379 | } | 8716 | } |
| 8380 | 8717 | ||
| 8381 | /* | 8718 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, |
| 8382 | * Build sched domains for a given set of cpus and attach the sched domains | 8719 | const struct cpumask *cpu_map) |
| 8383 | * to the individual cpus | 8720 | { |
| 8384 | */ | 8721 | switch (what) { |
| 8385 | static int __build_sched_domains(const struct cpumask *cpu_map, | 8722 | case sa_sched_groups: |
| 8386 | struct sched_domain_attr *attr) | 8723 | free_sched_groups(cpu_map, d->tmpmask); /* fall through */ |
| 8387 | { | 8724 | d->sched_group_nodes = NULL; |
| 8388 | int i, err = -ENOMEM; | 8725 | case sa_rootdomain: |
| 8389 | struct root_domain *rd; | 8726 | free_rootdomain(d->rd); /* fall through */ |
| 8390 | cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, | 8727 | case sa_tmpmask: |
| 8391 | tmpmask; | 8728 | free_cpumask_var(d->tmpmask); /* fall through */ |
| 8729 | case sa_send_covered: | ||
| 8730 | free_cpumask_var(d->send_covered); /* fall through */ | ||
| 8731 | case sa_this_core_map: | ||
| 8732 | free_cpumask_var(d->this_core_map); /* fall through */ | ||
| 8733 | case sa_this_sibling_map: | ||
| 8734 | free_cpumask_var(d->this_sibling_map); /* fall through */ | ||
| 8735 | case sa_nodemask: | ||
| 8736 | free_cpumask_var(d->nodemask); /* fall through */ | ||
| 8737 | case sa_sched_group_nodes: | ||
| 8392 | #ifdef CONFIG_NUMA | 8738 | #ifdef CONFIG_NUMA |
| 8393 | cpumask_var_t domainspan, covered, notcovered; | 8739 | kfree(d->sched_group_nodes); /* fall through */ |
| 8394 | struct sched_group **sched_group_nodes = NULL; | 8740 | case sa_notcovered: |
| 8395 | int sd_allnodes = 0; | 8741 | free_cpumask_var(d->notcovered); /* fall through */ |
| 8396 | 8742 | case sa_covered: | |
| 8397 | if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) | 8743 | free_cpumask_var(d->covered); /* fall through */ |
| 8398 | goto out; | 8744 | case sa_domainspan: |
| 8399 | if (!alloc_cpumask_var(&covered, GFP_KERNEL)) | 8745 | free_cpumask_var(d->domainspan); /* fall through */ |
| 8400 | goto free_domainspan; | 8746 | #endif |
| 8401 | if (!alloc_cpumask_var(&notcovered, GFP_KERNEL)) | 8747 | case sa_none: |
| 8402 | goto free_covered; | 8748 | break; |
| 8403 | #endif | 8749 | } |
| 8404 | 8750 | } | |
| 8405 | if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) | ||
| 8406 | goto free_notcovered; | ||
| 8407 | if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) | ||
| 8408 | goto free_nodemask; | ||
| 8409 | if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) | ||
| 8410 | goto free_this_sibling_map; | ||
| 8411 | if (!alloc_cpumask_var(&send_covered, GFP_KERNEL)) | ||
| 8412 | goto free_this_core_map; | ||
| 8413 | if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) | ||
| 8414 | goto free_send_covered; | ||
| 8415 | 8751 | ||
| 8752 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, | ||
| 8753 | const struct cpumask *cpu_map) | ||
| 8754 | { | ||
| 8416 | #ifdef CONFIG_NUMA | 8755 | #ifdef CONFIG_NUMA |
| 8417 | /* | 8756 | if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) |
| 8418 | * Allocate the per-node list of sched groups | 8757 | return sa_none; |
| 8419 | */ | 8758 | if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) |
| 8420 | sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), | 8759 | return sa_domainspan; |
| 8421 | GFP_KERNEL); | 8760 | if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) |
| 8422 | if (!sched_group_nodes) { | 8761 | return sa_covered; |
| 8762 | /* Allocate the per-node list of sched groups */ | ||
| 8763 | d->sched_group_nodes = kcalloc(nr_node_ids, | ||
| 8764 | sizeof(struct sched_group *), GFP_KERNEL); | ||
| 8765 | if (!d->sched_group_nodes) { | ||
| 8423 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 8766 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
| 8424 | goto free_tmpmask; | 8767 | return sa_notcovered; |
| 8425 | } | 8768 | } |
| 8426 | #endif | 8769 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; |
| 8427 | 8770 | #endif | |
| 8428 | rd = alloc_rootdomain(); | 8771 | if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) |
| 8429 | if (!rd) { | 8772 | return sa_sched_group_nodes; |
| 8773 | if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) | ||
| 8774 | return sa_nodemask; | ||
| 8775 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) | ||
| 8776 | return sa_this_sibling_map; | ||
| 8777 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | ||
| 8778 | return sa_this_core_map; | ||
| 8779 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) | ||
| 8780 | return sa_send_covered; | ||
| 8781 | d->rd = alloc_rootdomain(); | ||
| 8782 | if (!d->rd) { | ||
| 8430 | printk(KERN_WARNING "Cannot alloc root domain\n"); | 8783 | printk(KERN_WARNING "Cannot alloc root domain\n"); |
| 8431 | goto free_sched_groups; | 8784 | return sa_tmpmask; |
| 8432 | } | 8785 | } |
| 8786 | return sa_rootdomain; | ||
| 8787 | } | ||
| 8433 | 8788 | ||
| 8789 | static struct sched_domain *__build_numa_sched_domains(struct s_data *d, | ||
| 8790 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) | ||
| 8791 | { | ||
| 8792 | struct sched_domain *sd = NULL; | ||
| 8434 | #ifdef CONFIG_NUMA | 8793 | #ifdef CONFIG_NUMA |
| 8435 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; | 8794 | struct sched_domain *parent; |
| 8436 | #endif | ||
| 8437 | |||
| 8438 | /* | ||
| 8439 | * Set up domains for cpus specified by the cpu_map. | ||
| 8440 | */ | ||
| 8441 | for_each_cpu(i, cpu_map) { | ||
| 8442 | struct sched_domain *sd = NULL, *p; | ||
| 8443 | |||
| 8444 | cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); | ||
| 8445 | |||
| 8446 | #ifdef CONFIG_NUMA | ||
| 8447 | if (cpumask_weight(cpu_map) > | ||
| 8448 | SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { | ||
| 8449 | sd = &per_cpu(allnodes_domains, i).sd; | ||
| 8450 | SD_INIT(sd, ALLNODES); | ||
| 8451 | set_domain_attribute(sd, attr); | ||
| 8452 | cpumask_copy(sched_domain_span(sd), cpu_map); | ||
| 8453 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); | ||
| 8454 | p = sd; | ||
| 8455 | sd_allnodes = 1; | ||
| 8456 | } else | ||
| 8457 | p = NULL; | ||
| 8458 | 8795 | ||
| 8459 | sd = &per_cpu(node_domains, i).sd; | 8796 | d->sd_allnodes = 0; |
| 8460 | SD_INIT(sd, NODE); | 8797 | if (cpumask_weight(cpu_map) > |
| 8798 | SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { | ||
| 8799 | sd = &per_cpu(allnodes_domains, i).sd; | ||
| 8800 | SD_INIT(sd, ALLNODES); | ||
| 8461 | set_domain_attribute(sd, attr); | 8801 | set_domain_attribute(sd, attr); |
| 8462 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | 8802 | cpumask_copy(sched_domain_span(sd), cpu_map); |
| 8463 | sd->parent = p; | 8803 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); |
| 8464 | if (p) | 8804 | d->sd_allnodes = 1; |
| 8465 | p->child = sd; | 8805 | } |
| 8466 | cpumask_and(sched_domain_span(sd), | 8806 | parent = sd; |
| 8467 | sched_domain_span(sd), cpu_map); | 8807 | |
| 8808 | sd = &per_cpu(node_domains, i).sd; | ||
| 8809 | SD_INIT(sd, NODE); | ||
| 8810 | set_domain_attribute(sd, attr); | ||
| 8811 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | ||
| 8812 | sd->parent = parent; | ||
| 8813 | if (parent) | ||
| 8814 | parent->child = sd; | ||
| 8815 | cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); | ||
| 8468 | #endif | 8816 | #endif |
| 8817 | return sd; | ||
| 8818 | } | ||
| 8469 | 8819 | ||
| 8470 | p = sd; | 8820 | static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, |
| 8471 | sd = &per_cpu(phys_domains, i).sd; | 8821 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, |
| 8472 | SD_INIT(sd, CPU); | 8822 | struct sched_domain *parent, int i) |
| 8473 | set_domain_attribute(sd, attr); | 8823 | { |
| 8474 | cpumask_copy(sched_domain_span(sd), nodemask); | 8824 | struct sched_domain *sd; |
| 8475 | sd->parent = p; | 8825 | sd = &per_cpu(phys_domains, i).sd; |
| 8476 | if (p) | 8826 | SD_INIT(sd, CPU); |
| 8477 | p->child = sd; | 8827 | set_domain_attribute(sd, attr); |
| 8478 | cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); | 8828 | cpumask_copy(sched_domain_span(sd), d->nodemask); |
| 8829 | sd->parent = parent; | ||
| 8830 | if (parent) | ||
| 8831 | parent->child = sd; | ||
| 8832 | cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
| 8833 | return sd; | ||
| 8834 | } | ||
| 8479 | 8835 | ||
| 8836 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, | ||
| 8837 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
| 8838 | struct sched_domain *parent, int i) | ||
| 8839 | { | ||
| 8840 | struct sched_domain *sd = parent; | ||
| 8480 | #ifdef CONFIG_SCHED_MC | 8841 | #ifdef CONFIG_SCHED_MC |
| 8481 | p = sd; | 8842 | sd = &per_cpu(core_domains, i).sd; |
| 8482 | sd = &per_cpu(core_domains, i).sd; | 8843 | SD_INIT(sd, MC); |
| 8483 | SD_INIT(sd, MC); | 8844 | set_domain_attribute(sd, attr); |
| 8484 | set_domain_attribute(sd, attr); | 8845 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); |
| 8485 | cpumask_and(sched_domain_span(sd), cpu_map, | 8846 | sd->parent = parent; |
| 8486 | cpu_coregroup_mask(i)); | 8847 | parent->child = sd; |
| 8487 | sd->parent = p; | 8848 | cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); |
| 8488 | p->child = sd; | ||
| 8489 | cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); | ||
| 8490 | #endif | 8849 | #endif |
| 8850 | return sd; | ||
| 8851 | } | ||
| 8491 | 8852 | ||
| 8853 | static struct sched_domain *__build_smt_sched_domain(struct s_data *d, | ||
| 8854 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
| 8855 | struct sched_domain *parent, int i) | ||
| 8856 | { | ||
| 8857 | struct sched_domain *sd = parent; | ||
| 8492 | #ifdef CONFIG_SCHED_SMT | 8858 | #ifdef CONFIG_SCHED_SMT |
| 8493 | p = sd; | 8859 | sd = &per_cpu(cpu_domains, i).sd; |
| 8494 | sd = &per_cpu(cpu_domains, i).sd; | 8860 | SD_INIT(sd, SIBLING); |
| 8495 | SD_INIT(sd, SIBLING); | 8861 | set_domain_attribute(sd, attr); |
| 8496 | set_domain_attribute(sd, attr); | 8862 | cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); |
| 8497 | cpumask_and(sched_domain_span(sd), | 8863 | sd->parent = parent; |
| 8498 | topology_thread_cpumask(i), cpu_map); | 8864 | parent->child = sd; |
| 8499 | sd->parent = p; | 8865 | cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); |
| 8500 | p->child = sd; | ||
| 8501 | cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); | ||
| 8502 | #endif | 8866 | #endif |
| 8503 | } | 8867 | return sd; |
| 8868 | } | ||
| 8504 | 8869 | ||
| 8870 | static void build_sched_groups(struct s_data *d, enum sched_domain_level l, | ||
| 8871 | const struct cpumask *cpu_map, int cpu) | ||
| 8872 | { | ||
| 8873 | switch (l) { | ||
| 8505 | #ifdef CONFIG_SCHED_SMT | 8874 | #ifdef CONFIG_SCHED_SMT |
| 8506 | /* Set up CPU (sibling) groups */ | 8875 | case SD_LV_SIBLING: /* set up CPU (sibling) groups */ |
| 8507 | for_each_cpu(i, cpu_map) { | 8876 | cpumask_and(d->this_sibling_map, cpu_map, |
| 8508 | cpumask_and(this_sibling_map, | 8877 | topology_thread_cpumask(cpu)); |
| 8509 | topology_thread_cpumask(i), cpu_map); | 8878 | if (cpu == cpumask_first(d->this_sibling_map)) |
| 8510 | if (i != cpumask_first(this_sibling_map)) | 8879 | init_sched_build_groups(d->this_sibling_map, cpu_map, |
| 8511 | continue; | 8880 | &cpu_to_cpu_group, |
| 8512 | 8881 | d->send_covered, d->tmpmask); | |
| 8513 | init_sched_build_groups(this_sibling_map, cpu_map, | 8882 | break; |
| 8514 | &cpu_to_cpu_group, | ||
| 8515 | send_covered, tmpmask); | ||
| 8516 | } | ||
| 8517 | #endif | 8883 | #endif |
| 8518 | |||
| 8519 | #ifdef CONFIG_SCHED_MC | 8884 | #ifdef CONFIG_SCHED_MC |
| 8520 | /* Set up multi-core groups */ | 8885 | case SD_LV_MC: /* set up multi-core groups */ |
| 8521 | for_each_cpu(i, cpu_map) { | 8886 | cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); |
| 8522 | cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); | 8887 | if (cpu == cpumask_first(d->this_core_map)) |
| 8523 | if (i != cpumask_first(this_core_map)) | 8888 | init_sched_build_groups(d->this_core_map, cpu_map, |
| 8524 | continue; | 8889 | &cpu_to_core_group, |
| 8525 | 8890 | d->send_covered, d->tmpmask); | |
| 8526 | init_sched_build_groups(this_core_map, cpu_map, | 8891 | break; |
| 8527 | &cpu_to_core_group, | ||
| 8528 | send_covered, tmpmask); | ||
| 8529 | } | ||
| 8530 | #endif | 8892 | #endif |
| 8531 | 8893 | case SD_LV_CPU: /* set up physical groups */ | |
| 8532 | /* Set up physical groups */ | 8894 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); |
| 8533 | for (i = 0; i < nr_node_ids; i++) { | 8895 | if (!cpumask_empty(d->nodemask)) |
| 8534 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 8896 | init_sched_build_groups(d->nodemask, cpu_map, |
| 8535 | if (cpumask_empty(nodemask)) | 8897 | &cpu_to_phys_group, |
| 8536 | continue; | 8898 | d->send_covered, d->tmpmask); |
| 8537 | 8899 | break; | |
| 8538 | init_sched_build_groups(nodemask, cpu_map, | ||
| 8539 | &cpu_to_phys_group, | ||
| 8540 | send_covered, tmpmask); | ||
| 8541 | } | ||
| 8542 | |||
| 8543 | #ifdef CONFIG_NUMA | 8900 | #ifdef CONFIG_NUMA |
| 8544 | /* Set up node groups */ | 8901 | case SD_LV_ALLNODES: |
| 8545 | if (sd_allnodes) { | 8902 | init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, |
| 8546 | init_sched_build_groups(cpu_map, cpu_map, | 8903 | d->send_covered, d->tmpmask); |
| 8547 | &cpu_to_allnodes_group, | 8904 | break; |
| 8548 | send_covered, tmpmask); | 8905 | #endif |
| 8906 | default: | ||
| 8907 | break; | ||
| 8549 | } | 8908 | } |
| 8909 | } | ||
| 8550 | 8910 | ||
| 8551 | for (i = 0; i < nr_node_ids; i++) { | 8911 | /* |
| 8552 | /* Set up node groups */ | 8912 | * Build sched domains for a given set of cpus and attach the sched domains |
| 8553 | struct sched_group *sg, *prev; | 8913 | * to the individual cpus |
| 8554 | int j; | 8914 | */ |
| 8555 | 8915 | static int __build_sched_domains(const struct cpumask *cpu_map, | |
| 8556 | cpumask_clear(covered); | 8916 | struct sched_domain_attr *attr) |
| 8557 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 8917 | { |
| 8558 | if (cpumask_empty(nodemask)) { | 8918 | enum s_alloc alloc_state = sa_none; |
| 8559 | sched_group_nodes[i] = NULL; | 8919 | struct s_data d; |
| 8560 | continue; | 8920 | struct sched_domain *sd; |
| 8561 | } | 8921 | int i; |
| 8922 | #ifdef CONFIG_NUMA | ||
| 8923 | d.sd_allnodes = 0; | ||
| 8924 | #endif | ||
| 8562 | 8925 | ||
| 8563 | sched_domain_node_span(i, domainspan); | 8926 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); |
| 8564 | cpumask_and(domainspan, domainspan, cpu_map); | 8927 | if (alloc_state != sa_rootdomain) |
| 8928 | goto error; | ||
| 8929 | alloc_state = sa_sched_groups; | ||
| 8565 | 8930 | ||
| 8566 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | 8931 | /* |
| 8567 | GFP_KERNEL, i); | 8932 | * Set up domains for cpus specified by the cpu_map. |
| 8568 | if (!sg) { | 8933 | */ |
| 8569 | printk(KERN_WARNING "Can not alloc domain group for " | 8934 | for_each_cpu(i, cpu_map) { |
| 8570 | "node %d\n", i); | 8935 | cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), |
| 8571 | goto error; | 8936 | cpu_map); |
| 8572 | } | ||
| 8573 | sched_group_nodes[i] = sg; | ||
| 8574 | for_each_cpu(j, nodemask) { | ||
| 8575 | struct sched_domain *sd; | ||
| 8576 | 8937 | ||
| 8577 | sd = &per_cpu(node_domains, j).sd; | 8938 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); |
| 8578 | sd->groups = sg; | 8939 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); |
| 8579 | } | 8940 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); |
| 8580 | sg->__cpu_power = 0; | 8941 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); |
| 8581 | cpumask_copy(sched_group_cpus(sg), nodemask); | 8942 | } |
| 8582 | sg->next = sg; | ||
| 8583 | cpumask_or(covered, covered, nodemask); | ||
| 8584 | prev = sg; | ||
| 8585 | 8943 | ||
| 8586 | for (j = 0; j < nr_node_ids; j++) { | 8944 | for_each_cpu(i, cpu_map) { |
| 8587 | int n = (i + j) % nr_node_ids; | 8945 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); |
| 8946 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); | ||
| 8947 | } | ||
| 8588 | 8948 | ||
| 8589 | cpumask_complement(notcovered, covered); | 8949 | /* Set up physical groups */ |
| 8590 | cpumask_and(tmpmask, notcovered, cpu_map); | 8950 | for (i = 0; i < nr_node_ids; i++) |
| 8591 | cpumask_and(tmpmask, tmpmask, domainspan); | 8951 | build_sched_groups(&d, SD_LV_CPU, cpu_map, i); |
| 8592 | if (cpumask_empty(tmpmask)) | ||
| 8593 | break; | ||
| 8594 | 8952 | ||
| 8595 | cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); | 8953 | #ifdef CONFIG_NUMA |
| 8596 | if (cpumask_empty(tmpmask)) | 8954 | /* Set up node groups */ |
| 8597 | continue; | 8955 | if (d.sd_allnodes) |
| 8956 | build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); | ||
| 8598 | 8957 | ||
| 8599 | sg = kmalloc_node(sizeof(struct sched_group) + | 8958 | for (i = 0; i < nr_node_ids; i++) |
| 8600 | cpumask_size(), | 8959 | if (build_numa_sched_groups(&d, cpu_map, i)) |
| 8601 | GFP_KERNEL, i); | 8960 | goto error; |
| 8602 | if (!sg) { | ||
| 8603 | printk(KERN_WARNING | ||
| 8604 | "Can not alloc domain group for node %d\n", j); | ||
| 8605 | goto error; | ||
| 8606 | } | ||
| 8607 | sg->__cpu_power = 0; | ||
| 8608 | cpumask_copy(sched_group_cpus(sg), tmpmask); | ||
| 8609 | sg->next = prev->next; | ||
| 8610 | cpumask_or(covered, covered, tmpmask); | ||
| 8611 | prev->next = sg; | ||
| 8612 | prev = sg; | ||
| 8613 | } | ||
| 8614 | } | ||
| 8615 | #endif | 8961 | #endif |
| 8616 | 8962 | ||
| 8617 | /* Calculate CPU power for physical packages and nodes */ | 8963 | /* Calculate CPU power for physical packages and nodes */ |
| 8618 | #ifdef CONFIG_SCHED_SMT | 8964 | #ifdef CONFIG_SCHED_SMT |
| 8619 | for_each_cpu(i, cpu_map) { | 8965 | for_each_cpu(i, cpu_map) { |
| 8620 | struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; | 8966 | sd = &per_cpu(cpu_domains, i).sd; |
| 8621 | |||
| 8622 | init_sched_groups_power(i, sd); | 8967 | init_sched_groups_power(i, sd); |
| 8623 | } | 8968 | } |
| 8624 | #endif | 8969 | #endif |
| 8625 | #ifdef CONFIG_SCHED_MC | 8970 | #ifdef CONFIG_SCHED_MC |
| 8626 | for_each_cpu(i, cpu_map) { | 8971 | for_each_cpu(i, cpu_map) { |
| 8627 | struct sched_domain *sd = &per_cpu(core_domains, i).sd; | 8972 | sd = &per_cpu(core_domains, i).sd; |
| 8628 | |||
| 8629 | init_sched_groups_power(i, sd); | 8973 | init_sched_groups_power(i, sd); |
| 8630 | } | 8974 | } |
| 8631 | #endif | 8975 | #endif |
| 8632 | 8976 | ||
| 8633 | for_each_cpu(i, cpu_map) { | 8977 | for_each_cpu(i, cpu_map) { |
| 8634 | struct sched_domain *sd = &per_cpu(phys_domains, i).sd; | 8978 | sd = &per_cpu(phys_domains, i).sd; |
| 8635 | |||
| 8636 | init_sched_groups_power(i, sd); | 8979 | init_sched_groups_power(i, sd); |
| 8637 | } | 8980 | } |
| 8638 | 8981 | ||
| 8639 | #ifdef CONFIG_NUMA | 8982 | #ifdef CONFIG_NUMA |
| 8640 | for (i = 0; i < nr_node_ids; i++) | 8983 | for (i = 0; i < nr_node_ids; i++) |
| 8641 | init_numa_sched_groups_power(sched_group_nodes[i]); | 8984 | init_numa_sched_groups_power(d.sched_group_nodes[i]); |
| 8642 | 8985 | ||
| 8643 | if (sd_allnodes) { | 8986 | if (d.sd_allnodes) { |
| 8644 | struct sched_group *sg; | 8987 | struct sched_group *sg; |
| 8645 | 8988 | ||
| 8646 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, | 8989 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, |
| 8647 | tmpmask); | 8990 | d.tmpmask); |
| 8648 | init_numa_sched_groups_power(sg); | 8991 | init_numa_sched_groups_power(sg); |
| 8649 | } | 8992 | } |
| 8650 | #endif | 8993 | #endif |
| 8651 | 8994 | ||
| 8652 | /* Attach the domains */ | 8995 | /* Attach the domains */ |
| 8653 | for_each_cpu(i, cpu_map) { | 8996 | for_each_cpu(i, cpu_map) { |
| 8654 | struct sched_domain *sd; | ||
| 8655 | #ifdef CONFIG_SCHED_SMT | 8997 | #ifdef CONFIG_SCHED_SMT |
| 8656 | sd = &per_cpu(cpu_domains, i).sd; | 8998 | sd = &per_cpu(cpu_domains, i).sd; |
| 8657 | #elif defined(CONFIG_SCHED_MC) | 8999 | #elif defined(CONFIG_SCHED_MC) |
| @@ -8659,44 +9001,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
| 8659 | #else | 9001 | #else |
| 8660 | sd = &per_cpu(phys_domains, i).sd; | 9002 | sd = &per_cpu(phys_domains, i).sd; |
| 8661 | #endif | 9003 | #endif |
| 8662 | cpu_attach_domain(sd, rd, i); | 9004 | cpu_attach_domain(sd, d.rd, i); |
| 8663 | } | 9005 | } |
| 8664 | 9006 | ||
| 8665 | err = 0; | 9007 | d.sched_group_nodes = NULL; /* don't free this we still need it */ |
| 8666 | 9008 | __free_domain_allocs(&d, sa_tmpmask, cpu_map); | |
| 8667 | free_tmpmask: | 9009 | return 0; |
| 8668 | free_cpumask_var(tmpmask); | ||
| 8669 | free_send_covered: | ||
| 8670 | free_cpumask_var(send_covered); | ||
| 8671 | free_this_core_map: | ||
| 8672 | free_cpumask_var(this_core_map); | ||
| 8673 | free_this_sibling_map: | ||
| 8674 | free_cpumask_var(this_sibling_map); | ||
| 8675 | free_nodemask: | ||
| 8676 | free_cpumask_var(nodemask); | ||
| 8677 | free_notcovered: | ||
| 8678 | #ifdef CONFIG_NUMA | ||
| 8679 | free_cpumask_var(notcovered); | ||
| 8680 | free_covered: | ||
| 8681 | free_cpumask_var(covered); | ||
| 8682 | free_domainspan: | ||
| 8683 | free_cpumask_var(domainspan); | ||
| 8684 | out: | ||
| 8685 | #endif | ||
| 8686 | return err; | ||
| 8687 | |||
| 8688 | free_sched_groups: | ||
| 8689 | #ifdef CONFIG_NUMA | ||
| 8690 | kfree(sched_group_nodes); | ||
| 8691 | #endif | ||
| 8692 | goto free_tmpmask; | ||
| 8693 | 9010 | ||
| 8694 | #ifdef CONFIG_NUMA | ||
| 8695 | error: | 9011 | error: |
| 8696 | free_sched_groups(cpu_map, tmpmask); | 9012 | __free_domain_allocs(&d, alloc_state, cpu_map); |
| 8697 | free_rootdomain(rd); | 9013 | return -ENOMEM; |
| 8698 | goto free_tmpmask; | ||
| 8699 | #endif | ||
| 8700 | } | 9014 | } |
| 8701 | 9015 | ||
| 8702 | static int build_sched_domains(const struct cpumask *cpu_map) | 9016 | static int build_sched_domains(const struct cpumask *cpu_map) |
| @@ -9304,11 +9618,11 @@ void __init sched_init(void) | |||
| 9304 | * system cpu resource, based on the weight assigned to root | 9618 | * system cpu resource, based on the weight assigned to root |
| 9305 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished | 9619 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished |
| 9306 | * by letting tasks of init_task_group sit in a separate cfs_rq | 9620 | * by letting tasks of init_task_group sit in a separate cfs_rq |
| 9307 | * (init_cfs_rq) and having one entity represent this group of | 9621 | * (init_tg_cfs_rq) and having one entity represent this group of |
| 9308 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). | 9622 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). |
| 9309 | */ | 9623 | */ |
| 9310 | init_tg_cfs_entry(&init_task_group, | 9624 | init_tg_cfs_entry(&init_task_group, |
| 9311 | &per_cpu(init_cfs_rq, i), | 9625 | &per_cpu(init_tg_cfs_rq, i), |
| 9312 | &per_cpu(init_sched_entity, i), i, 1, | 9626 | &per_cpu(init_sched_entity, i), i, 1, |
| 9313 | root_task_group.se[i]); | 9627 | root_task_group.se[i]); |
| 9314 | 9628 | ||
| @@ -9334,6 +9648,7 @@ void __init sched_init(void) | |||
| 9334 | #ifdef CONFIG_SMP | 9648 | #ifdef CONFIG_SMP |
| 9335 | rq->sd = NULL; | 9649 | rq->sd = NULL; |
| 9336 | rq->rd = NULL; | 9650 | rq->rd = NULL; |
| 9651 | rq->post_schedule = 0; | ||
| 9337 | rq->active_balance = 0; | 9652 | rq->active_balance = 0; |
| 9338 | rq->next_balance = jiffies; | 9653 | rq->next_balance = jiffies; |
| 9339 | rq->push_cpu = 0; | 9654 | rq->push_cpu = 0; |
| @@ -9398,13 +9713,20 @@ void __init sched_init(void) | |||
| 9398 | } | 9713 | } |
| 9399 | 9714 | ||
| 9400 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 9715 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
| 9401 | void __might_sleep(char *file, int line) | 9716 | static inline int preempt_count_equals(int preempt_offset) |
| 9717 | { | ||
| 9718 | int nested = preempt_count() & ~PREEMPT_ACTIVE; | ||
| 9719 | |||
| 9720 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); | ||
| 9721 | } | ||
| 9722 | |||
| 9723 | void __might_sleep(char *file, int line, int preempt_offset) | ||
| 9402 | { | 9724 | { |
| 9403 | #ifdef in_atomic | 9725 | #ifdef in_atomic |
| 9404 | static unsigned long prev_jiffy; /* ratelimiting */ | 9726 | static unsigned long prev_jiffy; /* ratelimiting */ |
| 9405 | 9727 | ||
| 9406 | if ((!in_atomic() && !irqs_disabled()) || | 9728 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || |
| 9407 | system_state != SYSTEM_RUNNING || oops_in_progress) | 9729 | system_state != SYSTEM_RUNNING || oops_in_progress) |
| 9408 | return; | 9730 | return; |
| 9409 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) | 9731 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) |
| 9410 | return; | 9732 | return; |
| @@ -10581,3 +10903,113 @@ struct cgroup_subsys cpuacct_subsys = { | |||
| 10581 | .subsys_id = cpuacct_subsys_id, | 10903 | .subsys_id = cpuacct_subsys_id, |
| 10582 | }; | 10904 | }; |
| 10583 | #endif /* CONFIG_CGROUP_CPUACCT */ | 10905 | #endif /* CONFIG_CGROUP_CPUACCT */ |
| 10906 | |||
| 10907 | #ifndef CONFIG_SMP | ||
| 10908 | |||
| 10909 | int rcu_expedited_torture_stats(char *page) | ||
| 10910 | { | ||
| 10911 | return 0; | ||
| 10912 | } | ||
| 10913 | EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); | ||
| 10914 | |||
| 10915 | void synchronize_sched_expedited(void) | ||
| 10916 | { | ||
| 10917 | } | ||
| 10918 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
| 10919 | |||
| 10920 | #else /* #ifndef CONFIG_SMP */ | ||
| 10921 | |||
| 10922 | static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); | ||
| 10923 | static DEFINE_MUTEX(rcu_sched_expedited_mutex); | ||
| 10924 | |||
| 10925 | #define RCU_EXPEDITED_STATE_POST -2 | ||
| 10926 | #define RCU_EXPEDITED_STATE_IDLE -1 | ||
| 10927 | |||
| 10928 | static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; | ||
| 10929 | |||
| 10930 | int rcu_expedited_torture_stats(char *page) | ||
| 10931 | { | ||
| 10932 | int cnt = 0; | ||
| 10933 | int cpu; | ||
| 10934 | |||
| 10935 | cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); | ||
| 10936 | for_each_online_cpu(cpu) { | ||
| 10937 | cnt += sprintf(&page[cnt], " %d:%d", | ||
| 10938 | cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); | ||
| 10939 | } | ||
| 10940 | cnt += sprintf(&page[cnt], "\n"); | ||
| 10941 | return cnt; | ||
| 10942 | } | ||
| 10943 | EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); | ||
| 10944 | |||
| 10945 | static long synchronize_sched_expedited_count; | ||
| 10946 | |||
| 10947 | /* | ||
| 10948 | * Wait for an rcu-sched grace period to elapse, but use "big hammer" | ||
| 10949 | * approach to force grace period to end quickly. This consumes | ||
| 10950 | * significant time on all CPUs, and is thus not recommended for | ||
| 10951 | * any sort of common-case code. | ||
| 10952 | * | ||
| 10953 | * Note that it is illegal to call this function while holding any | ||
| 10954 | * lock that is acquired by a CPU-hotplug notifier. Failing to | ||
| 10955 | * observe this restriction will result in deadlock. | ||
| 10956 | */ | ||
| 10957 | void synchronize_sched_expedited(void) | ||
| 10958 | { | ||
| 10959 | int cpu; | ||
| 10960 | unsigned long flags; | ||
| 10961 | bool need_full_sync = 0; | ||
| 10962 | struct rq *rq; | ||
| 10963 | struct migration_req *req; | ||
| 10964 | long snap; | ||
| 10965 | int trycount = 0; | ||
| 10966 | |||
| 10967 | smp_mb(); /* ensure prior mod happens before capturing snap. */ | ||
| 10968 | snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; | ||
| 10969 | get_online_cpus(); | ||
| 10970 | while (!mutex_trylock(&rcu_sched_expedited_mutex)) { | ||
| 10971 | put_online_cpus(); | ||
| 10972 | if (trycount++ < 10) | ||
| 10973 | udelay(trycount * num_online_cpus()); | ||
| 10974 | else { | ||
| 10975 | synchronize_sched(); | ||
| 10976 | return; | ||
| 10977 | } | ||
| 10978 | if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { | ||
| 10979 | smp_mb(); /* ensure test happens before caller kfree */ | ||
| 10980 | return; | ||
| 10981 | } | ||
| 10982 | get_online_cpus(); | ||
| 10983 | } | ||
| 10984 | rcu_expedited_state = RCU_EXPEDITED_STATE_POST; | ||
| 10985 | for_each_online_cpu(cpu) { | ||
| 10986 | rq = cpu_rq(cpu); | ||
| 10987 | req = &per_cpu(rcu_migration_req, cpu); | ||
| 10988 | init_completion(&req->done); | ||
| 10989 | req->task = NULL; | ||
| 10990 | req->dest_cpu = RCU_MIGRATION_NEED_QS; | ||
| 10991 | spin_lock_irqsave(&rq->lock, flags); | ||
| 10992 | list_add(&req->list, &rq->migration_queue); | ||
| 10993 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 10994 | wake_up_process(rq->migration_thread); | ||
| 10995 | } | ||
| 10996 | for_each_online_cpu(cpu) { | ||
| 10997 | rcu_expedited_state = cpu; | ||
| 10998 | req = &per_cpu(rcu_migration_req, cpu); | ||
| 10999 | rq = cpu_rq(cpu); | ||
| 11000 | wait_for_completion(&req->done); | ||
| 11001 | spin_lock_irqsave(&rq->lock, flags); | ||
| 11002 | if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) | ||
| 11003 | need_full_sync = 1; | ||
| 11004 | req->dest_cpu = RCU_MIGRATION_IDLE; | ||
| 11005 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 11006 | } | ||
| 11007 | rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; | ||
| 11008 | mutex_unlock(&rcu_sched_expedited_mutex); | ||
| 11009 | put_online_cpus(); | ||
| 11010 | if (need_full_sync) | ||
| 11011 | synchronize_sched(); | ||
| 11012 | } | ||
| 11013 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
| 11014 | |||
| 11015 | #endif /* #else #ifndef CONFIG_SMP */ | ||
