Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 1234 |
1 file changed, 833 insertions, 401 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 1b59e265273b..d9db3fb17573 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -64,7 +64,6 @@ | |||
64 | #include <linux/tsacct_kern.h> | 64 | #include <linux/tsacct_kern.h> |
65 | #include <linux/kprobes.h> | 65 | #include <linux/kprobes.h> |
66 | #include <linux/delayacct.h> | 66 | #include <linux/delayacct.h> |
67 | #include <linux/reciprocal_div.h> | ||
68 | #include <linux/unistd.h> | 67 | #include <linux/unistd.h> |
69 | #include <linux/pagemap.h> | 68 | #include <linux/pagemap.h> |
70 | #include <linux/hrtimer.h> | 69 | #include <linux/hrtimer.h> |
@@ -120,30 +119,8 @@ | |||
120 | */ | 119 | */ |
121 | #define RUNTIME_INF ((u64)~0ULL) | 120 | #define RUNTIME_INF ((u64)~0ULL) |
122 | 121 | ||
123 | #ifdef CONFIG_SMP | ||
124 | |||
125 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); | 122 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); |
126 | 123 | ||
127 | /* | ||
128 | * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) | ||
129 | * Since cpu_power is a 'constant', we can use a reciprocal divide. | ||
130 | */ | ||
131 | static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) | ||
132 | { | ||
133 | return reciprocal_divide(load, sg->reciprocal_cpu_power); | ||
134 | } | ||
135 | |||
136 | /* | ||
137 | * Each time a sched group cpu_power is changed, | ||
138 | * we must compute its reciprocal value | ||
139 | */ | ||
140 | static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) | ||
141 | { | ||
142 | sg->__cpu_power += val; | ||
143 | sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); | ||
144 | } | ||
145 | #endif | ||
146 | |||
147 | static inline int rt_policy(int policy) | 124 | static inline int rt_policy(int policy) |
148 | { | 125 | { |
149 | if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) | 126 | if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) |
@@ -309,8 +286,8 @@ void set_tg_uid(struct user_struct *user) | |||
309 | 286 | ||
310 | /* | 287 | /* |
311 | * Root task group. | 288 | * Root task group. |
312 | * Every UID task group (including init_task_group aka UID-0) will | 289 | * Every UID task group (including init_task_group aka UID-0) will |
313 | * be a child to this group. | 290 | * be a child to this group. |
314 | */ | 291 | */ |
315 | struct task_group root_task_group; | 292 | struct task_group root_task_group; |
316 | 293 | ||
@@ -318,12 +295,12 @@ struct task_group root_task_group; | |||
318 | /* Default task group's sched entity on each cpu */ | 295 | /* Default task group's sched entity on each cpu */ |
319 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | 296 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); |
320 | /* Default task group's cfs_rq on each cpu */ | 297 | /* Default task group's cfs_rq on each cpu */ |
321 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 298 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq); |
322 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 299 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
323 | 300 | ||
324 | #ifdef CONFIG_RT_GROUP_SCHED | 301 | #ifdef CONFIG_RT_GROUP_SCHED |
325 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | 302 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); |
326 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | 303 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); |
327 | #endif /* CONFIG_RT_GROUP_SCHED */ | 304 | #endif /* CONFIG_RT_GROUP_SCHED */ |
328 | #else /* !CONFIG_USER_SCHED */ | 305 | #else /* !CONFIG_USER_SCHED */ |
329 | #define root_task_group init_task_group | 306 | #define root_task_group init_task_group |
@@ -616,6 +593,7 @@ struct rq { | |||
616 | 593 | ||
617 | unsigned char idle_at_tick; | 594 | unsigned char idle_at_tick; |
618 | /* For active balancing */ | 595 | /* For active balancing */ |
596 | int post_schedule; | ||
619 | int active_balance; | 597 | int active_balance; |
620 | int push_cpu; | 598 | int push_cpu; |
621 | /* cpu of this runqueue: */ | 599 | /* cpu of this runqueue: */ |
@@ -626,6 +604,9 @@ struct rq { | |||
626 | 604 | ||
627 | struct task_struct *migration_thread; | 605 | struct task_struct *migration_thread; |
628 | struct list_head migration_queue; | 606 | struct list_head migration_queue; |
607 | |||
608 | u64 rt_avg; | ||
609 | u64 age_stamp; | ||
629 | #endif | 610 | #endif |
630 | 611 | ||
631 | /* calc_load related fields */ | 612 | /* calc_load related fields */ |
@@ -693,6 +674,7 @@ static inline int cpu_of(struct rq *rq) | |||
693 | #define this_rq() (&__get_cpu_var(runqueues)) | 674 | #define this_rq() (&__get_cpu_var(runqueues)) |
694 | #define task_rq(p) cpu_rq(task_cpu(p)) | 675 | #define task_rq(p) cpu_rq(task_cpu(p)) |
695 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 676 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
677 | #define raw_rq() (&__raw_get_cpu_var(runqueues)) | ||
696 | 678 | ||
697 | inline void update_rq_clock(struct rq *rq) | 679 | inline void update_rq_clock(struct rq *rq) |
698 | { | 680 | { |
@@ -861,6 +843,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000; | |||
861 | unsigned int sysctl_sched_shares_thresh = 4; | 843 | unsigned int sysctl_sched_shares_thresh = 4; |
862 | 844 | ||
863 | /* | 845 | /* |
846 | * period over which we average the RT time consumption, measured | ||
847 | * in ms. | ||
848 | * | ||
849 | * default: 1s | ||
850 | */ | ||
851 | const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; | ||
852 | |||
853 | /* | ||
864 | * period over which we measure -rt task cpu usage in us. | 854 | * period over which we measure -rt task cpu usage in us. |
865 | * default: 1s | 855 | * default: 1s |
866 | */ | 856 | */ |
@@ -1278,12 +1268,37 @@ void wake_up_idle_cpu(int cpu) | |||
1278 | } | 1268 | } |
1279 | #endif /* CONFIG_NO_HZ */ | 1269 | #endif /* CONFIG_NO_HZ */ |
1280 | 1270 | ||
1271 | static u64 sched_avg_period(void) | ||
1272 | { | ||
1273 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; | ||
1274 | } | ||
1275 | |||
1276 | static void sched_avg_update(struct rq *rq) | ||
1277 | { | ||
1278 | s64 period = sched_avg_period(); | ||
1279 | |||
1280 | while ((s64)(rq->clock - rq->age_stamp) > period) { | ||
1281 | rq->age_stamp += period; | ||
1282 | rq->rt_avg /= 2; | ||
1283 | } | ||
1284 | } | ||
1285 | |||
1286 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
1287 | { | ||
1288 | rq->rt_avg += rt_delta; | ||
1289 | sched_avg_update(rq); | ||
1290 | } | ||
1291 | |||
1281 | #else /* !CONFIG_SMP */ | 1292 | #else /* !CONFIG_SMP */ |
1282 | static void resched_task(struct task_struct *p) | 1293 | static void resched_task(struct task_struct *p) |
1283 | { | 1294 | { |
1284 | assert_spin_locked(&task_rq(p)->lock); | 1295 | assert_spin_locked(&task_rq(p)->lock); |
1285 | set_tsk_need_resched(p); | 1296 | set_tsk_need_resched(p); |
1286 | } | 1297 | } |
1298 | |||
1299 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
1300 | { | ||
1301 | } | ||
1287 | #endif /* CONFIG_SMP */ | 1302 | #endif /* CONFIG_SMP */ |
1288 | 1303 | ||
1289 | #if BITS_PER_LONG == 32 | 1304 | #if BITS_PER_LONG == 32 |
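The hunk above introduces a decaying average of RT runtime: rq->rt_avg accumulates RT nanoseconds and is halved once for every elapsed half-period of sysctl_sched_time_avg (500 ms with the default of 1 s). The standalone sketch below is not kernel code; it only replays that arithmetic in userspace, assuming the 1 s default, to show how a steady 100 ms of RT work per 250 ms of wall clock settles the average.

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_MSEC	1000000ULL

static const uint64_t sysctl_sched_time_avg = 1000;	/* ms, default of 1s */

static uint64_t sched_avg_period(void)
{
	return sysctl_sched_time_avg * NSEC_PER_MSEC / 2;	/* half period, in ns */
}

int main(void)
{
	uint64_t clock = 0, age_stamp = 0, rt_avg = 0;
	int tick;

	for (tick = 1; tick <= 8; tick++) {
		clock += 250 * NSEC_PER_MSEC;	/* wall clock advances 250 ms */
		rt_avg += 100 * NSEC_PER_MSEC;	/* 100 ms of RT runtime observed */

		/* same decay loop as sched_avg_update() */
		while ((int64_t)(clock - age_stamp) > (int64_t)sched_avg_period()) {
			age_stamp += sched_avg_period();
			rt_avg /= 2;
		}
		printf("t=%4llu ms  rt_avg=%3llu ms\n",
		       (unsigned long long)(clock / NSEC_PER_MSEC),
		       (unsigned long long)(rt_avg / NSEC_PER_MSEC));
	}
	return 0;
}

scale_rt_power() later in this diff turns this average into the fraction of time left for fair tasks.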
@@ -1513,28 +1528,35 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
1513 | 1528 | ||
1514 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1529 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1515 | 1530 | ||
1531 | struct update_shares_data { | ||
1532 | unsigned long rq_weight[NR_CPUS]; | ||
1533 | }; | ||
1534 | |||
1535 | static DEFINE_PER_CPU(struct update_shares_data, update_shares_data); | ||
1536 | |||
1516 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | 1537 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); |
1517 | 1538 | ||
1518 | /* | 1539 | /* |
1519 | * Calculate and set the cpu's group shares. | 1540 | * Calculate and set the cpu's group shares. |
1520 | */ | 1541 | */ |
1521 | static void | 1542 | static void update_group_shares_cpu(struct task_group *tg, int cpu, |
1522 | update_group_shares_cpu(struct task_group *tg, int cpu, | 1543 | unsigned long sd_shares, |
1523 | unsigned long sd_shares, unsigned long sd_rq_weight) | 1544 | unsigned long sd_rq_weight, |
1545 | struct update_shares_data *usd) | ||
1524 | { | 1546 | { |
1525 | unsigned long shares; | 1547 | unsigned long shares, rq_weight; |
1526 | unsigned long rq_weight; | 1548 | int boost = 0; |
1527 | |||
1528 | if (!tg->se[cpu]) | ||
1529 | return; | ||
1530 | 1549 | ||
1531 | rq_weight = tg->cfs_rq[cpu]->rq_weight; | 1550 | rq_weight = usd->rq_weight[cpu]; |
1551 | if (!rq_weight) { | ||
1552 | boost = 1; | ||
1553 | rq_weight = NICE_0_LOAD; | ||
1554 | } | ||
1532 | 1555 | ||
1533 | /* | 1556 | /* |
1534 | * \Sum shares * rq_weight | 1557 | * \Sum_j shares_j * rq_weight_i |
1535 | * shares = ----------------------- | 1558 | * shares_i = ----------------------------- |
1536 | * \Sum rq_weight | 1559 | * \Sum_j rq_weight_j |
1537 | * | ||
1538 | */ | 1560 | */ |
1539 | shares = (sd_shares * rq_weight) / sd_rq_weight; | 1561 | shares = (sd_shares * rq_weight) / sd_rq_weight; |
1540 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); | 1562 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); |
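Reading the shares_i formula in the comment above with concrete numbers: a group's total shares are split across CPUs in proportion to each CPU's runqueue weight, then clamped. The sketch below is illustration only; the MIN_SHARES/MAX_SHARES values are assumptions for the example, not constants copied from this file.

#include <stdio.h>

#define MIN_SHARES	2UL		/* assumed clamp bounds */
#define MAX_SHARES	(1UL << 18)

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	/* a group with 1024 shares spread over two CPUs carrying a 3:1 load split */
	unsigned long sd_shares = 1024, rq_weight[2] = { 3072, 1024 };
	unsigned long sd_rq_weight = rq_weight[0] + rq_weight[1];
	int cpu;

	for (cpu = 0; cpu < 2; cpu++) {
		unsigned long shares = sd_shares * rq_weight[cpu] / sd_rq_weight;

		shares = clamp_ul(shares, MIN_SHARES, MAX_SHARES);
		printf("cpu%d gets %lu of %lu shares\n", cpu, shares, sd_shares);
	}
	return 0;
}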
@@ -1545,8 +1567,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu, | |||
1545 | unsigned long flags; | 1567 | unsigned long flags; |
1546 | 1568 | ||
1547 | spin_lock_irqsave(&rq->lock, flags); | 1569 | spin_lock_irqsave(&rq->lock, flags); |
1548 | tg->cfs_rq[cpu]->shares = shares; | 1570 | tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; |
1549 | 1571 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | |
1550 | __set_se_shares(tg->se[cpu], shares); | 1572 | __set_se_shares(tg->se[cpu], shares); |
1551 | spin_unlock_irqrestore(&rq->lock, flags); | 1573 | spin_unlock_irqrestore(&rq->lock, flags); |
1552 | } | 1574 | } |
@@ -1559,22 +1581,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu, | |||
1559 | */ | 1581 | */ |
1560 | static int tg_shares_up(struct task_group *tg, void *data) | 1582 | static int tg_shares_up(struct task_group *tg, void *data) |
1561 | { | 1583 | { |
1562 | unsigned long weight, rq_weight = 0; | 1584 | unsigned long weight, rq_weight = 0, shares = 0; |
1563 | unsigned long shares = 0; | 1585 | struct update_shares_data *usd; |
1564 | struct sched_domain *sd = data; | 1586 | struct sched_domain *sd = data; |
1587 | unsigned long flags; | ||
1565 | int i; | 1588 | int i; |
1566 | 1589 | ||
1590 | if (!tg->se[0]) | ||
1591 | return 0; | ||
1592 | |||
1593 | local_irq_save(flags); | ||
1594 | usd = &__get_cpu_var(update_shares_data); | ||
1595 | |||
1567 | for_each_cpu(i, sched_domain_span(sd)) { | 1596 | for_each_cpu(i, sched_domain_span(sd)) { |
1597 | weight = tg->cfs_rq[i]->load.weight; | ||
1598 | usd->rq_weight[i] = weight; | ||
1599 | |||
1568 | /* | 1600 | /* |
1569 | * If there are currently no tasks on the cpu pretend there | 1601 | * If there are currently no tasks on the cpu pretend there |
1570 | * is one of average load so that when a new task gets to | 1602 | * is one of average load so that when a new task gets to |
1571 | * run here it will not get delayed by group starvation. | 1603 | * run here it will not get delayed by group starvation. |
1572 | */ | 1604 | */ |
1573 | weight = tg->cfs_rq[i]->load.weight; | ||
1574 | if (!weight) | 1605 | if (!weight) |
1575 | weight = NICE_0_LOAD; | 1606 | weight = NICE_0_LOAD; |
1576 | 1607 | ||
1577 | tg->cfs_rq[i]->rq_weight = weight; | ||
1578 | rq_weight += weight; | 1608 | rq_weight += weight; |
1579 | shares += tg->cfs_rq[i]->shares; | 1609 | shares += tg->cfs_rq[i]->shares; |
1580 | } | 1610 | } |
@@ -1586,7 +1616,9 @@ static int tg_shares_up(struct task_group *tg, void *data) | |||
1586 | shares = tg->shares; | 1616 | shares = tg->shares; |
1587 | 1617 | ||
1588 | for_each_cpu(i, sched_domain_span(sd)) | 1618 | for_each_cpu(i, sched_domain_span(sd)) |
1589 | update_group_shares_cpu(tg, i, shares, rq_weight); | 1619 | update_group_shares_cpu(tg, i, shares, rq_weight, usd); |
1620 | |||
1621 | local_irq_restore(flags); | ||
1590 | 1622 | ||
1591 | return 0; | 1623 | return 0; |
1592 | } | 1624 | } |
@@ -1616,8 +1648,14 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
1616 | 1648 | ||
1617 | static void update_shares(struct sched_domain *sd) | 1649 | static void update_shares(struct sched_domain *sd) |
1618 | { | 1650 | { |
1619 | u64 now = cpu_clock(raw_smp_processor_id()); | 1651 | s64 elapsed; |
1620 | s64 elapsed = now - sd->last_update; | 1652 | u64 now; |
1653 | |||
1654 | if (root_task_group_empty()) | ||
1655 | return; | ||
1656 | |||
1657 | now = cpu_clock(raw_smp_processor_id()); | ||
1658 | elapsed = now - sd->last_update; | ||
1621 | 1659 | ||
1622 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | 1660 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { |
1623 | sd->last_update = now; | 1661 | sd->last_update = now; |
@@ -1627,6 +1665,9 @@ static void update_shares(struct sched_domain *sd) | |||
1627 | 1665 | ||
1628 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | 1666 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) |
1629 | { | 1667 | { |
1668 | if (root_task_group_empty()) | ||
1669 | return; | ||
1670 | |||
1630 | spin_unlock(&rq->lock); | 1671 | spin_unlock(&rq->lock); |
1631 | update_shares(sd); | 1672 | update_shares(sd); |
1632 | spin_lock(&rq->lock); | 1673 | spin_lock(&rq->lock); |
@@ -1634,6 +1675,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | |||
1634 | 1675 | ||
1635 | static void update_h_load(long cpu) | 1676 | static void update_h_load(long cpu) |
1636 | { | 1677 | { |
1678 | if (root_task_group_empty()) | ||
1679 | return; | ||
1680 | |||
1637 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 1681 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
1638 | } | 1682 | } |
1639 | 1683 | ||
@@ -2268,8 +2312,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | |||
2268 | } | 2312 | } |
2269 | 2313 | ||
2270 | /* Adjust by relative CPU power of the group */ | 2314 | /* Adjust by relative CPU power of the group */ |
2271 | avg_load = sg_div_cpu_power(group, | 2315 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; |
2272 | avg_load * SCHED_LOAD_SCALE); | ||
2273 | 2316 | ||
2274 | if (local_group) { | 2317 | if (local_group) { |
2275 | this_load = avg_load; | 2318 | this_load = avg_load; |
@@ -2637,9 +2680,32 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2637 | set_task_cpu(p, cpu); | 2680 | set_task_cpu(p, cpu); |
2638 | 2681 | ||
2639 | /* | 2682 | /* |
2640 | * Make sure we do not leak PI boosting priority to the child: | 2683 | * Make sure we do not leak PI boosting priority to the child. |
2641 | */ | 2684 | */ |
2642 | p->prio = current->normal_prio; | 2685 | p->prio = current->normal_prio; |
2686 | |||
2687 | /* | ||
2688 | * Revert to default priority/policy on fork if requested. | ||
2689 | */ | ||
2690 | if (unlikely(p->sched_reset_on_fork)) { | ||
2691 | if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) | ||
2692 | p->policy = SCHED_NORMAL; | ||
2693 | |||
2694 | if (p->normal_prio < DEFAULT_PRIO) | ||
2695 | p->prio = DEFAULT_PRIO; | ||
2696 | |||
2697 | if (PRIO_TO_NICE(p->static_prio) < 0) { | ||
2698 | p->static_prio = NICE_TO_PRIO(0); | ||
2699 | set_load_weight(p); | ||
2700 | } | ||
2701 | |||
2702 | /* | ||
2703 | * We don't need the reset flag anymore after the fork. It has | ||
2704 | * fulfilled its duty: | ||
2705 | */ | ||
2706 | p->sched_reset_on_fork = 0; | ||
2707 | } | ||
2708 | |||
2643 | if (!rt_prio(p->prio)) | 2709 | if (!rt_prio(p->prio)) |
2644 | p->sched_class = &fair_sched_class; | 2710 | p->sched_class = &fair_sched_class; |
2645 | 2711 | ||
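The sched_reset_on_fork handling added above lets a parent opt its children out of inherited RT policy and negative nice. The userspace sketch below shows how the flag is intended to be requested; it assumes SCHED_RESET_ON_FORK is the 0x40000000 policy bit ORed into sched_setscheduler(), and it needs RT privileges to actually succeed.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef SCHED_RESET_ON_FORK
#define SCHED_RESET_ON_FORK	0x40000000	/* assumed to match the kernel ABI */
#endif

int main(void)
{
	struct sched_param sp = { .sched_priority = 10 };

	/* become SCHED_FIFO, but ask that children revert to the defaults */
	if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp))
		perror("sched_setscheduler");

	if (fork() == 0) {
		/* expected to print 0 (SCHED_NORMAL), not SCHED_FIFO */
		printf("child policy: %d\n", sched_getscheduler(0));
		_exit(0);
	}
	wait(NULL);
	return 0;
}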
@@ -2796,12 +2862,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2796 | { | 2862 | { |
2797 | struct mm_struct *mm = rq->prev_mm; | 2863 | struct mm_struct *mm = rq->prev_mm; |
2798 | long prev_state; | 2864 | long prev_state; |
2799 | #ifdef CONFIG_SMP | ||
2800 | int post_schedule = 0; | ||
2801 | |||
2802 | if (current->sched_class->needs_post_schedule) | ||
2803 | post_schedule = current->sched_class->needs_post_schedule(rq); | ||
2804 | #endif | ||
2805 | 2865 | ||
2806 | rq->prev_mm = NULL; | 2866 | rq->prev_mm = NULL; |
2807 | 2867 | ||
@@ -2820,10 +2880,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2820 | finish_arch_switch(prev); | 2880 | finish_arch_switch(prev); |
2821 | perf_counter_task_sched_in(current, cpu_of(rq)); | 2881 | perf_counter_task_sched_in(current, cpu_of(rq)); |
2822 | finish_lock_switch(rq, prev); | 2882 | finish_lock_switch(rq, prev); |
2823 | #ifdef CONFIG_SMP | ||
2824 | if (post_schedule) | ||
2825 | current->sched_class->post_schedule(rq); | ||
2826 | #endif | ||
2827 | 2883 | ||
2828 | fire_sched_in_preempt_notifiers(current); | 2884 | fire_sched_in_preempt_notifiers(current); |
2829 | if (mm) | 2885 | if (mm) |
@@ -2838,6 +2894,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2838 | } | 2894 | } |
2839 | } | 2895 | } |
2840 | 2896 | ||
2897 | #ifdef CONFIG_SMP | ||
2898 | |||
2899 | /* assumes rq->lock is held */ | ||
2900 | static inline void pre_schedule(struct rq *rq, struct task_struct *prev) | ||
2901 | { | ||
2902 | if (prev->sched_class->pre_schedule) | ||
2903 | prev->sched_class->pre_schedule(rq, prev); | ||
2904 | } | ||
2905 | |||
2906 | /* rq->lock is NOT held, but preemption is disabled */ | ||
2907 | static inline void post_schedule(struct rq *rq) | ||
2908 | { | ||
2909 | if (rq->post_schedule) { | ||
2910 | unsigned long flags; | ||
2911 | |||
2912 | spin_lock_irqsave(&rq->lock, flags); | ||
2913 | if (rq->curr->sched_class->post_schedule) | ||
2914 | rq->curr->sched_class->post_schedule(rq); | ||
2915 | spin_unlock_irqrestore(&rq->lock, flags); | ||
2916 | |||
2917 | rq->post_schedule = 0; | ||
2918 | } | ||
2919 | } | ||
2920 | |||
2921 | #else | ||
2922 | |||
2923 | static inline void pre_schedule(struct rq *rq, struct task_struct *p) | ||
2924 | { | ||
2925 | } | ||
2926 | |||
2927 | static inline void post_schedule(struct rq *rq) | ||
2928 | { | ||
2929 | } | ||
2930 | |||
2931 | #endif | ||
2932 | |||
2841 | /** | 2933 | /** |
2842 | * schedule_tail - first thing a freshly forked thread must call. | 2934 | * schedule_tail - first thing a freshly forked thread must call. |
2843 | * @prev: the thread we just switched away from. | 2935 | * @prev: the thread we just switched away from. |
@@ -2848,6 +2940,13 @@ asmlinkage void schedule_tail(struct task_struct *prev) | |||
2848 | struct rq *rq = this_rq(); | 2940 | struct rq *rq = this_rq(); |
2849 | 2941 | ||
2850 | finish_task_switch(rq, prev); | 2942 | finish_task_switch(rq, prev); |
2943 | |||
2944 | /* | ||
2945 | * FIXME: do we need to worry about rq being invalidated by the | ||
2946 | * task_switch? | ||
2947 | */ | ||
2948 | post_schedule(rq); | ||
2949 | |||
2851 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | 2950 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW |
2852 | /* In this case, finish_task_switch does not reenable preemption */ | 2951 | /* In this case, finish_task_switch does not reenable preemption */ |
2853 | preempt_enable(); | 2952 | preempt_enable(); |
@@ -3379,9 +3478,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
3379 | { | 3478 | { |
3380 | const struct sched_class *class; | 3479 | const struct sched_class *class; |
3381 | 3480 | ||
3382 | for (class = sched_class_highest; class; class = class->next) | 3481 | for_each_class(class) { |
3383 | if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) | 3482 | if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) |
3384 | return 1; | 3483 | return 1; |
3484 | } | ||
3385 | 3485 | ||
3386 | return 0; | 3486 | return 0; |
3387 | } | 3487 | } |
@@ -3544,7 +3644,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group, | |||
3544 | * capacity but still has some space to pick up some load | 3644 | * capacity but still has some space to pick up some load |
3545 | * from other group and save more power | 3645 | * from other group and save more power |
3546 | */ | 3646 | */ |
3547 | if (sgs->sum_nr_running > sgs->group_capacity - 1) | 3647 | if (sgs->sum_nr_running + 1 > sgs->group_capacity) |
3548 | return; | 3648 | return; |
3549 | 3649 | ||
3550 | if (sgs->sum_nr_running > sds->leader_nr_running || | 3650 | if (sgs->sum_nr_running > sds->leader_nr_running || |
@@ -3611,6 +3711,77 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
3611 | } | 3711 | } |
3612 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | 3712 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ |
3613 | 3713 | ||
3714 | unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | ||
3715 | { | ||
3716 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
3717 | unsigned long smt_gain = sd->smt_gain; | ||
3718 | |||
3719 | smt_gain /= weight; | ||
3720 | |||
3721 | return smt_gain; | ||
3722 | } | ||
3723 | |||
3724 | unsigned long scale_rt_power(int cpu) | ||
3725 | { | ||
3726 | struct rq *rq = cpu_rq(cpu); | ||
3727 | u64 total, available; | ||
3728 | |||
3729 | sched_avg_update(rq); | ||
3730 | |||
3731 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | ||
3732 | available = total - rq->rt_avg; | ||
3733 | |||
3734 | if (unlikely((s64)total < SCHED_LOAD_SCALE)) | ||
3735 | total = SCHED_LOAD_SCALE; | ||
3736 | |||
3737 | total >>= SCHED_LOAD_SHIFT; | ||
3738 | |||
3739 | return div_u64(available, total); | ||
3740 | } | ||
3741 | |||
3742 | static void update_cpu_power(struct sched_domain *sd, int cpu) | ||
3743 | { | ||
3744 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
3745 | unsigned long power = SCHED_LOAD_SCALE; | ||
3746 | struct sched_group *sdg = sd->groups; | ||
3747 | |||
3748 | /* here we could scale based on cpufreq */ | ||
3749 | |||
3750 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | ||
3751 | power *= arch_scale_smt_power(sd, cpu); | ||
3752 | power >>= SCHED_LOAD_SHIFT; | ||
3753 | } | ||
3754 | |||
3755 | power *= scale_rt_power(cpu); | ||
3756 | power >>= SCHED_LOAD_SHIFT; | ||
3757 | |||
3758 | if (!power) | ||
3759 | power = 1; | ||
3760 | |||
3761 | sdg->cpu_power = power; | ||
3762 | } | ||
3763 | |||
3764 | static void update_group_power(struct sched_domain *sd, int cpu) | ||
3765 | { | ||
3766 | struct sched_domain *child = sd->child; | ||
3767 | struct sched_group *group, *sdg = sd->groups; | ||
3768 | unsigned long power; | ||
3769 | |||
3770 | if (!child) { | ||
3771 | update_cpu_power(sd, cpu); | ||
3772 | return; | ||
3773 | } | ||
3774 | |||
3775 | power = 0; | ||
3776 | |||
3777 | group = child->groups; | ||
3778 | do { | ||
3779 | power += group->cpu_power; | ||
3780 | group = group->next; | ||
3781 | } while (group != child->groups); | ||
3782 | |||
3783 | sdg->cpu_power = power; | ||
3784 | } | ||
3614 | 3785 | ||
3615 | /** | 3786 | /** |
3616 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 3787 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
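update_cpu_power() above composes two scale factors in SCHED_LOAD_SCALE fixed point: an SMT factor (smt_gain spread over the number of siblings) and the fraction of time scale_rt_power() says is left after RT activity, each applied as a multiply followed by a shift. A rough userspace rendering of that arithmetic, with an assumed smt_gain and a made-up RT fraction:

#include <stdio.h>

#define SCHED_LOAD_SHIFT	10
#define SCHED_LOAD_SCALE	(1UL << SCHED_LOAD_SHIFT)

int main(void)
{
	unsigned long power = SCHED_LOAD_SCALE;
	unsigned long smt_gain = 1178;	/* assumed default, ~1.15 in fixed point */
	unsigned long weight = 2;	/* two hardware threads share the core */
	unsigned long rt_scale = 768;	/* assume RT left 75% of the time available */

	/* SMT: each sibling gets smt_gain/weight of a core, scaled back down */
	power *= smt_gain / weight;
	power >>= SCHED_LOAD_SHIFT;

	/* RT: multiply by the available fraction, as scale_rt_power() returns it */
	power *= rt_scale;
	power >>= SCHED_LOAD_SHIFT;

	printf("cpu_power = %lu of %lu\n", power, SCHED_LOAD_SCALE);
	return 0;
}

update_group_power() then simply sums these per-CPU values up the domain hierarchy.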
@@ -3624,7 +3795,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
3624 | * @balance: Should we balance. | 3795 | * @balance: Should we balance. |
3625 | * @sgs: variable to hold the statistics for this group. | 3796 | * @sgs: variable to hold the statistics for this group. |
3626 | */ | 3797 | */ |
3627 | static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | 3798 | static inline void update_sg_lb_stats(struct sched_domain *sd, |
3799 | struct sched_group *group, int this_cpu, | ||
3628 | enum cpu_idle_type idle, int load_idx, int *sd_idle, | 3800 | enum cpu_idle_type idle, int load_idx, int *sd_idle, |
3629 | int local_group, const struct cpumask *cpus, | 3801 | int local_group, const struct cpumask *cpus, |
3630 | int *balance, struct sg_lb_stats *sgs) | 3802 | int *balance, struct sg_lb_stats *sgs) |
@@ -3635,8 +3807,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | |||
3635 | unsigned long sum_avg_load_per_task; | 3807 | unsigned long sum_avg_load_per_task; |
3636 | unsigned long avg_load_per_task; | 3808 | unsigned long avg_load_per_task; |
3637 | 3809 | ||
3638 | if (local_group) | 3810 | if (local_group) { |
3639 | balance_cpu = group_first_cpu(group); | 3811 | balance_cpu = group_first_cpu(group); |
3812 | if (balance_cpu == this_cpu) | ||
3813 | update_group_power(sd, this_cpu); | ||
3814 | } | ||
3640 | 3815 | ||
3641 | /* Tally up the load of all CPUs in the group */ | 3816 | /* Tally up the load of all CPUs in the group */ |
3642 | sum_avg_load_per_task = avg_load_per_task = 0; | 3817 | sum_avg_load_per_task = avg_load_per_task = 0; |
@@ -3685,8 +3860,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | |||
3685 | } | 3860 | } |
3686 | 3861 | ||
3687 | /* Adjust by relative CPU power of the group */ | 3862 | /* Adjust by relative CPU power of the group */ |
3688 | sgs->avg_load = sg_div_cpu_power(group, | 3863 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; |
3689 | sgs->group_load * SCHED_LOAD_SCALE); | ||
3690 | 3864 | ||
3691 | 3865 | ||
3692 | /* | 3866 | /* |
@@ -3698,14 +3872,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | |||
3698 | * normalized nr_running number somewhere that negates | 3872 | * normalized nr_running number somewhere that negates |
3699 | * the hierarchy? | 3873 | * the hierarchy? |
3700 | */ | 3874 | */ |
3701 | avg_load_per_task = sg_div_cpu_power(group, | 3875 | avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) / |
3702 | sum_avg_load_per_task * SCHED_LOAD_SCALE); | 3876 | group->cpu_power; |
3703 | 3877 | ||
3704 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | 3878 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) |
3705 | sgs->group_imb = 1; | 3879 | sgs->group_imb = 1; |
3706 | 3880 | ||
3707 | sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; | 3881 | sgs->group_capacity = |
3708 | 3882 | DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | |
3709 | } | 3883 | } |
3710 | 3884 | ||
3711 | /** | 3885 | /** |
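The group_capacity computation above now rounds to the nearest whole CPU instead of truncating, and with variable cpu_power a weak group can round down to a capacity of 0, which the find_busiest_queue() hunk later in this diff guards against with its capacity check. A small illustration of the rounding, assuming SCHED_LOAD_SCALE is 1024 and using a simplified, unsigned-only stand-in for the kernel macro:

#include <stdio.h>

#define SCHED_LOAD_SCALE	1024UL
/* simplified stand-in for the kernel's DIV_ROUND_CLOSEST() */
#define DIV_ROUND_CLOSEST(x, d)	(((x) + (d) / 2) / (d))

int main(void)
{
	unsigned long powers[] = { 400, 800, 1024, 1600, 2048 };
	unsigned int i;

	for (i = 0; i < sizeof(powers) / sizeof(powers[0]); i++)
		printf("cpu_power %4lu -> capacity %lu (truncated: %lu)\n",
		       powers[i],
		       DIV_ROUND_CLOSEST(powers[i], SCHED_LOAD_SCALE),
		       powers[i] / SCHED_LOAD_SCALE);
	return 0;
}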
@@ -3723,9 +3897,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3723 | const struct cpumask *cpus, int *balance, | 3897 | const struct cpumask *cpus, int *balance, |
3724 | struct sd_lb_stats *sds) | 3898 | struct sd_lb_stats *sds) |
3725 | { | 3899 | { |
3900 | struct sched_domain *child = sd->child; | ||
3726 | struct sched_group *group = sd->groups; | 3901 | struct sched_group *group = sd->groups; |
3727 | struct sg_lb_stats sgs; | 3902 | struct sg_lb_stats sgs; |
3728 | int load_idx; | 3903 | int load_idx, prefer_sibling = 0; |
3904 | |||
3905 | if (child && child->flags & SD_PREFER_SIBLING) | ||
3906 | prefer_sibling = 1; | ||
3729 | 3907 | ||
3730 | init_sd_power_savings_stats(sd, sds, idle); | 3908 | init_sd_power_savings_stats(sd, sds, idle); |
3731 | load_idx = get_sd_load_idx(sd, idle); | 3909 | load_idx = get_sd_load_idx(sd, idle); |
@@ -3736,14 +3914,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3736 | local_group = cpumask_test_cpu(this_cpu, | 3914 | local_group = cpumask_test_cpu(this_cpu, |
3737 | sched_group_cpus(group)); | 3915 | sched_group_cpus(group)); |
3738 | memset(&sgs, 0, sizeof(sgs)); | 3916 | memset(&sgs, 0, sizeof(sgs)); |
3739 | update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, | 3917 | update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, |
3740 | local_group, cpus, balance, &sgs); | 3918 | local_group, cpus, balance, &sgs); |
3741 | 3919 | ||
3742 | if (local_group && balance && !(*balance)) | 3920 | if (local_group && balance && !(*balance)) |
3743 | return; | 3921 | return; |
3744 | 3922 | ||
3745 | sds->total_load += sgs.group_load; | 3923 | sds->total_load += sgs.group_load; |
3746 | sds->total_pwr += group->__cpu_power; | 3924 | sds->total_pwr += group->cpu_power; |
3925 | |||
3926 | /* | ||
3927 | * In case the child domain prefers tasks go to siblings | ||
3928 | * first, lower the group capacity to one so that we'll try | ||
3929 | * and move all the excess tasks away. | ||
3930 | */ | ||
3931 | if (prefer_sibling) | ||
3932 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | ||
3747 | 3933 | ||
3748 | if (local_group) { | 3934 | if (local_group) { |
3749 | sds->this_load = sgs.avg_load; | 3935 | sds->this_load = sgs.avg_load; |
@@ -3763,7 +3949,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3763 | update_sd_power_savings_stats(group, sds, local_group, &sgs); | 3949 | update_sd_power_savings_stats(group, sds, local_group, &sgs); |
3764 | group = group->next; | 3950 | group = group->next; |
3765 | } while (group != sd->groups); | 3951 | } while (group != sd->groups); |
3766 | |||
3767 | } | 3952 | } |
3768 | 3953 | ||
3769 | /** | 3954 | /** |
@@ -3801,28 +3986,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
3801 | * moving them. | 3986 | * moving them. |
3802 | */ | 3987 | */ |
3803 | 3988 | ||
3804 | pwr_now += sds->busiest->__cpu_power * | 3989 | pwr_now += sds->busiest->cpu_power * |
3805 | min(sds->busiest_load_per_task, sds->max_load); | 3990 | min(sds->busiest_load_per_task, sds->max_load); |
3806 | pwr_now += sds->this->__cpu_power * | 3991 | pwr_now += sds->this->cpu_power * |
3807 | min(sds->this_load_per_task, sds->this_load); | 3992 | min(sds->this_load_per_task, sds->this_load); |
3808 | pwr_now /= SCHED_LOAD_SCALE; | 3993 | pwr_now /= SCHED_LOAD_SCALE; |
3809 | 3994 | ||
3810 | /* Amount of load we'd subtract */ | 3995 | /* Amount of load we'd subtract */ |
3811 | tmp = sg_div_cpu_power(sds->busiest, | 3996 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / |
3812 | sds->busiest_load_per_task * SCHED_LOAD_SCALE); | 3997 | sds->busiest->cpu_power; |
3813 | if (sds->max_load > tmp) | 3998 | if (sds->max_load > tmp) |
3814 | pwr_move += sds->busiest->__cpu_power * | 3999 | pwr_move += sds->busiest->cpu_power * |
3815 | min(sds->busiest_load_per_task, sds->max_load - tmp); | 4000 | min(sds->busiest_load_per_task, sds->max_load - tmp); |
3816 | 4001 | ||
3817 | /* Amount of load we'd add */ | 4002 | /* Amount of load we'd add */ |
3818 | if (sds->max_load * sds->busiest->__cpu_power < | 4003 | if (sds->max_load * sds->busiest->cpu_power < |
3819 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) | 4004 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) |
3820 | tmp = sg_div_cpu_power(sds->this, | 4005 | tmp = (sds->max_load * sds->busiest->cpu_power) / |
3821 | sds->max_load * sds->busiest->__cpu_power); | 4006 | sds->this->cpu_power; |
3822 | else | 4007 | else |
3823 | tmp = sg_div_cpu_power(sds->this, | 4008 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / |
3824 | sds->busiest_load_per_task * SCHED_LOAD_SCALE); | 4009 | sds->this->cpu_power; |
3825 | pwr_move += sds->this->__cpu_power * | 4010 | pwr_move += sds->this->cpu_power * |
3826 | min(sds->this_load_per_task, sds->this_load + tmp); | 4011 | min(sds->this_load_per_task, sds->this_load + tmp); |
3827 | pwr_move /= SCHED_LOAD_SCALE; | 4012 | pwr_move /= SCHED_LOAD_SCALE; |
3828 | 4013 | ||
@@ -3857,8 +4042,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
3857 | sds->max_load - sds->busiest_load_per_task); | 4042 | sds->max_load - sds->busiest_load_per_task); |
3858 | 4043 | ||
3859 | /* How much load to actually move to equalise the imbalance */ | 4044 | /* How much load to actually move to equalise the imbalance */ |
3860 | *imbalance = min(max_pull * sds->busiest->__cpu_power, | 4045 | *imbalance = min(max_pull * sds->busiest->cpu_power, |
3861 | (sds->avg_load - sds->this_load) * sds->this->__cpu_power) | 4046 | (sds->avg_load - sds->this_load) * sds->this->cpu_power) |
3862 | / SCHED_LOAD_SCALE; | 4047 | / SCHED_LOAD_SCALE; |
3863 | 4048 | ||
3864 | /* | 4049 | /* |
@@ -3976,6 +4161,26 @@ ret: | |||
3976 | return NULL; | 4161 | return NULL; |
3977 | } | 4162 | } |
3978 | 4163 | ||
4164 | static struct sched_group *group_of(int cpu) | ||
4165 | { | ||
4166 | struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); | ||
4167 | |||
4168 | if (!sd) | ||
4169 | return NULL; | ||
4170 | |||
4171 | return sd->groups; | ||
4172 | } | ||
4173 | |||
4174 | static unsigned long power_of(int cpu) | ||
4175 | { | ||
4176 | struct sched_group *group = group_of(cpu); | ||
4177 | |||
4178 | if (!group) | ||
4179 | return SCHED_LOAD_SCALE; | ||
4180 | |||
4181 | return group->cpu_power; | ||
4182 | } | ||
4183 | |||
3979 | /* | 4184 | /* |
3980 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 4185 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
3981 | */ | 4186 | */ |
@@ -3988,15 +4193,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | |||
3988 | int i; | 4193 | int i; |
3989 | 4194 | ||
3990 | for_each_cpu(i, sched_group_cpus(group)) { | 4195 | for_each_cpu(i, sched_group_cpus(group)) { |
4196 | unsigned long power = power_of(i); | ||
4197 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | ||
3991 | unsigned long wl; | 4198 | unsigned long wl; |
3992 | 4199 | ||
3993 | if (!cpumask_test_cpu(i, cpus)) | 4200 | if (!cpumask_test_cpu(i, cpus)) |
3994 | continue; | 4201 | continue; |
3995 | 4202 | ||
3996 | rq = cpu_rq(i); | 4203 | rq = cpu_rq(i); |
3997 | wl = weighted_cpuload(i); | 4204 | wl = weighted_cpuload(i) * SCHED_LOAD_SCALE; |
4205 | wl /= power; | ||
3998 | 4206 | ||
3999 | if (rq->nr_running == 1 && wl > imbalance) | 4207 | if (capacity && rq->nr_running == 1 && wl > imbalance) |
4000 | continue; | 4208 | continue; |
4001 | 4209 | ||
4002 | if (wl > max_load) { | 4210 | if (wl > max_load) { |
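With the change above, find_busiest_queue() compares weighted_cpuload() scaled by SCHED_LOAD_SCALE and divided by the CPU's power, so a CPU whose capacity has been eroded (for example by RT time) ranks as busier at the same raw load. A toy comparison, not kernel code:

#include <stdio.h>

#define SCHED_LOAD_SCALE	1024UL

int main(void)
{
	unsigned long load[2]  = { 2048, 2048 };	/* identical raw weighted load */
	unsigned long power[2] = { 1024, 512 };		/* cpu1 lost half its power */
	int i;

	for (i = 0; i < 2; i++)
		printf("cpu%d: scaled load %lu\n", i,
		       load[i] * SCHED_LOAD_SCALE / power[i]);
	return 0;
}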
@@ -5325,7 +5533,7 @@ need_resched: | |||
5325 | preempt_disable(); | 5533 | preempt_disable(); |
5326 | cpu = smp_processor_id(); | 5534 | cpu = smp_processor_id(); |
5327 | rq = cpu_rq(cpu); | 5535 | rq = cpu_rq(cpu); |
5328 | rcu_qsctr_inc(cpu); | 5536 | rcu_sched_qs(cpu); |
5329 | prev = rq->curr; | 5537 | prev = rq->curr; |
5330 | switch_count = &prev->nivcsw; | 5538 | switch_count = &prev->nivcsw; |
5331 | 5539 | ||
@@ -5349,10 +5557,7 @@ need_resched_nonpreemptible: | |||
5349 | switch_count = &prev->nvcsw; | 5557 | switch_count = &prev->nvcsw; |
5350 | } | 5558 | } |
5351 | 5559 | ||
5352 | #ifdef CONFIG_SMP | 5560 | pre_schedule(rq, prev); |
5353 | if (prev->sched_class->pre_schedule) | ||
5354 | prev->sched_class->pre_schedule(rq, prev); | ||
5355 | #endif | ||
5356 | 5561 | ||
5357 | if (unlikely(!rq->nr_running)) | 5562 | if (unlikely(!rq->nr_running)) |
5358 | idle_balance(cpu, rq); | 5563 | idle_balance(cpu, rq); |
@@ -5378,6 +5583,8 @@ need_resched_nonpreemptible: | |||
5378 | } else | 5583 | } else |
5379 | spin_unlock_irq(&rq->lock); | 5584 | spin_unlock_irq(&rq->lock); |
5380 | 5585 | ||
5586 | post_schedule(rq); | ||
5587 | |||
5381 | if (unlikely(reacquire_kernel_lock(current) < 0)) | 5588 | if (unlikely(reacquire_kernel_lock(current) < 0)) |
5382 | goto need_resched_nonpreemptible; | 5589 | goto need_resched_nonpreemptible; |
5383 | 5590 | ||
@@ -6123,17 +6330,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy, | |||
6123 | unsigned long flags; | 6330 | unsigned long flags; |
6124 | const struct sched_class *prev_class = p->sched_class; | 6331 | const struct sched_class *prev_class = p->sched_class; |
6125 | struct rq *rq; | 6332 | struct rq *rq; |
6333 | int reset_on_fork; | ||
6126 | 6334 | ||
6127 | /* may grab non-irq protected spin_locks */ | 6335 | /* may grab non-irq protected spin_locks */ |
6128 | BUG_ON(in_interrupt()); | 6336 | BUG_ON(in_interrupt()); |
6129 | recheck: | 6337 | recheck: |
6130 | /* double check policy once rq lock held */ | 6338 | /* double check policy once rq lock held */ |
6131 | if (policy < 0) | 6339 | if (policy < 0) { |
6340 | reset_on_fork = p->sched_reset_on_fork; | ||
6132 | policy = oldpolicy = p->policy; | 6341 | policy = oldpolicy = p->policy; |
6133 | else if (policy != SCHED_FIFO && policy != SCHED_RR && | 6342 | } else { |
6134 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | 6343 | reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); |
6135 | policy != SCHED_IDLE) | 6344 | policy &= ~SCHED_RESET_ON_FORK; |
6136 | return -EINVAL; | 6345 | |
6346 | if (policy != SCHED_FIFO && policy != SCHED_RR && | ||
6347 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | ||
6348 | policy != SCHED_IDLE) | ||
6349 | return -EINVAL; | ||
6350 | } | ||
6351 | |||
6137 | /* | 6352 | /* |
6138 | * Valid priorities for SCHED_FIFO and SCHED_RR are | 6353 | * Valid priorities for SCHED_FIFO and SCHED_RR are |
6139 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, | 6354 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, |
@@ -6177,6 +6392,10 @@ recheck: | |||
6177 | /* can't change other user's priorities */ | 6392 | /* can't change other user's priorities */ |
6178 | if (!check_same_owner(p)) | 6393 | if (!check_same_owner(p)) |
6179 | return -EPERM; | 6394 | return -EPERM; |
6395 | |||
6396 | /* Normal users shall not reset the sched_reset_on_fork flag */ | ||
6397 | if (p->sched_reset_on_fork && !reset_on_fork) | ||
6398 | return -EPERM; | ||
6180 | } | 6399 | } |
6181 | 6400 | ||
6182 | if (user) { | 6401 | if (user) { |
@@ -6220,6 +6439,8 @@ recheck: | |||
6220 | if (running) | 6439 | if (running) |
6221 | p->sched_class->put_prev_task(rq, p); | 6440 | p->sched_class->put_prev_task(rq, p); |
6222 | 6441 | ||
6442 | p->sched_reset_on_fork = reset_on_fork; | ||
6443 | |||
6223 | oldprio = p->prio; | 6444 | oldprio = p->prio; |
6224 | __setscheduler(rq, p, policy, param->sched_priority); | 6445 | __setscheduler(rq, p, policy, param->sched_priority); |
6225 | 6446 | ||
@@ -6336,14 +6557,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) | |||
6336 | if (p) { | 6557 | if (p) { |
6337 | retval = security_task_getscheduler(p); | 6558 | retval = security_task_getscheduler(p); |
6338 | if (!retval) | 6559 | if (!retval) |
6339 | retval = p->policy; | 6560 | retval = p->policy |
6561 | | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); | ||
6340 | } | 6562 | } |
6341 | read_unlock(&tasklist_lock); | 6563 | read_unlock(&tasklist_lock); |
6342 | return retval; | 6564 | return retval; |
6343 | } | 6565 | } |
6344 | 6566 | ||
6345 | /** | 6567 | /** |
6346 | * sys_sched_getscheduler - get the RT priority of a thread | 6568 | * sys_sched_getparam - get the RT priority of a thread |
6347 | * @pid: the pid in question. | 6569 | * @pid: the pid in question. |
6348 | * @param: structure containing the RT priority. | 6570 | * @param: structure containing the RT priority. |
6349 | */ | 6571 | */ |
@@ -6571,19 +6793,9 @@ static inline int should_resched(void) | |||
6571 | 6793 | ||
6572 | static void __cond_resched(void) | 6794 | static void __cond_resched(void) |
6573 | { | 6795 | { |
6574 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 6796 | add_preempt_count(PREEMPT_ACTIVE); |
6575 | __might_sleep(__FILE__, __LINE__); | 6797 | schedule(); |
6576 | #endif | 6798 | sub_preempt_count(PREEMPT_ACTIVE); |
6577 | /* | ||
6578 | * The BKS might be reacquired before we have dropped | ||
6579 | * PREEMPT_ACTIVE, which could trigger a second | ||
6580 | * cond_resched() call. | ||
6581 | */ | ||
6582 | do { | ||
6583 | add_preempt_count(PREEMPT_ACTIVE); | ||
6584 | schedule(); | ||
6585 | sub_preempt_count(PREEMPT_ACTIVE); | ||
6586 | } while (need_resched()); | ||
6587 | } | 6799 | } |
6588 | 6800 | ||
6589 | int __sched _cond_resched(void) | 6801 | int __sched _cond_resched(void) |
@@ -6597,18 +6809,20 @@ int __sched _cond_resched(void) | |||
6597 | EXPORT_SYMBOL(_cond_resched); | 6809 | EXPORT_SYMBOL(_cond_resched); |
6598 | 6810 | ||
6599 | /* | 6811 | /* |
6600 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, | 6812 | * __cond_resched_lock() - if a reschedule is pending, drop the given lock, |
6601 | * call schedule, and on return reacquire the lock. | 6813 | * call schedule, and on return reacquire the lock. |
6602 | * | 6814 | * |
6603 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level | 6815 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level |
6604 | * operations here to prevent schedule() from being called twice (once via | 6816 | * operations here to prevent schedule() from being called twice (once via |
6605 | * spin_unlock(), once by hand). | 6817 | * spin_unlock(), once by hand). |
6606 | */ | 6818 | */ |
6607 | int cond_resched_lock(spinlock_t *lock) | 6819 | int __cond_resched_lock(spinlock_t *lock) |
6608 | { | 6820 | { |
6609 | int resched = should_resched(); | 6821 | int resched = should_resched(); |
6610 | int ret = 0; | 6822 | int ret = 0; |
6611 | 6823 | ||
6824 | lockdep_assert_held(lock); | ||
6825 | |||
6612 | if (spin_needbreak(lock) || resched) { | 6826 | if (spin_needbreak(lock) || resched) { |
6613 | spin_unlock(lock); | 6827 | spin_unlock(lock); |
6614 | if (resched) | 6828 | if (resched) |
@@ -6620,9 +6834,9 @@ int cond_resched_lock(spinlock_t *lock) | |||
6620 | } | 6834 | } |
6621 | return ret; | 6835 | return ret; |
6622 | } | 6836 | } |
6623 | EXPORT_SYMBOL(cond_resched_lock); | 6837 | EXPORT_SYMBOL(__cond_resched_lock); |
6624 | 6838 | ||
6625 | int __sched cond_resched_softirq(void) | 6839 | int __sched __cond_resched_softirq(void) |
6626 | { | 6840 | { |
6627 | BUG_ON(!in_softirq()); | 6841 | BUG_ON(!in_softirq()); |
6628 | 6842 | ||
@@ -6634,7 +6848,7 @@ int __sched cond_resched_softirq(void) | |||
6634 | } | 6848 | } |
6635 | return 0; | 6849 | return 0; |
6636 | } | 6850 | } |
6637 | EXPORT_SYMBOL(cond_resched_softirq); | 6851 | EXPORT_SYMBOL(__cond_resched_softirq); |
6638 | 6852 | ||
6639 | /** | 6853 | /** |
6640 | * yield - yield the current processor to other threads. | 6854 | * yield - yield the current processor to other threads. |
@@ -6658,11 +6872,13 @@ EXPORT_SYMBOL(yield); | |||
6658 | */ | 6872 | */ |
6659 | void __sched io_schedule(void) | 6873 | void __sched io_schedule(void) |
6660 | { | 6874 | { |
6661 | struct rq *rq = &__raw_get_cpu_var(runqueues); | 6875 | struct rq *rq = raw_rq(); |
6662 | 6876 | ||
6663 | delayacct_blkio_start(); | 6877 | delayacct_blkio_start(); |
6664 | atomic_inc(&rq->nr_iowait); | 6878 | atomic_inc(&rq->nr_iowait); |
6879 | current->in_iowait = 1; | ||
6665 | schedule(); | 6880 | schedule(); |
6881 | current->in_iowait = 0; | ||
6666 | atomic_dec(&rq->nr_iowait); | 6882 | atomic_dec(&rq->nr_iowait); |
6667 | delayacct_blkio_end(); | 6883 | delayacct_blkio_end(); |
6668 | } | 6884 | } |
@@ -6670,12 +6886,14 @@ EXPORT_SYMBOL(io_schedule); | |||
6670 | 6886 | ||
6671 | long __sched io_schedule_timeout(long timeout) | 6887 | long __sched io_schedule_timeout(long timeout) |
6672 | { | 6888 | { |
6673 | struct rq *rq = &__raw_get_cpu_var(runqueues); | 6889 | struct rq *rq = raw_rq(); |
6674 | long ret; | 6890 | long ret; |
6675 | 6891 | ||
6676 | delayacct_blkio_start(); | 6892 | delayacct_blkio_start(); |
6677 | atomic_inc(&rq->nr_iowait); | 6893 | atomic_inc(&rq->nr_iowait); |
6894 | current->in_iowait = 1; | ||
6678 | ret = schedule_timeout(timeout); | 6895 | ret = schedule_timeout(timeout); |
6896 | current->in_iowait = 0; | ||
6679 | atomic_dec(&rq->nr_iowait); | 6897 | atomic_dec(&rq->nr_iowait); |
6680 | delayacct_blkio_end(); | 6898 | delayacct_blkio_end(); |
6681 | return ret; | 6899 | return ret; |
@@ -6992,8 +7210,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
6992 | 7210 | ||
6993 | if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { | 7211 | if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { |
6994 | /* Need help from migration thread: drop lock and wait. */ | 7212 | /* Need help from migration thread: drop lock and wait. */ |
7213 | struct task_struct *mt = rq->migration_thread; | ||
7214 | |||
7215 | get_task_struct(mt); | ||
6995 | task_rq_unlock(rq, &flags); | 7216 | task_rq_unlock(rq, &flags); |
6996 | wake_up_process(rq->migration_thread); | 7217 | wake_up_process(rq->migration_thread); |
7218 | put_task_struct(mt); | ||
6997 | wait_for_completion(&req.done); | 7219 | wait_for_completion(&req.done); |
6998 | tlb_migrate_finish(p->mm); | 7220 | tlb_migrate_finish(p->mm); |
6999 | return 0; | 7221 | return 0; |
@@ -7051,6 +7273,11 @@ fail: | |||
7051 | return ret; | 7273 | return ret; |
7052 | } | 7274 | } |
7053 | 7275 | ||
7276 | #define RCU_MIGRATION_IDLE 0 | ||
7277 | #define RCU_MIGRATION_NEED_QS 1 | ||
7278 | #define RCU_MIGRATION_GOT_QS 2 | ||
7279 | #define RCU_MIGRATION_MUST_SYNC 3 | ||
7280 | |||
7054 | /* | 7281 | /* |
7055 | * migration_thread - this is a highprio system thread that performs | 7282 | * migration_thread - this is a highprio system thread that performs |
7056 | * thread migration by bumping thread off CPU then 'pushing' onto | 7283 | * thread migration by bumping thread off CPU then 'pushing' onto |
@@ -7058,6 +7285,7 @@ fail: | |||
7058 | */ | 7285 | */ |
7059 | static int migration_thread(void *data) | 7286 | static int migration_thread(void *data) |
7060 | { | 7287 | { |
7288 | int badcpu; | ||
7061 | int cpu = (long)data; | 7289 | int cpu = (long)data; |
7062 | struct rq *rq; | 7290 | struct rq *rq; |
7063 | 7291 | ||
@@ -7092,8 +7320,17 @@ static int migration_thread(void *data) | |||
7092 | req = list_entry(head->next, struct migration_req, list); | 7320 | req = list_entry(head->next, struct migration_req, list); |
7093 | list_del_init(head->next); | 7321 | list_del_init(head->next); |
7094 | 7322 | ||
7095 | spin_unlock(&rq->lock); | 7323 | if (req->task != NULL) { |
7096 | __migrate_task(req->task, cpu, req->dest_cpu); | 7324 | spin_unlock(&rq->lock); |
7325 | __migrate_task(req->task, cpu, req->dest_cpu); | ||
7326 | } else if (likely(cpu == (badcpu = smp_processor_id()))) { | ||
7327 | req->dest_cpu = RCU_MIGRATION_GOT_QS; | ||
7328 | spin_unlock(&rq->lock); | ||
7329 | } else { | ||
7330 | req->dest_cpu = RCU_MIGRATION_MUST_SYNC; | ||
7331 | spin_unlock(&rq->lock); | ||
7332 | WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); | ||
7333 | } | ||
7097 | local_irq_enable(); | 7334 | local_irq_enable(); |
7098 | 7335 | ||
7099 | complete(&req->done); | 7336 | complete(&req->done); |
@@ -7625,7 +7862,7 @@ static int __init migration_init(void) | |||
7625 | migration_call(&migration_notifier, CPU_ONLINE, cpu); | 7862 | migration_call(&migration_notifier, CPU_ONLINE, cpu); |
7626 | register_cpu_notifier(&migration_notifier); | 7863 | register_cpu_notifier(&migration_notifier); |
7627 | 7864 | ||
7628 | return err; | 7865 | return 0; |
7629 | } | 7866 | } |
7630 | early_initcall(migration_init); | 7867 | early_initcall(migration_init); |
7631 | #endif | 7868 | #endif |
@@ -7672,7 +7909,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
7672 | break; | 7909 | break; |
7673 | } | 7910 | } |
7674 | 7911 | ||
7675 | if (!group->__cpu_power) { | 7912 | if (!group->cpu_power) { |
7676 | printk(KERN_CONT "\n"); | 7913 | printk(KERN_CONT "\n"); |
7677 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 7914 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
7678 | "set\n"); | 7915 | "set\n"); |
@@ -7696,9 +7933,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
7696 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); | 7933 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); |
7697 | 7934 | ||
7698 | printk(KERN_CONT " %s", str); | 7935 | printk(KERN_CONT " %s", str); |
7699 | if (group->__cpu_power != SCHED_LOAD_SCALE) { | 7936 | if (group->cpu_power != SCHED_LOAD_SCALE) { |
7700 | printk(KERN_CONT " (__cpu_power = %d)", | 7937 | printk(KERN_CONT " (cpu_power = %d)", |
7701 | group->__cpu_power); | 7938 | group->cpu_power); |
7702 | } | 7939 | } |
7703 | 7940 | ||
7704 | group = group->next; | 7941 | group = group->next; |
@@ -7841,7 +8078,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
7841 | rq->rd = rd; | 8078 | rq->rd = rd; |
7842 | 8079 | ||
7843 | cpumask_set_cpu(rq->cpu, rd->span); | 8080 | cpumask_set_cpu(rq->cpu, rd->span); |
7844 | if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) | 8081 | if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) |
7845 | set_rq_online(rq); | 8082 | set_rq_online(rq); |
7846 | 8083 | ||
7847 | spin_unlock_irqrestore(&rq->lock, flags); | 8084 | spin_unlock_irqrestore(&rq->lock, flags); |
@@ -7983,7 +8220,7 @@ init_sched_build_groups(const struct cpumask *span, | |||
7983 | continue; | 8220 | continue; |
7984 | 8221 | ||
7985 | cpumask_clear(sched_group_cpus(sg)); | 8222 | cpumask_clear(sched_group_cpus(sg)); |
7986 | sg->__cpu_power = 0; | 8223 | sg->cpu_power = 0; |
7987 | 8224 | ||
7988 | for_each_cpu(j, span) { | 8225 | for_each_cpu(j, span) { |
7989 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) | 8226 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) |
@@ -8091,6 +8328,39 @@ struct static_sched_domain { | |||
8091 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); | 8328 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); |
8092 | }; | 8329 | }; |
8093 | 8330 | ||
8331 | struct s_data { | ||
8332 | #ifdef CONFIG_NUMA | ||
8333 | int sd_allnodes; | ||
8334 | cpumask_var_t domainspan; | ||
8335 | cpumask_var_t covered; | ||
8336 | cpumask_var_t notcovered; | ||
8337 | #endif | ||
8338 | cpumask_var_t nodemask; | ||
8339 | cpumask_var_t this_sibling_map; | ||
8340 | cpumask_var_t this_core_map; | ||
8341 | cpumask_var_t send_covered; | ||
8342 | cpumask_var_t tmpmask; | ||
8343 | struct sched_group **sched_group_nodes; | ||
8344 | struct root_domain *rd; | ||
8345 | }; | ||
8346 | |||
8347 | enum s_alloc { | ||
8348 | sa_sched_groups = 0, | ||
8349 | sa_rootdomain, | ||
8350 | sa_tmpmask, | ||
8351 | sa_send_covered, | ||
8352 | sa_this_core_map, | ||
8353 | sa_this_sibling_map, | ||
8354 | sa_nodemask, | ||
8355 | sa_sched_group_nodes, | ||
8356 | #ifdef CONFIG_NUMA | ||
8357 | sa_notcovered, | ||
8358 | sa_covered, | ||
8359 | sa_domainspan, | ||
8360 | #endif | ||
8361 | sa_none, | ||
8362 | }; | ||
8363 | |||
8094 | /* | 8364 | /* |
8095 | * SMT sched-domains: | 8365 | * SMT sched-domains: |
8096 | */ | 8366 | */ |
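The s_alloc enum above orders the allocation stages of domain construction so that error paths can record how far setup got, and a single switch that deliberately falls through can free everything from that point down (see the __free_domain_allocs() hunk near the end of this diff). A minimal, self-contained sketch of that teardown idiom, with invented stage names:

#include <stdlib.h>

enum stage { st_all, st_b, st_a, st_none };

struct ctx { void *a, *b; };

static void unwind(struct ctx *c, enum stage what)
{
	switch (what) {
	case st_all:
	case st_b:
		free(c->b);	/* fall through */
	case st_a:
		free(c->a);	/* fall through */
	case st_none:
		break;
	}
}

static int setup(struct ctx *c)
{
	enum stage state = st_none;

	if (!(c->a = malloc(64)))
		goto err;
	state = st_a;
	if (!(c->b = malloc(64)))
		goto err;
	return 0;
err:
	unwind(c, state);	/* frees only what was actually allocated */
	return -1;
}

int main(void)
{
	struct ctx c = { 0 };

	if (!setup(&c))
		unwind(&c, st_all);	/* full teardown frees every stage */
	return 0;
}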
@@ -8208,11 +8478,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) | |||
8208 | continue; | 8478 | continue; |
8209 | } | 8479 | } |
8210 | 8480 | ||
8211 | sg_inc_cpu_power(sg, sd->groups->__cpu_power); | 8481 | sg->cpu_power += sd->groups->cpu_power; |
8212 | } | 8482 | } |
8213 | sg = sg->next; | 8483 | sg = sg->next; |
8214 | } while (sg != group_head); | 8484 | } while (sg != group_head); |
8215 | } | 8485 | } |
8486 | |||
8487 | static int build_numa_sched_groups(struct s_data *d, | ||
8488 | const struct cpumask *cpu_map, int num) | ||
8489 | { | ||
8490 | struct sched_domain *sd; | ||
8491 | struct sched_group *sg, *prev; | ||
8492 | int n, j; | ||
8493 | |||
8494 | cpumask_clear(d->covered); | ||
8495 | cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); | ||
8496 | if (cpumask_empty(d->nodemask)) { | ||
8497 | d->sched_group_nodes[num] = NULL; | ||
8498 | goto out; | ||
8499 | } | ||
8500 | |||
8501 | sched_domain_node_span(num, d->domainspan); | ||
8502 | cpumask_and(d->domainspan, d->domainspan, cpu_map); | ||
8503 | |||
8504 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
8505 | GFP_KERNEL, num); | ||
8506 | if (!sg) { | ||
8507 | printk(KERN_WARNING "Can not alloc domain group for node %d\n", | ||
8508 | num); | ||
8509 | return -ENOMEM; | ||
8510 | } | ||
8511 | d->sched_group_nodes[num] = sg; | ||
8512 | |||
8513 | for_each_cpu(j, d->nodemask) { | ||
8514 | sd = &per_cpu(node_domains, j).sd; | ||
8515 | sd->groups = sg; | ||
8516 | } | ||
8517 | |||
8518 | sg->cpu_power = 0; | ||
8519 | cpumask_copy(sched_group_cpus(sg), d->nodemask); | ||
8520 | sg->next = sg; | ||
8521 | cpumask_or(d->covered, d->covered, d->nodemask); | ||
8522 | |||
8523 | prev = sg; | ||
8524 | for (j = 0; j < nr_node_ids; j++) { | ||
8525 | n = (num + j) % nr_node_ids; | ||
8526 | cpumask_complement(d->notcovered, d->covered); | ||
8527 | cpumask_and(d->tmpmask, d->notcovered, cpu_map); | ||
8528 | cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); | ||
8529 | if (cpumask_empty(d->tmpmask)) | ||
8530 | break; | ||
8531 | cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); | ||
8532 | if (cpumask_empty(d->tmpmask)) | ||
8533 | continue; | ||
8534 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
8535 | GFP_KERNEL, num); | ||
8536 | if (!sg) { | ||
8537 | printk(KERN_WARNING | ||
8538 | "Can not alloc domain group for node %d\n", j); | ||
8539 | return -ENOMEM; | ||
8540 | } | ||
8541 | sg->cpu_power = 0; | ||
8542 | cpumask_copy(sched_group_cpus(sg), d->tmpmask); | ||
8543 | sg->next = prev->next; | ||
8544 | cpumask_or(d->covered, d->covered, d->tmpmask); | ||
8545 | prev->next = sg; | ||
8546 | prev = sg; | ||
8547 | } | ||
8548 | out: | ||
8549 | return 0; | ||
8550 | } | ||
8216 | #endif /* CONFIG_NUMA */ | 8551 | #endif /* CONFIG_NUMA */ |
8217 | 8552 | ||
8218 | #ifdef CONFIG_NUMA | 8553 | #ifdef CONFIG_NUMA |
@@ -8266,15 +8601,13 @@ static void free_sched_groups(const struct cpumask *cpu_map, | |||
8266 | * there are asymmetries in the topology. If there are asymmetries, group | 8601 | * there are asymmetries in the topology. If there are asymmetries, group |
8267 | * having more cpu_power will pickup more load compared to the group having | 8602 | * having more cpu_power will pickup more load compared to the group having |
8268 | * less cpu_power. | 8603 | * less cpu_power. |
8269 | * | ||
8270 | * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents | ||
8271 | * the maximum number of tasks a group can handle in the presence of other idle | ||
8272 | * or lightly loaded groups in the same sched domain. | ||
8273 | */ | 8604 | */ |
8274 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) | 8605 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) |
8275 | { | 8606 | { |
8276 | struct sched_domain *child; | 8607 | struct sched_domain *child; |
8277 | struct sched_group *group; | 8608 | struct sched_group *group; |
8609 | long power; | ||
8610 | int weight; | ||
8278 | 8611 | ||
8279 | WARN_ON(!sd || !sd->groups); | 8612 | WARN_ON(!sd || !sd->groups); |
8280 | 8613 | ||
@@ -8283,28 +8616,32 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
8283 | 8616 | ||
8284 | child = sd->child; | 8617 | child = sd->child; |
8285 | 8618 | ||
8286 | sd->groups->__cpu_power = 0; | 8619 | sd->groups->cpu_power = 0; |
8287 | 8620 | ||
8288 | /* | 8621 | if (!child) { |
8289 | * For perf policy, if the groups in child domain share resources | 8622 | power = SCHED_LOAD_SCALE; |
8290 | * (for example cores sharing some portions of the cache hierarchy | 8623 | weight = cpumask_weight(sched_domain_span(sd)); |
8291 | * or SMT), then set this domain groups cpu_power such that each group | 8624 | /* |
8292 | * can handle only one task, when there are other idle groups in the | 8625 | * SMT siblings share the power of a single core. |
8293 | * same sched domain. | 8626 | * Usually multiple threads get a better yield out of |
8294 | */ | 8627 | * that one core than a single thread would have, |
8295 | if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && | 8628 | * reflect that in sd->smt_gain. |
8296 | (child->flags & | 8629 | */ |
8297 | (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { | 8630 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { |
8298 | sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); | 8631 | power *= sd->smt_gain; |
8632 | power /= weight; | ||
8633 | power >>= SCHED_LOAD_SHIFT; | ||
8634 | } | ||
8635 | sd->groups->cpu_power += power; | ||
8299 | return; | 8636 | return; |
8300 | } | 8637 | } |
8301 | 8638 | ||
8302 | /* | 8639 | /* |
8303 | * add cpu_power of each child group to this groups cpu_power | 8640 | * Add cpu_power of each child group to this groups cpu_power. |
8304 | */ | 8641 | */ |
8305 | group = child->groups; | 8642 | group = child->groups; |
8306 | do { | 8643 | do { |
8307 | sg_inc_cpu_power(sd->groups, group->__cpu_power); | 8644 | sd->groups->cpu_power += group->cpu_power; |
8308 | group = group->next; | 8645 | group = group->next; |
8309 | } while (group != child->groups); | 8646 | } while (group != child->groups); |
8310 | } | 8647 | } |
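The !child branch above is where cpu_power is seeded from SCHED_LOAD_SCALE; for SMT domains that base value is then scaled by sd->smt_gain and split across the siblings. A standalone sketch of that arithmetic, with the constants treated as assumptions (SCHED_LOAD_SHIFT = 10, and an smt_gain of about 1178, i.e. two threads rated roughly 15% above a single core):

#include <stdio.h>

#define SCHED_LOAD_SHIFT	10
#define SCHED_LOAD_SCALE	(1L << SCHED_LOAD_SHIFT)

static long smt_scaled_power(long smt_gain, int weight)
{
	long power = SCHED_LOAD_SCALE;

	if (weight > 1) {		/* SD_SHARE_CPUPOWER case above */
		power *= smt_gain;
		power /= weight;
		power >>= SCHED_LOAD_SHIFT;
	}
	return power;
}

int main(void)
{
	/* Two hardware threads sharing one core. */
	printf("per-sibling power: %ld\n", smt_scaled_power(1178, 2));
	return 0;
}

For a 2-thread core this yields about 589 per sibling, so the pair together (~1178) is rated slightly above one full core (1024), which is exactly the "better yield out of that one core" point made in the comment.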
@@ -8378,280 +8715,285 @@ static void set_domain_attribute(struct sched_domain *sd, | |||
8378 | } | 8715 | } |
8379 | } | 8716 | } |
8380 | 8717 | ||
8381 | /* | 8718 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, |
8382 | * Build sched domains for a given set of cpus and attach the sched domains | 8719 | const struct cpumask *cpu_map) |
8383 | * to the individual cpus | 8720 | { |
8384 | */ | 8721 | switch (what) { |
8385 | static int __build_sched_domains(const struct cpumask *cpu_map, | 8722 | case sa_sched_groups: |
8386 | struct sched_domain_attr *attr) | 8723 | free_sched_groups(cpu_map, d->tmpmask); /* fall through */ |
8387 | { | 8724 | d->sched_group_nodes = NULL; |
8388 | int i, err = -ENOMEM; | 8725 | case sa_rootdomain: |
8389 | struct root_domain *rd; | 8726 | free_rootdomain(d->rd); /* fall through */ |
8390 | cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, | 8727 | case sa_tmpmask: |
8391 | tmpmask; | 8728 | free_cpumask_var(d->tmpmask); /* fall through */ |
8729 | case sa_send_covered: | ||
8730 | free_cpumask_var(d->send_covered); /* fall through */ | ||
8731 | case sa_this_core_map: | ||
8732 | free_cpumask_var(d->this_core_map); /* fall through */ | ||
8733 | case sa_this_sibling_map: | ||
8734 | free_cpumask_var(d->this_sibling_map); /* fall through */ | ||
8735 | case sa_nodemask: | ||
8736 | free_cpumask_var(d->nodemask); /* fall through */ | ||
8737 | case sa_sched_group_nodes: | ||
8392 | #ifdef CONFIG_NUMA | 8738 | #ifdef CONFIG_NUMA |
8393 | cpumask_var_t domainspan, covered, notcovered; | 8739 | kfree(d->sched_group_nodes); /* fall through */ |
8394 | struct sched_group **sched_group_nodes = NULL; | 8740 | case sa_notcovered: |
8395 | int sd_allnodes = 0; | 8741 | free_cpumask_var(d->notcovered); /* fall through */ |
8396 | 8742 | case sa_covered: | |
8397 | if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) | 8743 | free_cpumask_var(d->covered); /* fall through */ |
8398 | goto out; | 8744 | case sa_domainspan: |
8399 | if (!alloc_cpumask_var(&covered, GFP_KERNEL)) | 8745 | free_cpumask_var(d->domainspan); /* fall through */ |
8400 | goto free_domainspan; | 8746 | #endif |
8401 | if (!alloc_cpumask_var(¬covered, GFP_KERNEL)) | 8747 | case sa_none: |
8402 | goto free_covered; | 8748 | break; |
8403 | #endif | 8749 | } |
8404 | 8750 | } | |
8405 | if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) | ||
8406 | goto free_notcovered; | ||
8407 | if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) | ||
8408 | goto free_nodemask; | ||
8409 | if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) | ||
8410 | goto free_this_sibling_map; | ||
8411 | if (!alloc_cpumask_var(&send_covered, GFP_KERNEL)) | ||
8412 | goto free_this_core_map; | ||
8413 | if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) | ||
8414 | goto free_send_covered; | ||
8415 | 8751 | ||
8752 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, | ||
8753 | const struct cpumask *cpu_map) | ||
8754 | { | ||
8416 | #ifdef CONFIG_NUMA | 8755 | #ifdef CONFIG_NUMA |
8417 | /* | 8756 | if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) |
8418 | * Allocate the per-node list of sched groups | 8757 | return sa_none; |
8419 | */ | 8758 | if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) |
8420 | sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), | 8759 | return sa_domainspan; |
8421 | GFP_KERNEL); | 8760 | if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) |
8422 | if (!sched_group_nodes) { | 8761 | return sa_covered; |
8762 | /* Allocate the per-node list of sched groups */ | ||
8763 | d->sched_group_nodes = kcalloc(nr_node_ids, | ||
8764 | sizeof(struct sched_group *), GFP_KERNEL); | ||
8765 | if (!d->sched_group_nodes) { | ||
8423 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 8766 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
8424 | goto free_tmpmask; | 8767 | return sa_notcovered; |
8425 | } | 8768 | } |
8426 | #endif | 8769 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; |
8427 | 8770 | #endif | |
8428 | rd = alloc_rootdomain(); | 8771 | if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) |
8429 | if (!rd) { | 8772 | return sa_sched_group_nodes; |
8773 | if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) | ||
8774 | return sa_nodemask; | ||
8775 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) | ||
8776 | return sa_this_sibling_map; | ||
8777 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | ||
8778 | return sa_this_core_map; | ||
8779 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) | ||
8780 | return sa_send_covered; | ||
8781 | d->rd = alloc_rootdomain(); | ||
8782 | if (!d->rd) { | ||
8430 | printk(KERN_WARNING "Cannot alloc root domain\n"); | 8783 | printk(KERN_WARNING "Cannot alloc root domain\n"); |
8431 | goto free_sched_groups; | 8784 | return sa_tmpmask; |
8432 | } | 8785 | } |
8786 | return sa_rootdomain; | ||
8787 | } | ||
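__visit_domain_allocation_hell() and __free_domain_allocs() work as a pair: the allocator returns an enum recording how far it got, and the teardown switch falls through from that point so exactly the successful allocations are released. A reduced, self-contained sketch of the idiom; the two-stage enum, struct and malloc/free below are illustrative, not the kernel's s_alloc/s_data:

#include <stdlib.h>

enum demo_alloc { demo_none, demo_a, demo_all };

struct demo_data { void *a; void *b; };

static void demo_free(struct demo_data *d, enum demo_alloc what)
{
	switch (what) {
	case demo_all:
		free(d->b);	/* fall through */
	case demo_a:
		free(d->a);	/* fall through */
	case demo_none:
		break;
	}
}

static enum demo_alloc demo_alloc_stages(struct demo_data *d)
{
	if (!(d->a = malloc(64)))
		return demo_none;
	if (!(d->b = malloc(64)))
		return demo_a;		/* report the last stage that succeeded */
	return demo_all;
}

int main(void)
{
	struct demo_data d = { NULL, NULL };
	enum demo_alloc state = demo_alloc_stages(&d);

	if (state != demo_all) {
		demo_free(&d, state);	/* frees exactly what was allocated */
		return 1;
	}
	/* ... use d.a and d.b ... */
	demo_free(&d, demo_all);
	return 0;
}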
8433 | 8788 | ||
8789 | static struct sched_domain *__build_numa_sched_domains(struct s_data *d, | ||
8790 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) | ||
8791 | { | ||
8792 | struct sched_domain *sd = NULL; | ||
8434 | #ifdef CONFIG_NUMA | 8793 | #ifdef CONFIG_NUMA |
8435 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; | 8794 | struct sched_domain *parent; |
8436 | #endif | ||
8437 | |||
8438 | /* | ||
8439 | * Set up domains for cpus specified by the cpu_map. | ||
8440 | */ | ||
8441 | for_each_cpu(i, cpu_map) { | ||
8442 | struct sched_domain *sd = NULL, *p; | ||
8443 | |||
8444 | cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); | ||
8445 | |||
8446 | #ifdef CONFIG_NUMA | ||
8447 | if (cpumask_weight(cpu_map) > | ||
8448 | SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { | ||
8449 | sd = &per_cpu(allnodes_domains, i).sd; | ||
8450 | SD_INIT(sd, ALLNODES); | ||
8451 | set_domain_attribute(sd, attr); | ||
8452 | cpumask_copy(sched_domain_span(sd), cpu_map); | ||
8453 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); | ||
8454 | p = sd; | ||
8455 | sd_allnodes = 1; | ||
8456 | } else | ||
8457 | p = NULL; | ||
8458 | 8795 | ||
8459 | sd = &per_cpu(node_domains, i).sd; | 8796 | d->sd_allnodes = 0; |
8460 | SD_INIT(sd, NODE); | 8797 | if (cpumask_weight(cpu_map) > |
8798 | SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { | ||
8799 | sd = &per_cpu(allnodes_domains, i).sd; | ||
8800 | SD_INIT(sd, ALLNODES); | ||
8461 | set_domain_attribute(sd, attr); | 8801 | set_domain_attribute(sd, attr); |
8462 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | 8802 | cpumask_copy(sched_domain_span(sd), cpu_map); |
8463 | sd->parent = p; | 8803 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); |
8464 | if (p) | 8804 | d->sd_allnodes = 1; |
8465 | p->child = sd; | 8805 | } |
8466 | cpumask_and(sched_domain_span(sd), | 8806 | parent = sd; |
8467 | sched_domain_span(sd), cpu_map); | 8807 | |
8808 | sd = &per_cpu(node_domains, i).sd; | ||
8809 | SD_INIT(sd, NODE); | ||
8810 | set_domain_attribute(sd, attr); | ||
8811 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | ||
8812 | sd->parent = parent; | ||
8813 | if (parent) | ||
8814 | parent->child = sd; | ||
8815 | cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); | ||
8468 | #endif | 8816 | #endif |
8817 | return sd; | ||
8818 | } | ||
8469 | 8819 | ||
8470 | p = sd; | 8820 | static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, |
8471 | sd = &per_cpu(phys_domains, i).sd; | 8821 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, |
8472 | SD_INIT(sd, CPU); | 8822 | struct sched_domain *parent, int i) |
8473 | set_domain_attribute(sd, attr); | 8823 | { |
8474 | cpumask_copy(sched_domain_span(sd), nodemask); | 8824 | struct sched_domain *sd; |
8475 | sd->parent = p; | 8825 | sd = &per_cpu(phys_domains, i).sd; |
8476 | if (p) | 8826 | SD_INIT(sd, CPU); |
8477 | p->child = sd; | 8827 | set_domain_attribute(sd, attr); |
8478 | cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); | 8828 | cpumask_copy(sched_domain_span(sd), d->nodemask); |
8829 | sd->parent = parent; | ||
8830 | if (parent) | ||
8831 | parent->child = sd; | ||
8832 | cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
8833 | return sd; | ||
8834 | } | ||
8479 | 8835 | ||
8836 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, | ||
8837 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
8838 | struct sched_domain *parent, int i) | ||
8839 | { | ||
8840 | struct sched_domain *sd = parent; | ||
8480 | #ifdef CONFIG_SCHED_MC | 8841 | #ifdef CONFIG_SCHED_MC |
8481 | p = sd; | 8842 | sd = &per_cpu(core_domains, i).sd; |
8482 | sd = &per_cpu(core_domains, i).sd; | 8843 | SD_INIT(sd, MC); |
8483 | SD_INIT(sd, MC); | 8844 | set_domain_attribute(sd, attr); |
8484 | set_domain_attribute(sd, attr); | 8845 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); |
8485 | cpumask_and(sched_domain_span(sd), cpu_map, | 8846 | sd->parent = parent; |
8486 | cpu_coregroup_mask(i)); | 8847 | parent->child = sd; |
8487 | sd->parent = p; | 8848 | cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); |
8488 | p->child = sd; | ||
8489 | cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); | ||
8490 | #endif | 8849 | #endif |
8850 | return sd; | ||
8851 | } | ||
8491 | 8852 | ||
8853 | static struct sched_domain *__build_smt_sched_domain(struct s_data *d, | ||
8854 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
8855 | struct sched_domain *parent, int i) | ||
8856 | { | ||
8857 | struct sched_domain *sd = parent; | ||
8492 | #ifdef CONFIG_SCHED_SMT | 8858 | #ifdef CONFIG_SCHED_SMT |
8493 | p = sd; | 8859 | sd = &per_cpu(cpu_domains, i).sd; |
8494 | sd = &per_cpu(cpu_domains, i).sd; | 8860 | SD_INIT(sd, SIBLING); |
8495 | SD_INIT(sd, SIBLING); | 8861 | set_domain_attribute(sd, attr); |
8496 | set_domain_attribute(sd, attr); | 8862 | cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); |
8497 | cpumask_and(sched_domain_span(sd), | 8863 | sd->parent = parent; |
8498 | topology_thread_cpumask(i), cpu_map); | 8864 | parent->child = sd; |
8499 | sd->parent = p; | 8865 | cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); |
8500 | p->child = sd; | ||
8501 | cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); | ||
8502 | #endif | 8866 | #endif |
8503 | } | 8867 | return sd; |
8868 | } | ||
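Each __build_*_sched_domain() helper above follows the same linking pattern: initialise the per-cpu domain for its level, hang it beneath the domain returned by the previous (wider) level, and hand it back so the next (narrower) level can attach under it. A stripped-down sketch of just that linking, with struct dom as an illustrative stand-in for struct sched_domain:

struct dom {
	struct dom *parent;
	struct dom *child;
};

/* Hang @sd beneath @parent; returns @sd so it can parent the next level. */
static struct dom *attach_level(struct dom *sd, struct dom *parent)
{
	sd->parent = parent;
	if (parent)
		parent->child = sd;
	return sd;
}

/* e.g.: sd = attach_level(&node, NULL); sd = attach_level(&phys, sd); ... */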
8504 | 8869 | ||
8870 | static void build_sched_groups(struct s_data *d, enum sched_domain_level l, | ||
8871 | const struct cpumask *cpu_map, int cpu) | ||
8872 | { | ||
8873 | switch (l) { | ||
8505 | #ifdef CONFIG_SCHED_SMT | 8874 | #ifdef CONFIG_SCHED_SMT |
8506 | /* Set up CPU (sibling) groups */ | 8875 | case SD_LV_SIBLING: /* set up CPU (sibling) groups */ |
8507 | for_each_cpu(i, cpu_map) { | 8876 | cpumask_and(d->this_sibling_map, cpu_map, |
8508 | cpumask_and(this_sibling_map, | 8877 | topology_thread_cpumask(cpu)); |
8509 | topology_thread_cpumask(i), cpu_map); | 8878 | if (cpu == cpumask_first(d->this_sibling_map)) |
8510 | if (i != cpumask_first(this_sibling_map)) | 8879 | init_sched_build_groups(d->this_sibling_map, cpu_map, |
8511 | continue; | 8880 | &cpu_to_cpu_group, |
8512 | 8881 | d->send_covered, d->tmpmask); | |
8513 | init_sched_build_groups(this_sibling_map, cpu_map, | 8882 | break; |
8514 | &cpu_to_cpu_group, | ||
8515 | send_covered, tmpmask); | ||
8516 | } | ||
8517 | #endif | 8883 | #endif |
8518 | |||
8519 | #ifdef CONFIG_SCHED_MC | 8884 | #ifdef CONFIG_SCHED_MC |
8520 | /* Set up multi-core groups */ | 8885 | case SD_LV_MC: /* set up multi-core groups */ |
8521 | for_each_cpu(i, cpu_map) { | 8886 | cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); |
8522 | cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); | 8887 | if (cpu == cpumask_first(d->this_core_map)) |
8523 | if (i != cpumask_first(this_core_map)) | 8888 | init_sched_build_groups(d->this_core_map, cpu_map, |
8524 | continue; | 8889 | &cpu_to_core_group, |
8525 | 8890 | d->send_covered, d->tmpmask); | |
8526 | init_sched_build_groups(this_core_map, cpu_map, | 8891 | break; |
8527 | &cpu_to_core_group, | ||
8528 | send_covered, tmpmask); | ||
8529 | } | ||
8530 | #endif | 8892 | #endif |
8531 | 8893 | case SD_LV_CPU: /* set up physical groups */ | |
8532 | /* Set up physical groups */ | 8894 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); |
8533 | for (i = 0; i < nr_node_ids; i++) { | 8895 | if (!cpumask_empty(d->nodemask)) |
8534 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 8896 | init_sched_build_groups(d->nodemask, cpu_map, |
8535 | if (cpumask_empty(nodemask)) | 8897 | &cpu_to_phys_group, |
8536 | continue; | 8898 | d->send_covered, d->tmpmask); |
8537 | 8899 | break; | |
8538 | init_sched_build_groups(nodemask, cpu_map, | ||
8539 | &cpu_to_phys_group, | ||
8540 | send_covered, tmpmask); | ||
8541 | } | ||
8542 | |||
8543 | #ifdef CONFIG_NUMA | 8900 | #ifdef CONFIG_NUMA |
8544 | /* Set up node groups */ | 8901 | case SD_LV_ALLNODES: |
8545 | if (sd_allnodes) { | 8902 | init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, |
8546 | init_sched_build_groups(cpu_map, cpu_map, | 8903 | d->send_covered, d->tmpmask); |
8547 | &cpu_to_allnodes_group, | 8904 | break; |
8548 | send_covered, tmpmask); | 8905 | #endif |
8906 | default: | ||
8907 | break; | ||
8549 | } | 8908 | } |
8909 | } | ||
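build_sched_groups() relies on the usual "only the first CPU of a span does the work" idiom: every CPU computes the span it belongs to, but only the lowest-numbered member calls init_sched_build_groups(), so each group is built exactly once. A small model of that idiom using a plain bitmask instead of struct cpumask (the names and the 2-cpu example are illustrative):

#include <stdio.h>

static int mask_first(unsigned long mask)
{
	int bit = 0;

	while (!(mask & (1UL << bit)))	/* caller guarantees mask != 0 */
		bit++;
	return bit;
}

static void build_groups_for_span(unsigned long span, int cpu)
{
	if (cpu != mask_first(span))
		return;			/* another cpu already covers this span */
	printf("cpu %d builds groups for span %#lx\n", cpu, span);
}

int main(void)
{
	unsigned long siblings = 0x3;	/* cpus 0 and 1 share a core */
	int cpu;

	for (cpu = 0; cpu < 2; cpu++)
		build_groups_for_span(siblings, cpu);	/* only cpu 0 prints */
	return 0;
}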
8550 | 8910 | ||
8551 | for (i = 0; i < nr_node_ids; i++) { | 8911 | /* |
8552 | /* Set up node groups */ | 8912 | * Build sched domains for a given set of cpus and attach the sched domains |
8553 | struct sched_group *sg, *prev; | 8913 | * to the individual cpus |
8554 | int j; | 8914 | */ |
8555 | 8915 | static int __build_sched_domains(const struct cpumask *cpu_map, | |
8556 | cpumask_clear(covered); | 8916 | struct sched_domain_attr *attr) |
8557 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 8917 | { |
8558 | if (cpumask_empty(nodemask)) { | 8918 | enum s_alloc alloc_state = sa_none; |
8559 | sched_group_nodes[i] = NULL; | 8919 | struct s_data d; |
8560 | continue; | 8920 | struct sched_domain *sd; |
8561 | } | 8921 | int i; |
8922 | #ifdef CONFIG_NUMA | ||
8923 | d.sd_allnodes = 0; | ||
8924 | #endif | ||
8562 | 8925 | ||
8563 | sched_domain_node_span(i, domainspan); | 8926 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); |
8564 | cpumask_and(domainspan, domainspan, cpu_map); | 8927 | if (alloc_state != sa_rootdomain) |
8928 | goto error; | ||
8929 | alloc_state = sa_sched_groups; | ||
8565 | 8930 | ||
8566 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | 8931 | /* |
8567 | GFP_KERNEL, i); | 8932 | * Set up domains for cpus specified by the cpu_map. |
8568 | if (!sg) { | 8933 | */ |
8569 | printk(KERN_WARNING "Can not alloc domain group for " | 8934 | for_each_cpu(i, cpu_map) { |
8570 | "node %d\n", i); | 8935 | cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), |
8571 | goto error; | 8936 | cpu_map); |
8572 | } | ||
8573 | sched_group_nodes[i] = sg; | ||
8574 | for_each_cpu(j, nodemask) { | ||
8575 | struct sched_domain *sd; | ||
8576 | 8937 | ||
8577 | sd = &per_cpu(node_domains, j).sd; | 8938 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); |
8578 | sd->groups = sg; | 8939 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); |
8579 | } | 8940 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); |
8580 | sg->__cpu_power = 0; | 8941 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); |
8581 | cpumask_copy(sched_group_cpus(sg), nodemask); | 8942 | } |
8582 | sg->next = sg; | ||
8583 | cpumask_or(covered, covered, nodemask); | ||
8584 | prev = sg; | ||
8585 | 8943 | ||
8586 | for (j = 0; j < nr_node_ids; j++) { | 8944 | for_each_cpu(i, cpu_map) { |
8587 | int n = (i + j) % nr_node_ids; | 8945 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); |
8946 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); | ||
8947 | } | ||
8588 | 8948 | ||
8589 | cpumask_complement(notcovered, covered); | 8949 | /* Set up physical groups */ |
8590 | cpumask_and(tmpmask, notcovered, cpu_map); | 8950 | for (i = 0; i < nr_node_ids; i++) |
8591 | cpumask_and(tmpmask, tmpmask, domainspan); | 8951 | build_sched_groups(&d, SD_LV_CPU, cpu_map, i); |
8592 | if (cpumask_empty(tmpmask)) | ||
8593 | break; | ||
8594 | 8952 | ||
8595 | cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); | 8953 | #ifdef CONFIG_NUMA |
8596 | if (cpumask_empty(tmpmask)) | 8954 | /* Set up node groups */ |
8597 | continue; | 8955 | if (d.sd_allnodes) |
8956 | build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); | ||
8598 | 8957 | ||
8599 | sg = kmalloc_node(sizeof(struct sched_group) + | 8958 | for (i = 0; i < nr_node_ids; i++) |
8600 | cpumask_size(), | 8959 | if (build_numa_sched_groups(&d, cpu_map, i)) |
8601 | GFP_KERNEL, i); | 8960 | goto error; |
8602 | if (!sg) { | ||
8603 | printk(KERN_WARNING | ||
8604 | "Can not alloc domain group for node %d\n", j); | ||
8605 | goto error; | ||
8606 | } | ||
8607 | sg->__cpu_power = 0; | ||
8608 | cpumask_copy(sched_group_cpus(sg), tmpmask); | ||
8609 | sg->next = prev->next; | ||
8610 | cpumask_or(covered, covered, tmpmask); | ||
8611 | prev->next = sg; | ||
8612 | prev = sg; | ||
8613 | } | ||
8614 | } | ||
8615 | #endif | 8961 | #endif |
8616 | 8962 | ||
8617 | /* Calculate CPU power for physical packages and nodes */ | 8963 | /* Calculate CPU power for physical packages and nodes */ |
8618 | #ifdef CONFIG_SCHED_SMT | 8964 | #ifdef CONFIG_SCHED_SMT |
8619 | for_each_cpu(i, cpu_map) { | 8965 | for_each_cpu(i, cpu_map) { |
8620 | struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; | 8966 | sd = &per_cpu(cpu_domains, i).sd; |
8621 | |||
8622 | init_sched_groups_power(i, sd); | 8967 | init_sched_groups_power(i, sd); |
8623 | } | 8968 | } |
8624 | #endif | 8969 | #endif |
8625 | #ifdef CONFIG_SCHED_MC | 8970 | #ifdef CONFIG_SCHED_MC |
8626 | for_each_cpu(i, cpu_map) { | 8971 | for_each_cpu(i, cpu_map) { |
8627 | struct sched_domain *sd = &per_cpu(core_domains, i).sd; | 8972 | sd = &per_cpu(core_domains, i).sd; |
8628 | |||
8629 | init_sched_groups_power(i, sd); | 8973 | init_sched_groups_power(i, sd); |
8630 | } | 8974 | } |
8631 | #endif | 8975 | #endif |
8632 | 8976 | ||
8633 | for_each_cpu(i, cpu_map) { | 8977 | for_each_cpu(i, cpu_map) { |
8634 | struct sched_domain *sd = &per_cpu(phys_domains, i).sd; | 8978 | sd = &per_cpu(phys_domains, i).sd; |
8635 | |||
8636 | init_sched_groups_power(i, sd); | 8979 | init_sched_groups_power(i, sd); |
8637 | } | 8980 | } |
8638 | 8981 | ||
8639 | #ifdef CONFIG_NUMA | 8982 | #ifdef CONFIG_NUMA |
8640 | for (i = 0; i < nr_node_ids; i++) | 8983 | for (i = 0; i < nr_node_ids; i++) |
8641 | init_numa_sched_groups_power(sched_group_nodes[i]); | 8984 | init_numa_sched_groups_power(d.sched_group_nodes[i]); |
8642 | 8985 | ||
8643 | if (sd_allnodes) { | 8986 | if (d.sd_allnodes) { |
8644 | struct sched_group *sg; | 8987 | struct sched_group *sg; |
8645 | 8988 | ||
8646 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, | 8989 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, |
8647 | tmpmask); | 8990 | d.tmpmask); |
8648 | init_numa_sched_groups_power(sg); | 8991 | init_numa_sched_groups_power(sg); |
8649 | } | 8992 | } |
8650 | #endif | 8993 | #endif |
8651 | 8994 | ||
8652 | /* Attach the domains */ | 8995 | /* Attach the domains */ |
8653 | for_each_cpu(i, cpu_map) { | 8996 | for_each_cpu(i, cpu_map) { |
8654 | struct sched_domain *sd; | ||
8655 | #ifdef CONFIG_SCHED_SMT | 8997 | #ifdef CONFIG_SCHED_SMT |
8656 | sd = &per_cpu(cpu_domains, i).sd; | 8998 | sd = &per_cpu(cpu_domains, i).sd; |
8657 | #elif defined(CONFIG_SCHED_MC) | 8999 | #elif defined(CONFIG_SCHED_MC) |
@@ -8659,44 +9001,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
8659 | #else | 9001 | #else |
8660 | sd = &per_cpu(phys_domains, i).sd; | 9002 | sd = &per_cpu(phys_domains, i).sd; |
8661 | #endif | 9003 | #endif |
8662 | cpu_attach_domain(sd, rd, i); | 9004 | cpu_attach_domain(sd, d.rd, i); |
8663 | } | 9005 | } |
8664 | 9006 | ||
8665 | err = 0; | 9007 | d.sched_group_nodes = NULL; /* don't free this we still need it */ |
8666 | 9008 | __free_domain_allocs(&d, sa_tmpmask, cpu_map); | |
8667 | free_tmpmask: | 9009 | return 0; |
8668 | free_cpumask_var(tmpmask); | ||
8669 | free_send_covered: | ||
8670 | free_cpumask_var(send_covered); | ||
8671 | free_this_core_map: | ||
8672 | free_cpumask_var(this_core_map); | ||
8673 | free_this_sibling_map: | ||
8674 | free_cpumask_var(this_sibling_map); | ||
8675 | free_nodemask: | ||
8676 | free_cpumask_var(nodemask); | ||
8677 | free_notcovered: | ||
8678 | #ifdef CONFIG_NUMA | ||
8679 | free_cpumask_var(notcovered); | ||
8680 | free_covered: | ||
8681 | free_cpumask_var(covered); | ||
8682 | free_domainspan: | ||
8683 | free_cpumask_var(domainspan); | ||
8684 | out: | ||
8685 | #endif | ||
8686 | return err; | ||
8687 | |||
8688 | free_sched_groups: | ||
8689 | #ifdef CONFIG_NUMA | ||
8690 | kfree(sched_group_nodes); | ||
8691 | #endif | ||
8692 | goto free_tmpmask; | ||
8693 | 9010 | ||
8694 | #ifdef CONFIG_NUMA | ||
8695 | error: | 9011 | error: |
8696 | free_sched_groups(cpu_map, tmpmask); | 9012 | __free_domain_allocs(&d, alloc_state, cpu_map); |
8697 | free_rootdomain(rd); | 9013 | return -ENOMEM; |
8698 | goto free_tmpmask; | ||
8699 | #endif | ||
8700 | } | 9014 | } |
8701 | 9015 | ||
8702 | static int build_sched_domains(const struct cpumask *cpu_map) | 9016 | static int build_sched_domains(const struct cpumask *cpu_map) |
@@ -9304,11 +9618,11 @@ void __init sched_init(void) | |||
9304 | * system cpu resource, based on the weight assigned to root | 9618 | * system cpu resource, based on the weight assigned to root |
9305 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished | 9619 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished |
9306 | * by letting tasks of init_task_group sit in a separate cfs_rq | 9620 | * by letting tasks of init_task_group sit in a separate cfs_rq |
9307 | * (init_cfs_rq) and having one entity represent this group of | 9621 | * (init_tg_cfs_rq) and having one entity represent this group of |
9308 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). | 9622 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). |
9309 | */ | 9623 | */ |
9310 | init_tg_cfs_entry(&init_task_group, | 9624 | init_tg_cfs_entry(&init_task_group, |
9311 | &per_cpu(init_cfs_rq, i), | 9625 | &per_cpu(init_tg_cfs_rq, i), |
9312 | &per_cpu(init_sched_entity, i), i, 1, | 9626 | &per_cpu(init_sched_entity, i), i, 1, |
9313 | root_task_group.se[i]); | 9627 | root_task_group.se[i]); |
9314 | 9628 | ||
@@ -9334,6 +9648,7 @@ void __init sched_init(void) | |||
9334 | #ifdef CONFIG_SMP | 9648 | #ifdef CONFIG_SMP |
9335 | rq->sd = NULL; | 9649 | rq->sd = NULL; |
9336 | rq->rd = NULL; | 9650 | rq->rd = NULL; |
9651 | rq->post_schedule = 0; | ||
9337 | rq->active_balance = 0; | 9652 | rq->active_balance = 0; |
9338 | rq->next_balance = jiffies; | 9653 | rq->next_balance = jiffies; |
9339 | rq->push_cpu = 0; | 9654 | rq->push_cpu = 0; |
@@ -9398,13 +9713,20 @@ void __init sched_init(void) | |||
9398 | } | 9713 | } |
9399 | 9714 | ||
9400 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 9715 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
9401 | void __might_sleep(char *file, int line) | 9716 | static inline int preempt_count_equals(int preempt_offset) |
9717 | { | ||
9718 | int nested = preempt_count() & ~PREEMPT_ACTIVE; | ||
9719 | |||
9720 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); | ||
9721 | } | ||
9722 | |||
9723 | void __might_sleep(char *file, int line, int preempt_offset) | ||
9402 | { | 9724 | { |
9403 | #ifdef in_atomic | 9725 | #ifdef in_atomic |
9404 | static unsigned long prev_jiffy; /* ratelimiting */ | 9726 | static unsigned long prev_jiffy; /* ratelimiting */ |
9405 | 9727 | ||
9406 | if ((!in_atomic() && !irqs_disabled()) || | 9728 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || |
9407 | system_state != SYSTEM_RUNNING || oops_in_progress) | 9729 | system_state != SYSTEM_RUNNING || oops_in_progress) |
9408 | return; | 9730 | return; |
9409 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) | 9731 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) |
9410 | return; | 9732 | return; |
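The new preempt_offset argument lets a caller that legitimately holds a known amount of preempt_count (for example a lock it is about to drop) tell __might_sleep() what nesting level to expect; the warning now fires only when the actual count differs from that expectation. A userspace model of just the comparison — the constants and the fake counter are assumptions standing in for the per-task preempt_count and PREEMPT_INATOMIC_BASE:

#include <stdio.h>

#define PREEMPT_ACTIVE		0x10000000	/* illustrative; real value is arch-specific */
#define PREEMPT_INATOMIC_BASE	0		/* assumption for this model */

static int fake_preempt_count;			/* per-task in the real kernel */

static int preempt_count_equals(int preempt_offset)
{
	int nested = fake_preempt_count & ~PREEMPT_ACTIVE;

	return nested == PREEMPT_INATOMIC_BASE + preempt_offset;
}

int main(void)
{
	fake_preempt_count = 1;				/* e.g. one spinlock held */
	printf("%d\n", preempt_count_equals(0));	/* 0 -> __might_sleep() would warn */
	printf("%d\n", preempt_count_equals(1));	/* 1 -> caller declared the held lock */
	return 0;
}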
@@ -10581,3 +10903,113 @@ struct cgroup_subsys cpuacct_subsys = { | |||
10581 | .subsys_id = cpuacct_subsys_id, | 10903 | .subsys_id = cpuacct_subsys_id, |
10582 | }; | 10904 | }; |
10583 | #endif /* CONFIG_CGROUP_CPUACCT */ | 10905 | #endif /* CONFIG_CGROUP_CPUACCT */ |
10906 | |||
10907 | #ifndef CONFIG_SMP | ||
10908 | |||
10909 | int rcu_expedited_torture_stats(char *page) | ||
10910 | { | ||
10911 | return 0; | ||
10912 | } | ||
10913 | EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); | ||
10914 | |||
10915 | void synchronize_sched_expedited(void) | ||
10916 | { | ||
10917 | } | ||
10918 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
10919 | |||
10920 | #else /* #ifndef CONFIG_SMP */ | ||
10921 | |||
10922 | static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); | ||
10923 | static DEFINE_MUTEX(rcu_sched_expedited_mutex); | ||
10924 | |||
10925 | #define RCU_EXPEDITED_STATE_POST -2 | ||
10926 | #define RCU_EXPEDITED_STATE_IDLE -1 | ||
10927 | |||
10928 | static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; | ||
10929 | |||
10930 | int rcu_expedited_torture_stats(char *page) | ||
10931 | { | ||
10932 | int cnt = 0; | ||
10933 | int cpu; | ||
10934 | |||
10935 | cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); | ||
10936 | for_each_online_cpu(cpu) { | ||
10937 | cnt += sprintf(&page[cnt], " %d:%d", | ||
10938 | cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); | ||
10939 | } | ||
10940 | cnt += sprintf(&page[cnt], "\n"); | ||
10941 | return cnt; | ||
10942 | } | ||
10943 | EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); | ||
10944 | |||
10945 | static long synchronize_sched_expedited_count; | ||
10946 | |||
10947 | /* | ||
10948 | * Wait for an rcu-sched grace period to elapse, but use "big hammer" | ||
10949 | * approach to force grace period to end quickly. This consumes | ||
10950 | * significant time on all CPUs, and is thus not recommended for | ||
10951 | * any sort of common-case code. | ||
10952 | * | ||
10953 | * Note that it is illegal to call this function while holding any | ||
10954 | * lock that is acquired by a CPU-hotplug notifier. Failing to | ||
10955 | * observe this restriction will result in deadlock. | ||
10956 | */ | ||
10957 | void synchronize_sched_expedited(void) | ||
10958 | { | ||
10959 | int cpu; | ||
10960 | unsigned long flags; | ||
10961 | bool need_full_sync = 0; | ||
10962 | struct rq *rq; | ||
10963 | struct migration_req *req; | ||
10964 | long snap; | ||
10965 | int trycount = 0; | ||
10966 | |||
10967 | smp_mb(); /* ensure prior mod happens before capturing snap. */ | ||
10968 | snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; | ||
10969 | get_online_cpus(); | ||
10970 | while (!mutex_trylock(&rcu_sched_expedited_mutex)) { | ||
10971 | put_online_cpus(); | ||
10972 | if (trycount++ < 10) | ||
10973 | udelay(trycount * num_online_cpus()); | ||
10974 | else { | ||
10975 | synchronize_sched(); | ||
10976 | return; | ||
10977 | } | ||
10978 | if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { | ||
10979 | smp_mb(); /* ensure test happens before caller kfree */ | ||
10980 | return; | ||
10981 | } | ||
10982 | get_online_cpus(); | ||
10983 | } | ||
10984 | rcu_expedited_state = RCU_EXPEDITED_STATE_POST; | ||
10985 | for_each_online_cpu(cpu) { | ||
10986 | rq = cpu_rq(cpu); | ||
10987 | req = &per_cpu(rcu_migration_req, cpu); | ||
10988 | init_completion(&req->done); | ||
10989 | req->task = NULL; | ||
10990 | req->dest_cpu = RCU_MIGRATION_NEED_QS; | ||
10991 | spin_lock_irqsave(&rq->lock, flags); | ||
10992 | list_add(&req->list, &rq->migration_queue); | ||
10993 | spin_unlock_irqrestore(&rq->lock, flags); | ||
10994 | wake_up_process(rq->migration_thread); | ||
10995 | } | ||
10996 | for_each_online_cpu(cpu) { | ||
10997 | rcu_expedited_state = cpu; | ||
10998 | req = &per_cpu(rcu_migration_req, cpu); | ||
10999 | rq = cpu_rq(cpu); | ||
11000 | wait_for_completion(&req->done); | ||
11001 | spin_lock_irqsave(&rq->lock, flags); | ||
11002 | if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) | ||
11003 | need_full_sync = 1; | ||
11004 | req->dest_cpu = RCU_MIGRATION_IDLE; | ||
11005 | spin_unlock_irqrestore(&rq->lock, flags); | ||
11006 | } | ||
11007 | rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; | ||
11008 | mutex_unlock(&rcu_sched_expedited_mutex); | ||
11009 | put_online_cpus(); | ||
11010 | if (need_full_sync) | ||
11011 | synchronize_sched(); | ||
11012 | } | ||
11013 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
11014 | |||
11015 | #endif /* #else #ifndef CONFIG_SMP */ | ||
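synchronize_sched_expedited() combines two ideas: a mutex-guarded "big hammer" that queues a migration request on every CPU's runqueue to force each CPU through the scheduler, and a counter snapshot that lets a caller who loses the trylock race return early once a grace period that began after its snapshot has completed. A much-simplified userspace model of the retry/piggyback part only — the pthread mutex, the counter handling and do_grace_period() are stand-ins, and the model assumes the counter is bumped once per completed expedited grace period:

#include <pthread.h>
#include <sched.h>

static pthread_mutex_t gp_mutex = PTHREAD_MUTEX_INITIALIZER;
static volatile long gp_completed;	/* crude stand-in for ACCESS_ONCE()/smp_mb() */

static void do_grace_period(void)
{
	/* stand-in for the per-cpu migration-request machinery above */
}

void expedited_sync(void)
{
	long snap = gp_completed + 1;	/* first grace period that fully covers us */

	while (pthread_mutex_trylock(&gp_mutex) != 0) {
		if (gp_completed - snap > 0)
			return;		/* a GP began and ended after our snapshot */
		sched_yield();		/* back off, then retry the lock; the kernel
					 * additionally bounds the retries and falls
					 * back to synchronize_sched() */
	}
	do_grace_period();
	gp_completed++;
	pthread_mutex_unlock(&gp_mutex);
}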