Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 823
1 file changed, 503 insertions(+), 320 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 1b59e265273..da1edc8277d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -309,8 +309,8 @@ void set_tg_uid(struct user_struct *user) | |||
309 | 309 | ||
310 | /* | 310 | /* |
311 | * Root task group. | 311 | * Root task group. |
312 | * Every UID task group (including init_task_group aka UID-0) will | 312 | * Every UID task group (including init_task_group aka UID-0) will |
313 | * be a child to this group. | 313 | * be a child to this group. |
314 | */ | 314 | */ |
315 | struct task_group root_task_group; | 315 | struct task_group root_task_group; |
316 | 316 | ||
@@ -318,7 +318,7 @@ struct task_group root_task_group; | |||
318 | /* Default task group's sched entity on each cpu */ | 318 | /* Default task group's sched entity on each cpu */ |
319 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | 319 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); |
320 | /* Default task group's cfs_rq on each cpu */ | 320 | /* Default task group's cfs_rq on each cpu */ |
321 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 321 | static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp; |
322 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 322 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
323 | 323 | ||
324 | #ifdef CONFIG_RT_GROUP_SCHED | 324 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -616,6 +616,7 @@ struct rq { | |||
616 | 616 | ||
617 | unsigned char idle_at_tick; | 617 | unsigned char idle_at_tick; |
618 | /* For active balancing */ | 618 | /* For active balancing */ |
619 | int post_schedule; | ||
619 | int active_balance; | 620 | int active_balance; |
620 | int push_cpu; | 621 | int push_cpu; |
621 | /* cpu of this runqueue: */ | 622 | /* cpu of this runqueue: */ |
@@ -693,6 +694,7 @@ static inline int cpu_of(struct rq *rq) | |||
693 | #define this_rq() (&__get_cpu_var(runqueues)) | 694 | #define this_rq() (&__get_cpu_var(runqueues)) |
694 | #define task_rq(p) cpu_rq(task_cpu(p)) | 695 | #define task_rq(p) cpu_rq(task_cpu(p)) |
695 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 696 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
697 | #define raw_rq() (&__raw_get_cpu_var(runqueues)) | ||
696 | 698 | ||
697 | inline void update_rq_clock(struct rq *rq) | 699 | inline void update_rq_clock(struct rq *rq) |
698 | { | 700 | { |
@@ -1513,28 +1515,35 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
1513 | 1515 | ||
1514 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1516 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1515 | 1517 | ||
1518 | struct update_shares_data { | ||
1519 | unsigned long rq_weight[NR_CPUS]; | ||
1520 | }; | ||
1521 | |||
1522 | static DEFINE_PER_CPU(struct update_shares_data, update_shares_data); | ||
1523 | |||
1516 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | 1524 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); |
1517 | 1525 | ||
1518 | /* | 1526 | /* |
1519 | * Calculate and set the cpu's group shares. | 1527 | * Calculate and set the cpu's group shares. |
1520 | */ | 1528 | */ |
1521 | static void | 1529 | static void update_group_shares_cpu(struct task_group *tg, int cpu, |
1522 | update_group_shares_cpu(struct task_group *tg, int cpu, | 1530 | unsigned long sd_shares, |
1523 | unsigned long sd_shares, unsigned long sd_rq_weight) | 1531 | unsigned long sd_rq_weight, |
1532 | struct update_shares_data *usd) | ||
1524 | { | 1533 | { |
1525 | unsigned long shares; | 1534 | unsigned long shares, rq_weight; |
1526 | unsigned long rq_weight; | 1535 | int boost = 0; |
1527 | |||
1528 | if (!tg->se[cpu]) | ||
1529 | return; | ||
1530 | 1536 | ||
1531 | rq_weight = tg->cfs_rq[cpu]->rq_weight; | 1537 | rq_weight = usd->rq_weight[cpu]; |
1538 | if (!rq_weight) { | ||
1539 | boost = 1; | ||
1540 | rq_weight = NICE_0_LOAD; | ||
1541 | } | ||
1532 | 1542 | ||
1533 | /* | 1543 | /* |
1534 | * \Sum shares * rq_weight | 1544 | * \Sum_j shares_j * rq_weight_i |
1535 | * shares = ----------------------- | 1545 | * shares_i = ----------------------------- |
1536 | * \Sum rq_weight | 1546 | * \Sum_j rq_weight_j |
1537 | * | ||
1538 | */ | 1547 | */ |
1539 | shares = (sd_shares * rq_weight) / sd_rq_weight; | 1548 | shares = (sd_shares * rq_weight) / sd_rq_weight; |
1540 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); | 1549 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); |
@@ -1545,8 +1554,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu, | |||
1545 | unsigned long flags; | 1554 | unsigned long flags; |
1546 | 1555 | ||
1547 | spin_lock_irqsave(&rq->lock, flags); | 1556 | spin_lock_irqsave(&rq->lock, flags); |
1548 | tg->cfs_rq[cpu]->shares = shares; | 1557 | tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; |
1549 | 1558 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | |
1550 | __set_se_shares(tg->se[cpu], shares); | 1559 | __set_se_shares(tg->se[cpu], shares); |
1551 | spin_unlock_irqrestore(&rq->lock, flags); | 1560 | spin_unlock_irqrestore(&rq->lock, flags); |
1552 | } | 1561 | } |
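The reworked formula above hands out the group's sched-domain shares to each cpu in proportion to that cpu's runqueue weight, clamped to the allowed range, with an idle cpu treated as if it ran one nice-0 task. A minimal userspace sketch of the arithmetic; the bound constants are illustrative placeholders, not the kernel's exact MIN_SHARES/MAX_SHARES:

#include <stdio.h>

#define NICE_0_LOAD     1024UL          /* weight of one nice-0 task */
#define MIN_SHARES      2UL             /* placeholder bounds */
#define MAX_SHARES      (1UL << 18)

/* shares_i = sd_shares * rq_weight_i / \Sum_j rq_weight_j, clamped */
static unsigned long group_shares(unsigned long sd_shares,
                                  unsigned long rq_weight,
                                  unsigned long sd_rq_weight)
{
        unsigned long shares;

        if (!rq_weight)                 /* idle cpu: pretend one nice-0 task */
                rq_weight = NICE_0_LOAD;

        shares = sd_shares * rq_weight / sd_rq_weight;
        if (shares < MIN_SHARES)
                shares = MIN_SHARES;
        if (shares > MAX_SHARES)
                shares = MAX_SHARES;
        return shares;
}

int main(void)
{
        /* a group with 1024 shares, cpus weighted 2048 and 1024 (sum 3072) */
        printf("%lu %lu\n", group_shares(1024, 2048, 3072),
                            group_shares(1024, 1024, 3072));
        return 0;
}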
@@ -1559,22 +1568,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu, | |||
1559 | */ | 1568 | */ |
1560 | static int tg_shares_up(struct task_group *tg, void *data) | 1569 | static int tg_shares_up(struct task_group *tg, void *data) |
1561 | { | 1570 | { |
1562 | unsigned long weight, rq_weight = 0; | 1571 | unsigned long weight, rq_weight = 0, shares = 0; |
1563 | unsigned long shares = 0; | 1572 | struct update_shares_data *usd; |
1564 | struct sched_domain *sd = data; | 1573 | struct sched_domain *sd = data; |
1574 | unsigned long flags; | ||
1565 | int i; | 1575 | int i; |
1566 | 1576 | ||
1577 | if (!tg->se[0]) | ||
1578 | return 0; | ||
1579 | |||
1580 | local_irq_save(flags); | ||
1581 | usd = &__get_cpu_var(update_shares_data); | ||
1582 | |||
1567 | for_each_cpu(i, sched_domain_span(sd)) { | 1583 | for_each_cpu(i, sched_domain_span(sd)) { |
1584 | weight = tg->cfs_rq[i]->load.weight; | ||
1585 | usd->rq_weight[i] = weight; | ||
1586 | |||
1568 | /* | 1587 | /* |
1569 | * If there are currently no tasks on the cpu pretend there | 1588 | * If there are currently no tasks on the cpu pretend there |
1570 | * is one of average load so that when a new task gets to | 1589 | * is one of average load so that when a new task gets to |
1571 | * run here it will not get delayed by group starvation. | 1590 | * run here it will not get delayed by group starvation. |
1572 | */ | 1591 | */ |
1573 | weight = tg->cfs_rq[i]->load.weight; | ||
1574 | if (!weight) | 1592 | if (!weight) |
1575 | weight = NICE_0_LOAD; | 1593 | weight = NICE_0_LOAD; |
1576 | 1594 | ||
1577 | tg->cfs_rq[i]->rq_weight = weight; | ||
1578 | rq_weight += weight; | 1595 | rq_weight += weight; |
1579 | shares += tg->cfs_rq[i]->shares; | 1596 | shares += tg->cfs_rq[i]->shares; |
1580 | } | 1597 | } |
@@ -1586,7 +1603,9 @@ static int tg_shares_up(struct task_group *tg, void *data) | |||
1586 | shares = tg->shares; | 1603 | shares = tg->shares; |
1587 | 1604 | ||
1588 | for_each_cpu(i, sched_domain_span(sd)) | 1605 | for_each_cpu(i, sched_domain_span(sd)) |
1589 | update_group_shares_cpu(tg, i, shares, rq_weight); | 1606 | update_group_shares_cpu(tg, i, shares, rq_weight, usd); |
1607 | |||
1608 | local_irq_restore(flags); | ||
1590 | 1609 | ||
1591 | return 0; | 1610 | return 0; |
1592 | } | 1611 | } |
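tg_shares_up() now stashes the per-cpu runqueue weights in the per-cpu update_shares_data buffer instead of writing them into each cfs_rq up front. The likely rationale, an assumption rather than something stated in this hunk, is to keep an NR_CPUS-sized array off the stack, with interrupts disabled so the buffer cannot be clobbered by a reentrant user on the same cpu. The general pattern, as a sketch:

struct big_scratch {
        unsigned long val[NR_CPUS];     /* too large to live on the stack */
};
static DEFINE_PER_CPU(struct big_scratch, big_scratch);

static void use_big_scratch(void)
{
        struct big_scratch *s;
        unsigned long flags;

        local_irq_save(flags);          /* sole user on this cpu until restore */
        s = &__get_cpu_var(big_scratch);
        /* ... fill s->val[] for a cpumask, then consume it ... */
        local_irq_restore(flags);
}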
@@ -1616,8 +1635,14 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
1616 | 1635 | ||
1617 | static void update_shares(struct sched_domain *sd) | 1636 | static void update_shares(struct sched_domain *sd) |
1618 | { | 1637 | { |
1619 | u64 now = cpu_clock(raw_smp_processor_id()); | 1638 | s64 elapsed; |
1620 | s64 elapsed = now - sd->last_update; | 1639 | u64 now; |
1640 | |||
1641 | if (root_task_group_empty()) | ||
1642 | return; | ||
1643 | |||
1644 | now = cpu_clock(raw_smp_processor_id()); | ||
1645 | elapsed = now - sd->last_update; | ||
1621 | 1646 | ||
1622 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | 1647 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { |
1623 | sd->last_update = now; | 1648 | sd->last_update = now; |
@@ -1627,6 +1652,9 @@ static void update_shares(struct sched_domain *sd) | |||
1627 | 1652 | ||
1628 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | 1653 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) |
1629 | { | 1654 | { |
1655 | if (root_task_group_empty()) | ||
1656 | return; | ||
1657 | |||
1630 | spin_unlock(&rq->lock); | 1658 | spin_unlock(&rq->lock); |
1631 | update_shares(sd); | 1659 | update_shares(sd); |
1632 | spin_lock(&rq->lock); | 1660 | spin_lock(&rq->lock); |
@@ -1634,6 +1662,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | |||
1634 | 1662 | ||
1635 | static void update_h_load(long cpu) | 1663 | static void update_h_load(long cpu) |
1636 | { | 1664 | { |
1665 | if (root_task_group_empty()) | ||
1666 | return; | ||
1667 | |||
1637 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 1668 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
1638 | } | 1669 | } |
1639 | 1670 | ||
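The new early returns in update_shares(), update_shares_locked() and update_h_load() skip the whole group-shares walk when no task groups besides the root exist. The helper itself is not visible in this hunk; it presumably amounts to something like:

static inline int root_task_group_empty(void)
{
        return list_empty(&root_task_group.children);
}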
@@ -2637,9 +2668,32 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2637 | set_task_cpu(p, cpu); | 2668 | set_task_cpu(p, cpu); |
2638 | 2669 | ||
2639 | /* | 2670 | /* |
2640 | * Make sure we do not leak PI boosting priority to the child: | 2671 | * Make sure we do not leak PI boosting priority to the child. |
2641 | */ | 2672 | */ |
2642 | p->prio = current->normal_prio; | 2673 | p->prio = current->normal_prio; |
2674 | |||
2675 | /* | ||
2676 | * Revert to default priority/policy on fork if requested. | ||
2677 | */ | ||
2678 | if (unlikely(p->sched_reset_on_fork)) { | ||
2679 | if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) | ||
2680 | p->policy = SCHED_NORMAL; | ||
2681 | |||
2682 | if (p->normal_prio < DEFAULT_PRIO) | ||
2683 | p->prio = DEFAULT_PRIO; | ||
2684 | |||
2685 | if (PRIO_TO_NICE(p->static_prio) < 0) { | ||
2686 | p->static_prio = NICE_TO_PRIO(0); | ||
2687 | set_load_weight(p); | ||
2688 | } | ||
2689 | |||
2690 | /* | ||
2691 | * We don't need the reset flag anymore after the fork. It has | ||
2692 | * fulfilled its duty: | ||
2693 | */ | ||
2694 | p->sched_reset_on_fork = 0; | ||
2695 | } | ||
2696 | |||
2643 | if (!rt_prio(p->prio)) | 2697 | if (!rt_prio(p->prio)) |
2644 | p->sched_class = &fair_sched_class; | 2698 | p->sched_class = &fair_sched_class; |
2645 | 2699 | ||
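The sched_reset_on_fork handling above means a child forked from a realtime or negatively niced parent starts out as a plain SCHED_NORMAL, nice-0 task when the parent asked for it. The flag is requested from userspace by OR-ing SCHED_RESET_ON_FORK into the policy passed to sched_setscheduler(), as handled in the __sched_setscheduler() hunks further down. A small, hedged userspace demo; it assumes root or CAP_SYS_NICE for SCHED_FIFO and that the libc wrapper passes the policy value through unchanged:

#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>

#ifndef SCHED_RESET_ON_FORK
#define SCHED_RESET_ON_FORK     0x40000000      /* kernel ABI value of this era */
#endif

int main(void)
{
        struct sched_param sp = { .sched_priority = 10 };

        if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp))
                perror("sched_setscheduler");

        if (fork() == 0) {
                /* child reverts to SCHED_NORMAL (0) at default priority */
                printf("child policy:  %d\n", sched_getscheduler(0));
                _exit(0);
        }
        wait(NULL);
        /* parent keeps SCHED_FIFO (1); mask off the reported flag bit */
        printf("parent policy: %d\n",
               sched_getscheduler(0) & ~SCHED_RESET_ON_FORK);
        return 0;
}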
@@ -2796,12 +2850,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2796 | { | 2850 | { |
2797 | struct mm_struct *mm = rq->prev_mm; | 2851 | struct mm_struct *mm = rq->prev_mm; |
2798 | long prev_state; | 2852 | long prev_state; |
2799 | #ifdef CONFIG_SMP | ||
2800 | int post_schedule = 0; | ||
2801 | |||
2802 | if (current->sched_class->needs_post_schedule) | ||
2803 | post_schedule = current->sched_class->needs_post_schedule(rq); | ||
2804 | #endif | ||
2805 | 2853 | ||
2806 | rq->prev_mm = NULL; | 2854 | rq->prev_mm = NULL; |
2807 | 2855 | ||
@@ -2820,10 +2868,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2820 | finish_arch_switch(prev); | 2868 | finish_arch_switch(prev); |
2821 | perf_counter_task_sched_in(current, cpu_of(rq)); | 2869 | perf_counter_task_sched_in(current, cpu_of(rq)); |
2822 | finish_lock_switch(rq, prev); | 2870 | finish_lock_switch(rq, prev); |
2823 | #ifdef CONFIG_SMP | ||
2824 | if (post_schedule) | ||
2825 | current->sched_class->post_schedule(rq); | ||
2826 | #endif | ||
2827 | 2871 | ||
2828 | fire_sched_in_preempt_notifiers(current); | 2872 | fire_sched_in_preempt_notifiers(current); |
2829 | if (mm) | 2873 | if (mm) |
@@ -2838,6 +2882,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2838 | } | 2882 | } |
2839 | } | 2883 | } |
2840 | 2884 | ||
2885 | #ifdef CONFIG_SMP | ||
2886 | |||
2887 | /* assumes rq->lock is held */ | ||
2888 | static inline void pre_schedule(struct rq *rq, struct task_struct *prev) | ||
2889 | { | ||
2890 | if (prev->sched_class->pre_schedule) | ||
2891 | prev->sched_class->pre_schedule(rq, prev); | ||
2892 | } | ||
2893 | |||
2894 | /* rq->lock is NOT held, but preemption is disabled */ | ||
2895 | static inline void post_schedule(struct rq *rq) | ||
2896 | { | ||
2897 | if (rq->post_schedule) { | ||
2898 | unsigned long flags; | ||
2899 | |||
2900 | spin_lock_irqsave(&rq->lock, flags); | ||
2901 | if (rq->curr->sched_class->post_schedule) | ||
2902 | rq->curr->sched_class->post_schedule(rq); | ||
2903 | spin_unlock_irqrestore(&rq->lock, flags); | ||
2904 | |||
2905 | rq->post_schedule = 0; | ||
2906 | } | ||
2907 | } | ||
2908 | |||
2909 | #else | ||
2910 | |||
2911 | static inline void pre_schedule(struct rq *rq, struct task_struct *p) | ||
2912 | { | ||
2913 | } | ||
2914 | |||
2915 | static inline void post_schedule(struct rq *rq) | ||
2916 | { | ||
2917 | } | ||
2918 | |||
2919 | #endif | ||
2920 | |||
2841 | /** | 2921 | /** |
2842 | * schedule_tail - first thing a freshly forked thread must call. | 2922 | * schedule_tail - first thing a freshly forked thread must call. |
2843 | * @prev: the thread we just switched away from. | 2923 | * @prev: the thread we just switched away from. |
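With the helpers above, finish_task_switch() no longer asks the scheduling class under rq->lock whether post-schedule work is pending; the class flags it in rq->post_schedule, and the work runs only after the lock has been dropped, retaking it just for that case. A userspace analogue of the flag-deferral pattern, a sketch rather than kernel code:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;
static int post_schedule_needed;

static void finish_switch(void)
{
        pthread_mutex_lock(&rq_lock);
        /* ... switch bookkeeping; a "sched class" decides more work is due ... */
        post_schedule_needed = 1;
        pthread_mutex_unlock(&rq_lock);
}

static void post_schedule(void)
{
        if (post_schedule_needed) {     /* cheap unlocked check */
                pthread_mutex_lock(&rq_lock);
                printf("balancing work, off the context-switch fast path\n");
                post_schedule_needed = 0;
                pthread_mutex_unlock(&rq_lock);
        }
}

int main(void)
{
        finish_switch();
        post_schedule();        /* mirrors the call added to schedule_tail()/schedule() */
        return 0;
}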
@@ -2848,6 +2928,13 @@ asmlinkage void schedule_tail(struct task_struct *prev) | |||
2848 | struct rq *rq = this_rq(); | 2928 | struct rq *rq = this_rq(); |
2849 | 2929 | ||
2850 | finish_task_switch(rq, prev); | 2930 | finish_task_switch(rq, prev); |
2931 | |||
2932 | /* | ||
2933 | * FIXME: do we need to worry about rq being invalidated by the | ||
2934 | * task_switch? | ||
2935 | */ | ||
2936 | post_schedule(rq); | ||
2937 | |||
2851 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | 2938 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW |
2852 | /* In this case, finish_task_switch does not reenable preemption */ | 2939 | /* In this case, finish_task_switch does not reenable preemption */ |
2853 | preempt_enable(); | 2940 | preempt_enable(); |
@@ -3379,9 +3466,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
3379 | { | 3466 | { |
3380 | const struct sched_class *class; | 3467 | const struct sched_class *class; |
3381 | 3468 | ||
3382 | for (class = sched_class_highest; class; class = class->next) | 3469 | for_each_class(class) { |
3383 | if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) | 3470 | if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) |
3384 | return 1; | 3471 | return 1; |
3472 | } | ||
3385 | 3473 | ||
3386 | return 0; | 3474 | return 0; |
3387 | } | 3475 | } |
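for_each_class() replaces the open-coded class walk; assuming the usual definition elsewhere in sched.c, the iterator expands to exactly the loop it replaces:

#define for_each_class(class) \
        for (class = sched_class_highest; class; class = class->next)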
@@ -5349,10 +5437,7 @@ need_resched_nonpreemptible: | |||
5349 | switch_count = &prev->nvcsw; | 5437 | switch_count = &prev->nvcsw; |
5350 | } | 5438 | } |
5351 | 5439 | ||
5352 | #ifdef CONFIG_SMP | 5440 | pre_schedule(rq, prev); |
5353 | if (prev->sched_class->pre_schedule) | ||
5354 | prev->sched_class->pre_schedule(rq, prev); | ||
5355 | #endif | ||
5356 | 5441 | ||
5357 | if (unlikely(!rq->nr_running)) | 5442 | if (unlikely(!rq->nr_running)) |
5358 | idle_balance(cpu, rq); | 5443 | idle_balance(cpu, rq); |
@@ -5378,6 +5463,8 @@ need_resched_nonpreemptible: | |||
5378 | } else | 5463 | } else |
5379 | spin_unlock_irq(&rq->lock); | 5464 | spin_unlock_irq(&rq->lock); |
5380 | 5465 | ||
5466 | post_schedule(rq); | ||
5467 | |||
5381 | if (unlikely(reacquire_kernel_lock(current) < 0)) | 5468 | if (unlikely(reacquire_kernel_lock(current) < 0)) |
5382 | goto need_resched_nonpreemptible; | 5469 | goto need_resched_nonpreemptible; |
5383 | 5470 | ||
@@ -6123,17 +6210,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy, | |||
6123 | unsigned long flags; | 6210 | unsigned long flags; |
6124 | const struct sched_class *prev_class = p->sched_class; | 6211 | const struct sched_class *prev_class = p->sched_class; |
6125 | struct rq *rq; | 6212 | struct rq *rq; |
6213 | int reset_on_fork; | ||
6126 | 6214 | ||
6127 | /* may grab non-irq protected spin_locks */ | 6215 | /* may grab non-irq protected spin_locks */ |
6128 | BUG_ON(in_interrupt()); | 6216 | BUG_ON(in_interrupt()); |
6129 | recheck: | 6217 | recheck: |
6130 | /* double check policy once rq lock held */ | 6218 | /* double check policy once rq lock held */ |
6131 | if (policy < 0) | 6219 | if (policy < 0) { |
6220 | reset_on_fork = p->sched_reset_on_fork; | ||
6132 | policy = oldpolicy = p->policy; | 6221 | policy = oldpolicy = p->policy; |
6133 | else if (policy != SCHED_FIFO && policy != SCHED_RR && | 6222 | } else { |
6134 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | 6223 | reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); |
6135 | policy != SCHED_IDLE) | 6224 | policy &= ~SCHED_RESET_ON_FORK; |
6136 | return -EINVAL; | 6225 | |
6226 | if (policy != SCHED_FIFO && policy != SCHED_RR && | ||
6227 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | ||
6228 | policy != SCHED_IDLE) | ||
6229 | return -EINVAL; | ||
6230 | } | ||
6231 | |||
6137 | /* | 6232 | /* |
6138 | * Valid priorities for SCHED_FIFO and SCHED_RR are | 6233 | * Valid priorities for SCHED_FIFO and SCHED_RR are |
6139 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, | 6234 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, |
@@ -6177,6 +6272,10 @@ recheck: | |||
6177 | /* can't change other user's priorities */ | 6272 | /* can't change other user's priorities */ |
6178 | if (!check_same_owner(p)) | 6273 | if (!check_same_owner(p)) |
6179 | return -EPERM; | 6274 | return -EPERM; |
6275 | |||
6276 | /* Normal users shall not reset the sched_reset_on_fork flag */ | ||
6277 | if (p->sched_reset_on_fork && !reset_on_fork) | ||
6278 | return -EPERM; | ||
6180 | } | 6279 | } |
6181 | 6280 | ||
6182 | if (user) { | 6281 | if (user) { |
@@ -6220,6 +6319,8 @@ recheck: | |||
6220 | if (running) | 6319 | if (running) |
6221 | p->sched_class->put_prev_task(rq, p); | 6320 | p->sched_class->put_prev_task(rq, p); |
6222 | 6321 | ||
6322 | p->sched_reset_on_fork = reset_on_fork; | ||
6323 | |||
6223 | oldprio = p->prio; | 6324 | oldprio = p->prio; |
6224 | __setscheduler(rq, p, policy, param->sched_priority); | 6325 | __setscheduler(rq, p, policy, param->sched_priority); |
6225 | 6326 | ||
@@ -6336,14 +6437,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) | |||
6336 | if (p) { | 6437 | if (p) { |
6337 | retval = security_task_getscheduler(p); | 6438 | retval = security_task_getscheduler(p); |
6338 | if (!retval) | 6439 | if (!retval) |
6339 | retval = p->policy; | 6440 | retval = p->policy |
6441 | | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); | ||
6340 | } | 6442 | } |
6341 | read_unlock(&tasklist_lock); | 6443 | read_unlock(&tasklist_lock); |
6342 | return retval; | 6444 | return retval; |
6343 | } | 6445 | } |
6344 | 6446 | ||
6345 | /** | 6447 | /** |
6346 | * sys_sched_getscheduler - get the RT priority of a thread | 6448 | * sys_sched_getparam - get the RT priority of a thread |
6347 | * @pid: the pid in question. | 6449 | * @pid: the pid in question. |
6348 | * @param: structure containing the RT priority. | 6450 | * @param: structure containing the RT priority. |
6349 | */ | 6451 | */ |
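Because sys_sched_getscheduler() now ORs the reset-on-fork bit into its return value, userspace that wants the plain policy has to mask it off. A small helper sketch; the flag value is the assumed ABI constant:

#include <sched.h>
#include <sys/types.h>

#ifndef SCHED_RESET_ON_FORK
#define SCHED_RESET_ON_FORK     0x40000000
#endif

static int get_policy(pid_t pid, int *reset_on_fork)
{
        int pol = sched_getscheduler(pid);

        if (pol < 0)
                return pol;                     /* -1, errno set */
        *reset_on_fork = !!(pol & SCHED_RESET_ON_FORK);
        return pol & ~SCHED_RESET_ON_FORK;      /* plain SCHED_* value */
}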
@@ -6571,19 +6673,9 @@ static inline int should_resched(void) | |||
6571 | 6673 | ||
6572 | static void __cond_resched(void) | 6674 | static void __cond_resched(void) |
6573 | { | 6675 | { |
6574 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 6676 | add_preempt_count(PREEMPT_ACTIVE); |
6575 | __might_sleep(__FILE__, __LINE__); | 6677 | schedule(); |
6576 | #endif | 6678 | sub_preempt_count(PREEMPT_ACTIVE); |
6577 | /* | ||
6578 | * The BKS might be reacquired before we have dropped | ||
6579 | * PREEMPT_ACTIVE, which could trigger a second | ||
6580 | * cond_resched() call. | ||
6581 | */ | ||
6582 | do { | ||
6583 | add_preempt_count(PREEMPT_ACTIVE); | ||
6584 | schedule(); | ||
6585 | sub_preempt_count(PREEMPT_ACTIVE); | ||
6586 | } while (need_resched()); | ||
6587 | } | 6679 | } |
6588 | 6680 | ||
6589 | int __sched _cond_resched(void) | 6681 | int __sched _cond_resched(void) |
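__cond_resched() drops both the open-coded __might_sleep() call and the BKL retry loop; the debug check presumably moves out into the header wrappers, which can now pass the preempt-count offset that a legitimately held lock contributes (see the three-argument __might_sleep() at the end of this diff). The companion include/linux/sched.h change is not part of this file, but plausibly looks something like:

#define cond_resched() ({                       \
        __might_sleep(__FILE__, __LINE__, 0);   \
        _cond_resched();                        \
})

#define cond_resched_lock(lock) ({                                      \
        __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);         \
        __cond_resched_lock(lock);                                      \
})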
@@ -6597,14 +6689,14 @@ int __sched _cond_resched(void) | |||
6597 | EXPORT_SYMBOL(_cond_resched); | 6689 | EXPORT_SYMBOL(_cond_resched); |
6598 | 6690 | ||
6599 | /* | 6691 | /* |
6600 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, | 6692 | * __cond_resched_lock() - if a reschedule is pending, drop the given lock, |
6601 | * call schedule, and on return reacquire the lock. | 6693 | * call schedule, and on return reacquire the lock. |
6602 | * | 6694 | * |
6603 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level | 6695 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level |
6604 | * operations here to prevent schedule() from being called twice (once via | 6696 | * operations here to prevent schedule() from being called twice (once via |
6605 | * spin_unlock(), once by hand). | 6697 | * spin_unlock(), once by hand). |
6606 | */ | 6698 | */ |
6607 | int cond_resched_lock(spinlock_t *lock) | 6699 | int __cond_resched_lock(spinlock_t *lock) |
6608 | { | 6700 | { |
6609 | int resched = should_resched(); | 6701 | int resched = should_resched(); |
6610 | int ret = 0; | 6702 | int ret = 0; |
@@ -6620,9 +6712,9 @@ int cond_resched_lock(spinlock_t *lock) | |||
6620 | } | 6712 | } |
6621 | return ret; | 6713 | return ret; |
6622 | } | 6714 | } |
6623 | EXPORT_SYMBOL(cond_resched_lock); | 6715 | EXPORT_SYMBOL(__cond_resched_lock); |
6624 | 6716 | ||
6625 | int __sched cond_resched_softirq(void) | 6717 | int __sched __cond_resched_softirq(void) |
6626 | { | 6718 | { |
6627 | BUG_ON(!in_softirq()); | 6719 | BUG_ON(!in_softirq()); |
6628 | 6720 | ||
@@ -6634,7 +6726,7 @@ int __sched cond_resched_softirq(void) | |||
6634 | } | 6726 | } |
6635 | return 0; | 6727 | return 0; |
6636 | } | 6728 | } |
6637 | EXPORT_SYMBOL(cond_resched_softirq); | 6729 | EXPORT_SYMBOL(__cond_resched_softirq); |
6638 | 6730 | ||
6639 | /** | 6731 | /** |
6640 | * yield - yield the current processor to other threads. | 6732 | * yield - yield the current processor to other threads. |
@@ -6658,11 +6750,13 @@ EXPORT_SYMBOL(yield); | |||
6658 | */ | 6750 | */ |
6659 | void __sched io_schedule(void) | 6751 | void __sched io_schedule(void) |
6660 | { | 6752 | { |
6661 | struct rq *rq = &__raw_get_cpu_var(runqueues); | 6753 | struct rq *rq = raw_rq(); |
6662 | 6754 | ||
6663 | delayacct_blkio_start(); | 6755 | delayacct_blkio_start(); |
6664 | atomic_inc(&rq->nr_iowait); | 6756 | atomic_inc(&rq->nr_iowait); |
6757 | current->in_iowait = 1; | ||
6665 | schedule(); | 6758 | schedule(); |
6759 | current->in_iowait = 0; | ||
6666 | atomic_dec(&rq->nr_iowait); | 6760 | atomic_dec(&rq->nr_iowait); |
6667 | delayacct_blkio_end(); | 6761 | delayacct_blkio_end(); |
6668 | } | 6762 | } |
@@ -6670,12 +6764,14 @@ EXPORT_SYMBOL(io_schedule); | |||
6670 | 6764 | ||
6671 | long __sched io_schedule_timeout(long timeout) | 6765 | long __sched io_schedule_timeout(long timeout) |
6672 | { | 6766 | { |
6673 | struct rq *rq = &__raw_get_cpu_var(runqueues); | 6767 | struct rq *rq = raw_rq(); |
6674 | long ret; | 6768 | long ret; |
6675 | 6769 | ||
6676 | delayacct_blkio_start(); | 6770 | delayacct_blkio_start(); |
6677 | atomic_inc(&rq->nr_iowait); | 6771 | atomic_inc(&rq->nr_iowait); |
6772 | current->in_iowait = 1; | ||
6678 | ret = schedule_timeout(timeout); | 6773 | ret = schedule_timeout(timeout); |
6774 | current->in_iowait = 0; | ||
6679 | atomic_dec(&rq->nr_iowait); | 6775 | atomic_dec(&rq->nr_iowait); |
6680 | delayacct_blkio_end(); | 6776 | delayacct_blkio_end(); |
6681 | return ret; | 6777 | return ret; |
@@ -6992,8 +7088,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
6992 | 7088 | ||
6993 | if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { | 7089 | if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { |
6994 | /* Need help from migration thread: drop lock and wait. */ | 7090 | /* Need help from migration thread: drop lock and wait. */ |
7091 | struct task_struct *mt = rq->migration_thread; | ||
7092 | |||
7093 | get_task_struct(mt); | ||
6995 | task_rq_unlock(rq, &flags); | 7094 | task_rq_unlock(rq, &flags); |
6996 | wake_up_process(rq->migration_thread); | 7095 | wake_up_process(rq->migration_thread); |
7096 | put_task_struct(mt); | ||
6997 | wait_for_completion(&req.done); | 7097 | wait_for_completion(&req.done); |
6998 | tlb_migrate_finish(p->mm); | 7098 | tlb_migrate_finish(p->mm); |
6999 | return 0; | 7099 | return 0; |
@@ -7625,7 +7725,7 @@ static int __init migration_init(void) | |||
7625 | migration_call(&migration_notifier, CPU_ONLINE, cpu); | 7725 | migration_call(&migration_notifier, CPU_ONLINE, cpu); |
7626 | register_cpu_notifier(&migration_notifier); | 7726 | register_cpu_notifier(&migration_notifier); |
7627 | 7727 | ||
7628 | return err; | 7728 | return 0; |
7629 | } | 7729 | } |
7630 | early_initcall(migration_init); | 7730 | early_initcall(migration_init); |
7631 | #endif | 7731 | #endif |
@@ -7841,7 +7941,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
7841 | rq->rd = rd; | 7941 | rq->rd = rd; |
7842 | 7942 | ||
7843 | cpumask_set_cpu(rq->cpu, rd->span); | 7943 | cpumask_set_cpu(rq->cpu, rd->span); |
7844 | if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) | 7944 | if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) |
7845 | set_rq_online(rq); | 7945 | set_rq_online(rq); |
7846 | 7946 | ||
7847 | spin_unlock_irqrestore(&rq->lock, flags); | 7947 | spin_unlock_irqrestore(&rq->lock, flags); |
@@ -8091,6 +8191,39 @@ struct static_sched_domain { | |||
8091 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); | 8191 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); |
8092 | }; | 8192 | }; |
8093 | 8193 | ||
8194 | struct s_data { | ||
8195 | #ifdef CONFIG_NUMA | ||
8196 | int sd_allnodes; | ||
8197 | cpumask_var_t domainspan; | ||
8198 | cpumask_var_t covered; | ||
8199 | cpumask_var_t notcovered; | ||
8200 | #endif | ||
8201 | cpumask_var_t nodemask; | ||
8202 | cpumask_var_t this_sibling_map; | ||
8203 | cpumask_var_t this_core_map; | ||
8204 | cpumask_var_t send_covered; | ||
8205 | cpumask_var_t tmpmask; | ||
8206 | struct sched_group **sched_group_nodes; | ||
8207 | struct root_domain *rd; | ||
8208 | }; | ||
8209 | |||
8210 | enum s_alloc { | ||
8211 | sa_sched_groups = 0, | ||
8212 | sa_rootdomain, | ||
8213 | sa_tmpmask, | ||
8214 | sa_send_covered, | ||
8215 | sa_this_core_map, | ||
8216 | sa_this_sibling_map, | ||
8217 | sa_nodemask, | ||
8218 | sa_sched_group_nodes, | ||
8219 | #ifdef CONFIG_NUMA | ||
8220 | sa_notcovered, | ||
8221 | sa_covered, | ||
8222 | sa_domainspan, | ||
8223 | #endif | ||
8224 | sa_none, | ||
8225 | }; | ||
8226 | |||
8094 | /* | 8227 | /* |
8095 | * SMT sched-domains: | 8228 | * SMT sched-domains: |
8096 | */ | 8229 | */ |
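The s_data/s_alloc pair introduced above turns the old ladder of goto labels in __build_sched_domains() into a single staged-allocation scheme: the allocator returns how far it got, and the matching cleanup switch falls through from that stage downwards, freeing everything allocated so far. The same idiom in a self-contained form:

#include <stdlib.h>

enum stage { st_all, st_b, st_a, st_none };     /* teardown order */

struct ctx { void *a, *b, *c; };

static void unwind(struct ctx *x, enum stage how_far)
{
        switch (how_far) {
        case st_all:  free(x->c);       /* fall through */
        case st_b:    free(x->b);       /* fall through */
        case st_a:    free(x->a);       /* fall through */
        case st_none: break;
        }
}

static enum stage setup(struct ctx *x)
{
        if (!(x->a = malloc(16)))
                return st_none;
        if (!(x->b = malloc(16)))
                return st_a;            /* only a needs freeing */
        if (!(x->c = malloc(16)))
                return st_b;            /* b and a need freeing */
        return st_all;
}

int main(void)
{
        struct ctx x = { 0 };
        enum stage s = setup(&x);

        /* ... use x only if s == st_all ... */
        unwind(&x, s);
        return s == st_all ? 0 : 1;
}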
@@ -8213,6 +8346,71 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) | |||
8213 | sg = sg->next; | 8346 | sg = sg->next; |
8214 | } while (sg != group_head); | 8347 | } while (sg != group_head); |
8215 | } | 8348 | } |
8349 | |||
8350 | static int build_numa_sched_groups(struct s_data *d, | ||
8351 | const struct cpumask *cpu_map, int num) | ||
8352 | { | ||
8353 | struct sched_domain *sd; | ||
8354 | struct sched_group *sg, *prev; | ||
8355 | int n, j; | ||
8356 | |||
8357 | cpumask_clear(d->covered); | ||
8358 | cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); | ||
8359 | if (cpumask_empty(d->nodemask)) { | ||
8360 | d->sched_group_nodes[num] = NULL; | ||
8361 | goto out; | ||
8362 | } | ||
8363 | |||
8364 | sched_domain_node_span(num, d->domainspan); | ||
8365 | cpumask_and(d->domainspan, d->domainspan, cpu_map); | ||
8366 | |||
8367 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
8368 | GFP_KERNEL, num); | ||
8369 | if (!sg) { | ||
8370 | printk(KERN_WARNING "Can not alloc domain group for node %d\n", | ||
8371 | num); | ||
8372 | return -ENOMEM; | ||
8373 | } | ||
8374 | d->sched_group_nodes[num] = sg; | ||
8375 | |||
8376 | for_each_cpu(j, d->nodemask) { | ||
8377 | sd = &per_cpu(node_domains, j).sd; | ||
8378 | sd->groups = sg; | ||
8379 | } | ||
8380 | |||
8381 | sg->__cpu_power = 0; | ||
8382 | cpumask_copy(sched_group_cpus(sg), d->nodemask); | ||
8383 | sg->next = sg; | ||
8384 | cpumask_or(d->covered, d->covered, d->nodemask); | ||
8385 | |||
8386 | prev = sg; | ||
8387 | for (j = 0; j < nr_node_ids; j++) { | ||
8388 | n = (num + j) % nr_node_ids; | ||
8389 | cpumask_complement(d->notcovered, d->covered); | ||
8390 | cpumask_and(d->tmpmask, d->notcovered, cpu_map); | ||
8391 | cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); | ||
8392 | if (cpumask_empty(d->tmpmask)) | ||
8393 | break; | ||
8394 | cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); | ||
8395 | if (cpumask_empty(d->tmpmask)) | ||
8396 | continue; | ||
8397 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
8398 | GFP_KERNEL, num); | ||
8399 | if (!sg) { | ||
8400 | printk(KERN_WARNING | ||
8401 | "Can not alloc domain group for node %d\n", j); | ||
8402 | return -ENOMEM; | ||
8403 | } | ||
8404 | sg->__cpu_power = 0; | ||
8405 | cpumask_copy(sched_group_cpus(sg), d->tmpmask); | ||
8406 | sg->next = prev->next; | ||
8407 | cpumask_or(d->covered, d->covered, d->tmpmask); | ||
8408 | prev->next = sg; | ||
8409 | prev = sg; | ||
8410 | } | ||
8411 | out: | ||
8412 | return 0; | ||
8413 | } | ||
8216 | #endif /* CONFIG_NUMA */ | 8414 | #endif /* CONFIG_NUMA */ |
8217 | 8415 | ||
8218 | #ifdef CONFIG_NUMA | 8416 | #ifdef CONFIG_NUMA |
@@ -8378,280 +8576,285 @@ static void set_domain_attribute(struct sched_domain *sd, | |||
8378 | } | 8576 | } |
8379 | } | 8577 | } |
8380 | 8578 | ||
8381 | /* | 8579 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, |
8382 | * Build sched domains for a given set of cpus and attach the sched domains | 8580 | const struct cpumask *cpu_map) |
8383 | * to the individual cpus | 8581 | { |
8384 | */ | 8582 | switch (what) { |
8385 | static int __build_sched_domains(const struct cpumask *cpu_map, | 8583 | case sa_sched_groups: |
8386 | struct sched_domain_attr *attr) | 8584 | free_sched_groups(cpu_map, d->tmpmask); /* fall through */ |
8387 | { | 8585 | d->sched_group_nodes = NULL; |
8388 | int i, err = -ENOMEM; | 8586 | case sa_rootdomain: |
8389 | struct root_domain *rd; | 8587 | free_rootdomain(d->rd); /* fall through */ |
8390 | cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, | 8588 | case sa_tmpmask: |
8391 | tmpmask; | 8589 | free_cpumask_var(d->tmpmask); /* fall through */ |
8590 | case sa_send_covered: | ||
8591 | free_cpumask_var(d->send_covered); /* fall through */ | ||
8592 | case sa_this_core_map: | ||
8593 | free_cpumask_var(d->this_core_map); /* fall through */ | ||
8594 | case sa_this_sibling_map: | ||
8595 | free_cpumask_var(d->this_sibling_map); /* fall through */ | ||
8596 | case sa_nodemask: | ||
8597 | free_cpumask_var(d->nodemask); /* fall through */ | ||
8598 | case sa_sched_group_nodes: | ||
8392 | #ifdef CONFIG_NUMA | 8599 | #ifdef CONFIG_NUMA |
8393 | cpumask_var_t domainspan, covered, notcovered; | 8600 | kfree(d->sched_group_nodes); /* fall through */ |
8394 | struct sched_group **sched_group_nodes = NULL; | 8601 | case sa_notcovered: |
8395 | int sd_allnodes = 0; | 8602 | free_cpumask_var(d->notcovered); /* fall through */ |
8396 | 8603 | case sa_covered: | |
8397 | if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) | 8604 | free_cpumask_var(d->covered); /* fall through */ |
8398 | goto out; | 8605 | case sa_domainspan: |
8399 | if (!alloc_cpumask_var(&covered, GFP_KERNEL)) | 8606 | free_cpumask_var(d->domainspan); /* fall through */ |
8400 | goto free_domainspan; | 8607 | #endif |
8401 | if (!alloc_cpumask_var(¬covered, GFP_KERNEL)) | 8608 | case sa_none: |
8402 | goto free_covered; | 8609 | break; |
8403 | #endif | 8610 | } |
8404 | 8611 | } | |
8405 | if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) | ||
8406 | goto free_notcovered; | ||
8407 | if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) | ||
8408 | goto free_nodemask; | ||
8409 | if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) | ||
8410 | goto free_this_sibling_map; | ||
8411 | if (!alloc_cpumask_var(&send_covered, GFP_KERNEL)) | ||
8412 | goto free_this_core_map; | ||
8413 | if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) | ||
8414 | goto free_send_covered; | ||
8415 | 8612 | ||
8613 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, | ||
8614 | const struct cpumask *cpu_map) | ||
8615 | { | ||
8416 | #ifdef CONFIG_NUMA | 8616 | #ifdef CONFIG_NUMA |
8417 | /* | 8617 | if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) |
8418 | * Allocate the per-node list of sched groups | 8618 | return sa_none; |
8419 | */ | 8619 | if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) |
8420 | sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), | 8620 | return sa_domainspan; |
8421 | GFP_KERNEL); | 8621 | if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) |
8422 | if (!sched_group_nodes) { | 8622 | return sa_covered; |
8623 | /* Allocate the per-node list of sched groups */ | ||
8624 | d->sched_group_nodes = kcalloc(nr_node_ids, | ||
8625 | sizeof(struct sched_group *), GFP_KERNEL); | ||
8626 | if (!d->sched_group_nodes) { | ||
8423 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 8627 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
8424 | goto free_tmpmask; | 8628 | return sa_notcovered; |
8425 | } | 8629 | } |
8426 | #endif | 8630 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; |
8427 | 8631 | #endif | |
8428 | rd = alloc_rootdomain(); | 8632 | if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) |
8429 | if (!rd) { | 8633 | return sa_sched_group_nodes; |
8634 | if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) | ||
8635 | return sa_nodemask; | ||
8636 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) | ||
8637 | return sa_this_sibling_map; | ||
8638 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | ||
8639 | return sa_this_core_map; | ||
8640 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) | ||
8641 | return sa_send_covered; | ||
8642 | d->rd = alloc_rootdomain(); | ||
8643 | if (!d->rd) { | ||
8430 | printk(KERN_WARNING "Cannot alloc root domain\n"); | 8644 | printk(KERN_WARNING "Cannot alloc root domain\n"); |
8431 | goto free_sched_groups; | 8645 | return sa_tmpmask; |
8432 | } | 8646 | } |
8647 | return sa_rootdomain; | ||
8648 | } | ||
8433 | 8649 | ||
8650 | static struct sched_domain *__build_numa_sched_domains(struct s_data *d, | ||
8651 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) | ||
8652 | { | ||
8653 | struct sched_domain *sd = NULL; | ||
8434 | #ifdef CONFIG_NUMA | 8654 | #ifdef CONFIG_NUMA |
8435 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; | 8655 | struct sched_domain *parent; |
8436 | #endif | ||
8437 | |||
8438 | /* | ||
8439 | * Set up domains for cpus specified by the cpu_map. | ||
8440 | */ | ||
8441 | for_each_cpu(i, cpu_map) { | ||
8442 | struct sched_domain *sd = NULL, *p; | ||
8443 | |||
8444 | cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); | ||
8445 | |||
8446 | #ifdef CONFIG_NUMA | ||
8447 | if (cpumask_weight(cpu_map) > | ||
8448 | SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { | ||
8449 | sd = &per_cpu(allnodes_domains, i).sd; | ||
8450 | SD_INIT(sd, ALLNODES); | ||
8451 | set_domain_attribute(sd, attr); | ||
8452 | cpumask_copy(sched_domain_span(sd), cpu_map); | ||
8453 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); | ||
8454 | p = sd; | ||
8455 | sd_allnodes = 1; | ||
8456 | } else | ||
8457 | p = NULL; | ||
8458 | 8656 | ||
8459 | sd = &per_cpu(node_domains, i).sd; | 8657 | d->sd_allnodes = 0; |
8460 | SD_INIT(sd, NODE); | 8658 | if (cpumask_weight(cpu_map) > |
8659 | SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { | ||
8660 | sd = &per_cpu(allnodes_domains, i).sd; | ||
8661 | SD_INIT(sd, ALLNODES); | ||
8461 | set_domain_attribute(sd, attr); | 8662 | set_domain_attribute(sd, attr); |
8462 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | 8663 | cpumask_copy(sched_domain_span(sd), cpu_map); |
8463 | sd->parent = p; | 8664 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); |
8464 | if (p) | 8665 | d->sd_allnodes = 1; |
8465 | p->child = sd; | 8666 | } |
8466 | cpumask_and(sched_domain_span(sd), | 8667 | parent = sd; |
8467 | sched_domain_span(sd), cpu_map); | 8668 | |
8669 | sd = &per_cpu(node_domains, i).sd; | ||
8670 | SD_INIT(sd, NODE); | ||
8671 | set_domain_attribute(sd, attr); | ||
8672 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | ||
8673 | sd->parent = parent; | ||
8674 | if (parent) | ||
8675 | parent->child = sd; | ||
8676 | cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); | ||
8468 | #endif | 8677 | #endif |
8678 | return sd; | ||
8679 | } | ||
8469 | 8680 | ||
8470 | p = sd; | 8681 | static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, |
8471 | sd = &per_cpu(phys_domains, i).sd; | 8682 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, |
8472 | SD_INIT(sd, CPU); | 8683 | struct sched_domain *parent, int i) |
8473 | set_domain_attribute(sd, attr); | 8684 | { |
8474 | cpumask_copy(sched_domain_span(sd), nodemask); | 8685 | struct sched_domain *sd; |
8475 | sd->parent = p; | 8686 | sd = &per_cpu(phys_domains, i).sd; |
8476 | if (p) | 8687 | SD_INIT(sd, CPU); |
8477 | p->child = sd; | 8688 | set_domain_attribute(sd, attr); |
8478 | cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); | 8689 | cpumask_copy(sched_domain_span(sd), d->nodemask); |
8690 | sd->parent = parent; | ||
8691 | if (parent) | ||
8692 | parent->child = sd; | ||
8693 | cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
8694 | return sd; | ||
8695 | } | ||
8479 | 8696 | ||
8697 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, | ||
8698 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
8699 | struct sched_domain *parent, int i) | ||
8700 | { | ||
8701 | struct sched_domain *sd = parent; | ||
8480 | #ifdef CONFIG_SCHED_MC | 8702 | #ifdef CONFIG_SCHED_MC |
8481 | p = sd; | 8703 | sd = &per_cpu(core_domains, i).sd; |
8482 | sd = &per_cpu(core_domains, i).sd; | 8704 | SD_INIT(sd, MC); |
8483 | SD_INIT(sd, MC); | 8705 | set_domain_attribute(sd, attr); |
8484 | set_domain_attribute(sd, attr); | 8706 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); |
8485 | cpumask_and(sched_domain_span(sd), cpu_map, | 8707 | sd->parent = parent; |
8486 | cpu_coregroup_mask(i)); | 8708 | parent->child = sd; |
8487 | sd->parent = p; | 8709 | cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); |
8488 | p->child = sd; | ||
8489 | cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); | ||
8490 | #endif | 8710 | #endif |
8711 | return sd; | ||
8712 | } | ||
8491 | 8713 | ||
8714 | static struct sched_domain *__build_smt_sched_domain(struct s_data *d, | ||
8715 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
8716 | struct sched_domain *parent, int i) | ||
8717 | { | ||
8718 | struct sched_domain *sd = parent; | ||
8492 | #ifdef CONFIG_SCHED_SMT | 8719 | #ifdef CONFIG_SCHED_SMT |
8493 | p = sd; | 8720 | sd = &per_cpu(cpu_domains, i).sd; |
8494 | sd = &per_cpu(cpu_domains, i).sd; | 8721 | SD_INIT(sd, SIBLING); |
8495 | SD_INIT(sd, SIBLING); | 8722 | set_domain_attribute(sd, attr); |
8496 | set_domain_attribute(sd, attr); | 8723 | cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); |
8497 | cpumask_and(sched_domain_span(sd), | 8724 | sd->parent = parent; |
8498 | topology_thread_cpumask(i), cpu_map); | 8725 | parent->child = sd; |
8499 | sd->parent = p; | 8726 | cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); |
8500 | p->child = sd; | ||
8501 | cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); | ||
8502 | #endif | 8727 | #endif |
8503 | } | 8728 | return sd; |
8729 | } | ||
8504 | 8730 | ||
8731 | static void build_sched_groups(struct s_data *d, enum sched_domain_level l, | ||
8732 | const struct cpumask *cpu_map, int cpu) | ||
8733 | { | ||
8734 | switch (l) { | ||
8505 | #ifdef CONFIG_SCHED_SMT | 8735 | #ifdef CONFIG_SCHED_SMT |
8506 | /* Set up CPU (sibling) groups */ | 8736 | case SD_LV_SIBLING: /* set up CPU (sibling) groups */ |
8507 | for_each_cpu(i, cpu_map) { | 8737 | cpumask_and(d->this_sibling_map, cpu_map, |
8508 | cpumask_and(this_sibling_map, | 8738 | topology_thread_cpumask(cpu)); |
8509 | topology_thread_cpumask(i), cpu_map); | 8739 | if (cpu == cpumask_first(d->this_sibling_map)) |
8510 | if (i != cpumask_first(this_sibling_map)) | 8740 | init_sched_build_groups(d->this_sibling_map, cpu_map, |
8511 | continue; | 8741 | &cpu_to_cpu_group, |
8512 | 8742 | d->send_covered, d->tmpmask); | |
8513 | init_sched_build_groups(this_sibling_map, cpu_map, | 8743 | break; |
8514 | &cpu_to_cpu_group, | ||
8515 | send_covered, tmpmask); | ||
8516 | } | ||
8517 | #endif | 8744 | #endif |
8518 | |||
8519 | #ifdef CONFIG_SCHED_MC | 8745 | #ifdef CONFIG_SCHED_MC |
8520 | /* Set up multi-core groups */ | 8746 | case SD_LV_MC: /* set up multi-core groups */ |
8521 | for_each_cpu(i, cpu_map) { | 8747 | cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); |
8522 | cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); | 8748 | if (cpu == cpumask_first(d->this_core_map)) |
8523 | if (i != cpumask_first(this_core_map)) | 8749 | init_sched_build_groups(d->this_core_map, cpu_map, |
8524 | continue; | 8750 | &cpu_to_core_group, |
8525 | 8751 | d->send_covered, d->tmpmask); | |
8526 | init_sched_build_groups(this_core_map, cpu_map, | 8752 | break; |
8527 | &cpu_to_core_group, | ||
8528 | send_covered, tmpmask); | ||
8529 | } | ||
8530 | #endif | 8753 | #endif |
8531 | 8754 | case SD_LV_CPU: /* set up physical groups */ | |
8532 | /* Set up physical groups */ | 8755 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); |
8533 | for (i = 0; i < nr_node_ids; i++) { | 8756 | if (!cpumask_empty(d->nodemask)) |
8534 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 8757 | init_sched_build_groups(d->nodemask, cpu_map, |
8535 | if (cpumask_empty(nodemask)) | 8758 | &cpu_to_phys_group, |
8536 | continue; | 8759 | d->send_covered, d->tmpmask); |
8537 | 8760 | break; | |
8538 | init_sched_build_groups(nodemask, cpu_map, | ||
8539 | &cpu_to_phys_group, | ||
8540 | send_covered, tmpmask); | ||
8541 | } | ||
8542 | |||
8543 | #ifdef CONFIG_NUMA | 8761 | #ifdef CONFIG_NUMA |
8544 | /* Set up node groups */ | 8762 | case SD_LV_ALLNODES: |
8545 | if (sd_allnodes) { | 8763 | init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, |
8546 | init_sched_build_groups(cpu_map, cpu_map, | 8764 | d->send_covered, d->tmpmask); |
8547 | &cpu_to_allnodes_group, | 8765 | break; |
8548 | send_covered, tmpmask); | 8766 | #endif |
8767 | default: | ||
8768 | break; | ||
8549 | } | 8769 | } |
8770 | } | ||
8550 | 8771 | ||
8551 | for (i = 0; i < nr_node_ids; i++) { | 8772 | /* |
8552 | /* Set up node groups */ | 8773 | * Build sched domains for a given set of cpus and attach the sched domains |
8553 | struct sched_group *sg, *prev; | 8774 | * to the individual cpus |
8554 | int j; | 8775 | */ |
8555 | 8776 | static int __build_sched_domains(const struct cpumask *cpu_map, | |
8556 | cpumask_clear(covered); | 8777 | struct sched_domain_attr *attr) |
8557 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 8778 | { |
8558 | if (cpumask_empty(nodemask)) { | 8779 | enum s_alloc alloc_state = sa_none; |
8559 | sched_group_nodes[i] = NULL; | 8780 | struct s_data d; |
8560 | continue; | 8781 | struct sched_domain *sd; |
8561 | } | 8782 | int i; |
8783 | #ifdef CONFIG_NUMA | ||
8784 | d.sd_allnodes = 0; | ||
8785 | #endif | ||
8562 | 8786 | ||
8563 | sched_domain_node_span(i, domainspan); | 8787 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); |
8564 | cpumask_and(domainspan, domainspan, cpu_map); | 8788 | if (alloc_state != sa_rootdomain) |
8789 | goto error; | ||
8790 | alloc_state = sa_sched_groups; | ||
8565 | 8791 | ||
8566 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | 8792 | /* |
8567 | GFP_KERNEL, i); | 8793 | * Set up domains for cpus specified by the cpu_map. |
8568 | if (!sg) { | 8794 | */ |
8569 | printk(KERN_WARNING "Can not alloc domain group for " | 8795 | for_each_cpu(i, cpu_map) { |
8570 | "node %d\n", i); | 8796 | cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), |
8571 | goto error; | 8797 | cpu_map); |
8572 | } | ||
8573 | sched_group_nodes[i] = sg; | ||
8574 | for_each_cpu(j, nodemask) { | ||
8575 | struct sched_domain *sd; | ||
8576 | 8798 | ||
8577 | sd = &per_cpu(node_domains, j).sd; | 8799 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); |
8578 | sd->groups = sg; | 8800 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); |
8579 | } | 8801 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); |
8580 | sg->__cpu_power = 0; | 8802 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); |
8581 | cpumask_copy(sched_group_cpus(sg), nodemask); | 8803 | } |
8582 | sg->next = sg; | ||
8583 | cpumask_or(covered, covered, nodemask); | ||
8584 | prev = sg; | ||
8585 | 8804 | ||
8586 | for (j = 0; j < nr_node_ids; j++) { | 8805 | for_each_cpu(i, cpu_map) { |
8587 | int n = (i + j) % nr_node_ids; | 8806 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); |
8807 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); | ||
8808 | } | ||
8588 | 8809 | ||
8589 | cpumask_complement(notcovered, covered); | 8810 | /* Set up physical groups */ |
8590 | cpumask_and(tmpmask, notcovered, cpu_map); | 8811 | for (i = 0; i < nr_node_ids; i++) |
8591 | cpumask_and(tmpmask, tmpmask, domainspan); | 8812 | build_sched_groups(&d, SD_LV_CPU, cpu_map, i); |
8592 | if (cpumask_empty(tmpmask)) | ||
8593 | break; | ||
8594 | 8813 | ||
8595 | cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); | 8814 | #ifdef CONFIG_NUMA |
8596 | if (cpumask_empty(tmpmask)) | 8815 | /* Set up node groups */ |
8597 | continue; | 8816 | if (d.sd_allnodes) |
8817 | build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); | ||
8598 | 8818 | ||
8599 | sg = kmalloc_node(sizeof(struct sched_group) + | 8819 | for (i = 0; i < nr_node_ids; i++) |
8600 | cpumask_size(), | 8820 | if (build_numa_sched_groups(&d, cpu_map, i)) |
8601 | GFP_KERNEL, i); | 8821 | goto error; |
8602 | if (!sg) { | ||
8603 | printk(KERN_WARNING | ||
8604 | "Can not alloc domain group for node %d\n", j); | ||
8605 | goto error; | ||
8606 | } | ||
8607 | sg->__cpu_power = 0; | ||
8608 | cpumask_copy(sched_group_cpus(sg), tmpmask); | ||
8609 | sg->next = prev->next; | ||
8610 | cpumask_or(covered, covered, tmpmask); | ||
8611 | prev->next = sg; | ||
8612 | prev = sg; | ||
8613 | } | ||
8614 | } | ||
8615 | #endif | 8822 | #endif |
8616 | 8823 | ||
8617 | /* Calculate CPU power for physical packages and nodes */ | 8824 | /* Calculate CPU power for physical packages and nodes */ |
8618 | #ifdef CONFIG_SCHED_SMT | 8825 | #ifdef CONFIG_SCHED_SMT |
8619 | for_each_cpu(i, cpu_map) { | 8826 | for_each_cpu(i, cpu_map) { |
8620 | struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; | 8827 | sd = &per_cpu(cpu_domains, i).sd; |
8621 | |||
8622 | init_sched_groups_power(i, sd); | 8828 | init_sched_groups_power(i, sd); |
8623 | } | 8829 | } |
8624 | #endif | 8830 | #endif |
8625 | #ifdef CONFIG_SCHED_MC | 8831 | #ifdef CONFIG_SCHED_MC |
8626 | for_each_cpu(i, cpu_map) { | 8832 | for_each_cpu(i, cpu_map) { |
8627 | struct sched_domain *sd = &per_cpu(core_domains, i).sd; | 8833 | sd = &per_cpu(core_domains, i).sd; |
8628 | |||
8629 | init_sched_groups_power(i, sd); | 8834 | init_sched_groups_power(i, sd); |
8630 | } | 8835 | } |
8631 | #endif | 8836 | #endif |
8632 | 8837 | ||
8633 | for_each_cpu(i, cpu_map) { | 8838 | for_each_cpu(i, cpu_map) { |
8634 | struct sched_domain *sd = &per_cpu(phys_domains, i).sd; | 8839 | sd = &per_cpu(phys_domains, i).sd; |
8635 | |||
8636 | init_sched_groups_power(i, sd); | 8840 | init_sched_groups_power(i, sd); |
8637 | } | 8841 | } |
8638 | 8842 | ||
8639 | #ifdef CONFIG_NUMA | 8843 | #ifdef CONFIG_NUMA |
8640 | for (i = 0; i < nr_node_ids; i++) | 8844 | for (i = 0; i < nr_node_ids; i++) |
8641 | init_numa_sched_groups_power(sched_group_nodes[i]); | 8845 | init_numa_sched_groups_power(d.sched_group_nodes[i]); |
8642 | 8846 | ||
8643 | if (sd_allnodes) { | 8847 | if (d.sd_allnodes) { |
8644 | struct sched_group *sg; | 8848 | struct sched_group *sg; |
8645 | 8849 | ||
8646 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, | 8850 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, |
8647 | tmpmask); | 8851 | d.tmpmask); |
8648 | init_numa_sched_groups_power(sg); | 8852 | init_numa_sched_groups_power(sg); |
8649 | } | 8853 | } |
8650 | #endif | 8854 | #endif |
8651 | 8855 | ||
8652 | /* Attach the domains */ | 8856 | /* Attach the domains */ |
8653 | for_each_cpu(i, cpu_map) { | 8857 | for_each_cpu(i, cpu_map) { |
8654 | struct sched_domain *sd; | ||
8655 | #ifdef CONFIG_SCHED_SMT | 8858 | #ifdef CONFIG_SCHED_SMT |
8656 | sd = &per_cpu(cpu_domains, i).sd; | 8859 | sd = &per_cpu(cpu_domains, i).sd; |
8657 | #elif defined(CONFIG_SCHED_MC) | 8860 | #elif defined(CONFIG_SCHED_MC) |
@@ -8659,44 +8862,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
8659 | #else | 8862 | #else |
8660 | sd = &per_cpu(phys_domains, i).sd; | 8863 | sd = &per_cpu(phys_domains, i).sd; |
8661 | #endif | 8864 | #endif |
8662 | cpu_attach_domain(sd, rd, i); | 8865 | cpu_attach_domain(sd, d.rd, i); |
8663 | } | 8866 | } |
8664 | 8867 | ||
8665 | err = 0; | 8868 | d.sched_group_nodes = NULL; /* don't free this we still need it */ |
8666 | 8869 | __free_domain_allocs(&d, sa_tmpmask, cpu_map); | |
8667 | free_tmpmask: | 8870 | return 0; |
8668 | free_cpumask_var(tmpmask); | ||
8669 | free_send_covered: | ||
8670 | free_cpumask_var(send_covered); | ||
8671 | free_this_core_map: | ||
8672 | free_cpumask_var(this_core_map); | ||
8673 | free_this_sibling_map: | ||
8674 | free_cpumask_var(this_sibling_map); | ||
8675 | free_nodemask: | ||
8676 | free_cpumask_var(nodemask); | ||
8677 | free_notcovered: | ||
8678 | #ifdef CONFIG_NUMA | ||
8679 | free_cpumask_var(notcovered); | ||
8680 | free_covered: | ||
8681 | free_cpumask_var(covered); | ||
8682 | free_domainspan: | ||
8683 | free_cpumask_var(domainspan); | ||
8684 | out: | ||
8685 | #endif | ||
8686 | return err; | ||
8687 | |||
8688 | free_sched_groups: | ||
8689 | #ifdef CONFIG_NUMA | ||
8690 | kfree(sched_group_nodes); | ||
8691 | #endif | ||
8692 | goto free_tmpmask; | ||
8693 | 8871 | ||
8694 | #ifdef CONFIG_NUMA | ||
8695 | error: | 8872 | error: |
8696 | free_sched_groups(cpu_map, tmpmask); | 8873 | __free_domain_allocs(&d, alloc_state, cpu_map); |
8697 | free_rootdomain(rd); | 8874 | return -ENOMEM; |
8698 | goto free_tmpmask; | ||
8699 | #endif | ||
8700 | } | 8875 | } |
8701 | 8876 | ||
8702 | static int build_sched_domains(const struct cpumask *cpu_map) | 8877 | static int build_sched_domains(const struct cpumask *cpu_map) |
@@ -9304,11 +9479,11 @@ void __init sched_init(void) | |||
9304 | * system cpu resource, based on the weight assigned to root | 9479 | * system cpu resource, based on the weight assigned to root |
9305 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished | 9480 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished |
9306 | * by letting tasks of init_task_group sit in a separate cfs_rq | 9481 | * by letting tasks of init_task_group sit in a separate cfs_rq |
9307 | * (init_cfs_rq) and having one entity represent this group of | 9482 | * (init_tg_cfs_rq) and having one entity represent this group of |
9308 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). | 9483 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). |
9309 | */ | 9484 | */ |
9310 | init_tg_cfs_entry(&init_task_group, | 9485 | init_tg_cfs_entry(&init_task_group, |
9311 | &per_cpu(init_cfs_rq, i), | 9486 | &per_cpu(init_tg_cfs_rq, i), |
9312 | &per_cpu(init_sched_entity, i), i, 1, | 9487 | &per_cpu(init_sched_entity, i), i, 1, |
9313 | root_task_group.se[i]); | 9488 | root_task_group.se[i]); |
9314 | 9489 | ||
@@ -9334,6 +9509,7 @@ void __init sched_init(void) | |||
9334 | #ifdef CONFIG_SMP | 9509 | #ifdef CONFIG_SMP |
9335 | rq->sd = NULL; | 9510 | rq->sd = NULL; |
9336 | rq->rd = NULL; | 9511 | rq->rd = NULL; |
9512 | rq->post_schedule = 0; | ||
9337 | rq->active_balance = 0; | 9513 | rq->active_balance = 0; |
9338 | rq->next_balance = jiffies; | 9514 | rq->next_balance = jiffies; |
9339 | rq->push_cpu = 0; | 9515 | rq->push_cpu = 0; |
@@ -9398,13 +9574,20 @@ void __init sched_init(void) | |||
9398 | } | 9574 | } |
9399 | 9575 | ||
9400 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 9576 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
9401 | void __might_sleep(char *file, int line) | 9577 | static inline int preempt_count_equals(int preempt_offset) |
9578 | { | ||
9579 | int nested = preempt_count() & ~PREEMPT_ACTIVE; | ||
9580 | |||
9581 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); | ||
9582 | } | ||
9583 | |||
9584 | void __might_sleep(char *file, int line, int preempt_offset) | ||
9402 | { | 9585 | { |
9403 | #ifdef in_atomic | 9586 | #ifdef in_atomic |
9404 | static unsigned long prev_jiffy; /* ratelimiting */ | 9587 | static unsigned long prev_jiffy; /* ratelimiting */ |
9405 | 9588 | ||
9406 | if ((!in_atomic() && !irqs_disabled()) || | 9589 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || |
9407 | system_state != SYSTEM_RUNNING || oops_in_progress) | 9590 | system_state != SYSTEM_RUNNING || oops_in_progress) |
9408 | return; | 9591 | return; |
9409 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) | 9592 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) |
9410 | return; | 9593 | return; |
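preempt_count_equals() lets a caller declare how much preempt count it expects to be holding, so paths like cond_resched_lock(), which runs with one spinlock held, no longer trip the sleeping-while-atomic check that a bare might_sleep() would. An illustration, assuming CONFIG_PREEMPT (a held spinlock adds one to preempt_count()) and a zero PREEMPT_INATOMIC_BASE:

static DEFINE_SPINLOCK(demo_lock);

static void demo(void)
{
        spin_lock(&demo_lock);          /* preempt_count() == 1 */
        cond_resched_lock(&demo_lock);  /* expected offset 1: no warning */
        /* might_sleep();                  expected offset 0: would warn here */
        spin_unlock(&demo_lock);
}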