path: root/kernel/sched.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	823
1 file changed, 503 insertions(+), 320 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 1b59e265273..da1edc8277d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -309,8 +309,8 @@ void set_tg_uid(struct user_struct *user)
 
 /*
  * Root task group.
  * Every UID task group (including init_task_group aka UID-0) will
  * be a child to this group.
  */
 struct task_group root_task_group;
 
@@ -318,7 +318,7 @@ struct task_group root_task_group;
 /* Default task group's sched entity on each cpu */
 static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
 /* Default task group's cfs_rq on each cpu */
-static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp;
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -616,6 +616,7 @@ struct rq {
 
 	unsigned char idle_at_tick;
 	/* For active balancing */
+	int post_schedule;
 	int active_balance;
 	int push_cpu;
 	/* cpu of this runqueue: */
@@ -693,6 +694,7 @@ static inline int cpu_of(struct rq *rq)
 #define this_rq()		(&__get_cpu_var(runqueues))
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
+#define raw_rq()		(&__raw_get_cpu_var(runqueues))
 
 inline void update_rq_clock(struct rq *rq)
 {
@@ -1513,28 +1515,35 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
+struct update_shares_data {
+	unsigned long rq_weight[NR_CPUS];
+};
+
+static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
+
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
 /*
  * Calculate and set the cpu's group shares.
  */
-static void
-update_group_shares_cpu(struct task_group *tg, int cpu,
-			unsigned long sd_shares, unsigned long sd_rq_weight)
+static void update_group_shares_cpu(struct task_group *tg, int cpu,
+				    unsigned long sd_shares,
+				    unsigned long sd_rq_weight,
+				    struct update_shares_data *usd)
 {
-	unsigned long shares;
-	unsigned long rq_weight;
-
-	if (!tg->se[cpu])
-		return;
+	unsigned long shares, rq_weight;
+	int boost = 0;
 
-	rq_weight = tg->cfs_rq[cpu]->rq_weight;
+	rq_weight = usd->rq_weight[cpu];
+	if (!rq_weight) {
+		boost = 1;
+		rq_weight = NICE_0_LOAD;
+	}
 
 	/*
-	 *           \Sum shares * rq_weight
-	 * shares =  -----------------------
-	 *               \Sum rq_weight
-	 *
+	 *             \Sum_j shares_j * rq_weight_i
+	 * shares_i =  -----------------------------
+	 *                  \Sum_j rq_weight_j
 	 */
 	shares = (sd_shares * rq_weight) / sd_rq_weight;
 	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
@@ -1545,8 +1554,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
 		unsigned long flags;
 
 		spin_lock_irqsave(&rq->lock, flags);
-		tg->cfs_rq[cpu]->shares = shares;
-
+		tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
+		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
 		__set_se_shares(tg->se[cpu], shares);
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
@@ -1559,22 +1568,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-	unsigned long weight, rq_weight = 0;
-	unsigned long shares = 0;
+	unsigned long weight, rq_weight = 0, shares = 0;
+	struct update_shares_data *usd;
 	struct sched_domain *sd = data;
+	unsigned long flags;
 	int i;
 
+	if (!tg->se[0])
+		return 0;
+
+	local_irq_save(flags);
+	usd = &__get_cpu_var(update_shares_data);
+
 	for_each_cpu(i, sched_domain_span(sd)) {
+		weight = tg->cfs_rq[i]->load.weight;
+		usd->rq_weight[i] = weight;
+
 		/*
 		 * If there are currently no tasks on the cpu pretend there
 		 * is one of average load so that when a new task gets to
 		 * run here it will not get delayed by group starvation.
 		 */
-		weight = tg->cfs_rq[i]->load.weight;
 		if (!weight)
 			weight = NICE_0_LOAD;
 
-		tg->cfs_rq[i]->rq_weight = weight;
 		rq_weight += weight;
 		shares += tg->cfs_rq[i]->shares;
 	}
@@ -1586,7 +1603,9 @@ static int tg_shares_up(struct task_group *tg, void *data)
 		shares = tg->shares;
 
 	for_each_cpu(i, sched_domain_span(sd))
-		update_group_shares_cpu(tg, i, shares, rq_weight);
+		update_group_shares_cpu(tg, i, shares, rq_weight, usd);
+
+	local_irq_restore(flags);
 
 	return 0;
 }
@@ -1616,8 +1635,14 @@ static int tg_load_down(struct task_group *tg, void *data)
 
 static void update_shares(struct sched_domain *sd)
 {
-	u64 now = cpu_clock(raw_smp_processor_id());
-	s64 elapsed = now - sd->last_update;
+	s64 elapsed;
+	u64 now;
+
+	if (root_task_group_empty())
+		return;
+
+	now = cpu_clock(raw_smp_processor_id());
+	elapsed = now - sd->last_update;
 
 	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
 		sd->last_update = now;
@@ -1627,6 +1652,9 @@ static void update_shares(struct sched_domain *sd)
 
 static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 {
+	if (root_task_group_empty())
+		return;
+
 	spin_unlock(&rq->lock);
 	update_shares(sd);
 	spin_lock(&rq->lock);
@@ -1634,6 +1662,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 static void update_h_load(long cpu)
 {
+	if (root_task_group_empty())
+		return;
+
 	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 }
 
@@ -2637,9 +2668,32 @@ void sched_fork(struct task_struct *p, int clone_flags)
 	set_task_cpu(p, cpu);
 
 	/*
-	 * Make sure we do not leak PI boosting priority to the child:
+	 * Make sure we do not leak PI boosting priority to the child.
 	 */
 	p->prio = current->normal_prio;
+
+	/*
+	 * Revert to default priority/policy on fork if requested.
+	 */
+	if (unlikely(p->sched_reset_on_fork)) {
+		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR)
+			p->policy = SCHED_NORMAL;
+
+		if (p->normal_prio < DEFAULT_PRIO)
+			p->prio = DEFAULT_PRIO;
+
+		if (PRIO_TO_NICE(p->static_prio) < 0) {
+			p->static_prio = NICE_TO_PRIO(0);
+			set_load_weight(p);
+		}
+
+		/*
+		 * We don't need the reset flag anymore after the fork. It has
+		 * fulfilled its duty:
+		 */
+		p->sched_reset_on_fork = 0;
+	}
+
 	if (!rt_prio(p->prio))
 		p->sched_class = &fair_sched_class;
 
@@ -2796,12 +2850,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 {
 	struct mm_struct *mm = rq->prev_mm;
 	long prev_state;
-#ifdef CONFIG_SMP
-	int post_schedule = 0;
-
-	if (current->sched_class->needs_post_schedule)
-		post_schedule = current->sched_class->needs_post_schedule(rq);
-#endif
 
 	rq->prev_mm = NULL;
 
@@ -2820,10 +2868,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	finish_arch_switch(prev);
 	perf_counter_task_sched_in(current, cpu_of(rq));
 	finish_lock_switch(rq, prev);
-#ifdef CONFIG_SMP
-	if (post_schedule)
-		current->sched_class->post_schedule(rq);
-#endif
 
 	fire_sched_in_preempt_notifiers(current);
 	if (mm)
@@ -2838,6 +2882,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	}
 }
 
+#ifdef CONFIG_SMP
+
+/* assumes rq->lock is held */
+static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
+{
+	if (prev->sched_class->pre_schedule)
+		prev->sched_class->pre_schedule(rq, prev);
+}
+
+/* rq->lock is NOT held, but preemption is disabled */
+static inline void post_schedule(struct rq *rq)
+{
+	if (rq->post_schedule) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&rq->lock, flags);
+		if (rq->curr->sched_class->post_schedule)
+			rq->curr->sched_class->post_schedule(rq);
+		spin_unlock_irqrestore(&rq->lock, flags);
+
+		rq->post_schedule = 0;
+	}
+}
+
+#else
+
+static inline void pre_schedule(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void post_schedule(struct rq *rq)
+{
+}
+
+#endif
+
 /**
  * schedule_tail - first thing a freshly forked thread must call.
  * @prev: the thread we just switched away from.
@@ -2848,6 +2928,13 @@ asmlinkage void schedule_tail(struct task_struct *prev)
 	struct rq *rq = this_rq();
 
 	finish_task_switch(rq, prev);
+
+	/*
+	 * FIXME: do we need to worry about rq being invalidated by the
+	 * task_switch?
+	 */
+	post_schedule(rq);
+
 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
 	/* In this case, finish_task_switch does not reenable preemption */
 	preempt_enable();
@@ -3379,9 +3466,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 {
 	const struct sched_class *class;
 
-	for (class = sched_class_highest; class; class = class->next)
+	for_each_class(class) {
 		if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
 			return 1;
+	}
 
 	return 0;
 }
@@ -5349,10 +5437,7 @@ need_resched_nonpreemptible:
 		switch_count = &prev->nvcsw;
 	}
 
-#ifdef CONFIG_SMP
-	if (prev->sched_class->pre_schedule)
-		prev->sched_class->pre_schedule(rq, prev);
-#endif
+	pre_schedule(rq, prev);
 
 	if (unlikely(!rq->nr_running))
 		idle_balance(cpu, rq);
@@ -5378,6 +5463,8 @@ need_resched_nonpreemptible:
 	} else
 		spin_unlock_irq(&rq->lock);
 
+	post_schedule(rq);
+
 	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
 
@@ -6123,17 +6210,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
 	unsigned long flags;
 	const struct sched_class *prev_class = p->sched_class;
 	struct rq *rq;
+	int reset_on_fork;
 
 	/* may grab non-irq protected spin_locks */
 	BUG_ON(in_interrupt());
 recheck:
 	/* double check policy once rq lock held */
-	if (policy < 0)
+	if (policy < 0) {
+		reset_on_fork = p->sched_reset_on_fork;
 		policy = oldpolicy = p->policy;
-	else if (policy != SCHED_FIFO && policy != SCHED_RR &&
-			policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-			policy != SCHED_IDLE)
-		return -EINVAL;
+	} else {
+		reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
+		policy &= ~SCHED_RESET_ON_FORK;
+
+		if (policy != SCHED_FIFO && policy != SCHED_RR &&
+				policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+				policy != SCHED_IDLE)
+			return -EINVAL;
+	}
+
 	/*
 	 * Valid priorities for SCHED_FIFO and SCHED_RR are
 	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
@@ -6177,6 +6272,10 @@ recheck:
 		/* can't change other user's priorities */
 		if (!check_same_owner(p))
 			return -EPERM;
+
+		/* Normal users shall not reset the sched_reset_on_fork flag */
+		if (p->sched_reset_on_fork && !reset_on_fork)
+			return -EPERM;
 	}
 
 	if (user) {
@@ -6220,6 +6319,8 @@ recheck:
 	if (running)
 		p->sched_class->put_prev_task(rq, p);
 
+	p->sched_reset_on_fork = reset_on_fork;
+
 	oldprio = p->prio;
 	__setscheduler(rq, p, policy, param->sched_priority);
 
@@ -6336,14 +6437,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
 	if (p) {
 		retval = security_task_getscheduler(p);
 		if (!retval)
-			retval = p->policy;
+			retval = p->policy
+				| (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
 	}
 	read_unlock(&tasklist_lock);
 	return retval;
 }
 
 /**
- * sys_sched_getscheduler - get the RT priority of a thread
+ * sys_sched_getparam - get the RT priority of a thread
  * @pid: the pid in question.
  * @param: structure containing the RT priority.
  */
@@ -6571,19 +6673,9 @@ static inline int should_resched(void)
 
 static void __cond_resched(void)
 {
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-	__might_sleep(__FILE__, __LINE__);
-#endif
-	/*
-	 * The BKS might be reacquired before we have dropped
-	 * PREEMPT_ACTIVE, which could trigger a second
-	 * cond_resched() call.
-	 */
-	do {
-		add_preempt_count(PREEMPT_ACTIVE);
-		schedule();
-		sub_preempt_count(PREEMPT_ACTIVE);
-	} while (need_resched());
+	add_preempt_count(PREEMPT_ACTIVE);
+	schedule();
+	sub_preempt_count(PREEMPT_ACTIVE);
 }
 
 int __sched _cond_resched(void)
@@ -6597,14 +6689,14 @@ int __sched _cond_resched(void)
 EXPORT_SYMBOL(_cond_resched);
 
 /*
- * cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
  * call schedule, and on return reacquire the lock.
  *
  * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
 * operations here to prevent schedule() from being called twice (once via
 * spin_unlock(), once by hand).
 */
-int cond_resched_lock(spinlock_t *lock)
+int __cond_resched_lock(spinlock_t *lock)
 {
 	int resched = should_resched();
 	int ret = 0;
@@ -6620,9 +6712,9 @@ int cond_resched_lock(spinlock_t *lock)
 	}
 	return ret;
 }
-EXPORT_SYMBOL(cond_resched_lock);
+EXPORT_SYMBOL(__cond_resched_lock);
 
-int __sched cond_resched_softirq(void)
+int __sched __cond_resched_softirq(void)
 {
 	BUG_ON(!in_softirq());
 
@@ -6634,7 +6726,7 @@ int __sched cond_resched_softirq(void)
 	}
 	return 0;
 }
-EXPORT_SYMBOL(cond_resched_softirq);
+EXPORT_SYMBOL(__cond_resched_softirq);
 
 /**
  * yield - yield the current processor to other threads.
@@ -6658,11 +6750,13 @@ EXPORT_SYMBOL(yield);
  */
 void __sched io_schedule(void)
 {
-	struct rq *rq = &__raw_get_cpu_var(runqueues);
+	struct rq *rq = raw_rq();
 
 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
+	current->in_iowait = 1;
 	schedule();
+	current->in_iowait = 0;
 	atomic_dec(&rq->nr_iowait);
 	delayacct_blkio_end();
 }
@@ -6670,12 +6764,14 @@ EXPORT_SYMBOL(io_schedule);
 
 long __sched io_schedule_timeout(long timeout)
 {
-	struct rq *rq = &__raw_get_cpu_var(runqueues);
+	struct rq *rq = raw_rq();
 	long ret;
 
 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
+	current->in_iowait = 1;
 	ret = schedule_timeout(timeout);
+	current->in_iowait = 0;
 	atomic_dec(&rq->nr_iowait);
 	delayacct_blkio_end();
 	return ret;
@@ -6992,8 +7088,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 
 	if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
 		/* Need help from migration thread: drop lock and wait. */
+		struct task_struct *mt = rq->migration_thread;
+
+		get_task_struct(mt);
 		task_rq_unlock(rq, &flags);
 		wake_up_process(rq->migration_thread);
+		put_task_struct(mt);
 		wait_for_completion(&req.done);
 		tlb_migrate_finish(p->mm);
 		return 0;
@@ -7625,7 +7725,7 @@ static int __init migration_init(void)
 	migration_call(&migration_notifier, CPU_ONLINE, cpu);
 	register_cpu_notifier(&migration_notifier);
 
-	return err;
+	return 0;
 }
 early_initcall(migration_init);
 #endif
@@ -7841,7 +7941,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 	rq->rd = rd;
 
 	cpumask_set_cpu(rq->cpu, rd->span);
-	if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
+	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
 		set_rq_online(rq);
 
 	spin_unlock_irqrestore(&rq->lock, flags);
@@ -8091,6 +8191,39 @@ struct static_sched_domain {
 	DECLARE_BITMAP(span, CONFIG_NR_CPUS);
 };
 
+struct s_data {
+#ifdef CONFIG_NUMA
+	int sd_allnodes;
+	cpumask_var_t domainspan;
+	cpumask_var_t covered;
+	cpumask_var_t notcovered;
+#endif
+	cpumask_var_t nodemask;
+	cpumask_var_t this_sibling_map;
+	cpumask_var_t this_core_map;
+	cpumask_var_t send_covered;
+	cpumask_var_t tmpmask;
+	struct sched_group **sched_group_nodes;
+	struct root_domain *rd;
+};
+
+enum s_alloc {
+	sa_sched_groups = 0,
+	sa_rootdomain,
+	sa_tmpmask,
+	sa_send_covered,
+	sa_this_core_map,
+	sa_this_sibling_map,
+	sa_nodemask,
+	sa_sched_group_nodes,
+#ifdef CONFIG_NUMA
+	sa_notcovered,
+	sa_covered,
+	sa_domainspan,
+#endif
+	sa_none,
+};
+
 /*
  * SMT sched-domains:
  */
@@ -8213,6 +8346,71 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 		sg = sg->next;
 	} while (sg != group_head);
 }
+
+static int build_numa_sched_groups(struct s_data *d,
+				   const struct cpumask *cpu_map, int num)
+{
+	struct sched_domain *sd;
+	struct sched_group *sg, *prev;
+	int n, j;
+
+	cpumask_clear(d->covered);
+	cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
+	if (cpumask_empty(d->nodemask)) {
+		d->sched_group_nodes[num] = NULL;
+		goto out;
+	}
+
+	sched_domain_node_span(num, d->domainspan);
+	cpumask_and(d->domainspan, d->domainspan, cpu_map);
+
+	sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
+			  GFP_KERNEL, num);
+	if (!sg) {
+		printk(KERN_WARNING "Can not alloc domain group for node %d\n",
+		       num);
+		return -ENOMEM;
+	}
+	d->sched_group_nodes[num] = sg;
+
+	for_each_cpu(j, d->nodemask) {
+		sd = &per_cpu(node_domains, j).sd;
+		sd->groups = sg;
+	}
+
+	sg->__cpu_power = 0;
+	cpumask_copy(sched_group_cpus(sg), d->nodemask);
+	sg->next = sg;
+	cpumask_or(d->covered, d->covered, d->nodemask);
+
+	prev = sg;
+	for (j = 0; j < nr_node_ids; j++) {
+		n = (num + j) % nr_node_ids;
+		cpumask_complement(d->notcovered, d->covered);
+		cpumask_and(d->tmpmask, d->notcovered, cpu_map);
+		cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
+		if (cpumask_empty(d->tmpmask))
+			break;
+		cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
+		if (cpumask_empty(d->tmpmask))
+			continue;
+		sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
+				  GFP_KERNEL, num);
+		if (!sg) {
+			printk(KERN_WARNING
+			       "Can not alloc domain group for node %d\n", j);
+			return -ENOMEM;
+		}
+		sg->__cpu_power = 0;
+		cpumask_copy(sched_group_cpus(sg), d->tmpmask);
+		sg->next = prev->next;
+		cpumask_or(d->covered, d->covered, d->tmpmask);
+		prev->next = sg;
+		prev = sg;
+	}
+out:
+	return 0;
+}
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_NUMA
@@ -8378,280 +8576,285 @@ static void set_domain_attribute(struct sched_domain *sd,
 	}
 }
 
-/*
- * Build sched domains for a given set of cpus and attach the sched domains
- * to the individual cpus
- */
-static int __build_sched_domains(const struct cpumask *cpu_map,
-				 struct sched_domain_attr *attr)
-{
-	int i, err = -ENOMEM;
-	struct root_domain *rd;
-	cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
-		tmpmask;
+static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
+				 const struct cpumask *cpu_map)
+{
+	switch (what) {
+	case sa_sched_groups:
+		free_sched_groups(cpu_map, d->tmpmask); /* fall through */
+		d->sched_group_nodes = NULL;
+	case sa_rootdomain:
+		free_rootdomain(d->rd); /* fall through */
+	case sa_tmpmask:
+		free_cpumask_var(d->tmpmask); /* fall through */
+	case sa_send_covered:
+		free_cpumask_var(d->send_covered); /* fall through */
+	case sa_this_core_map:
+		free_cpumask_var(d->this_core_map); /* fall through */
+	case sa_this_sibling_map:
+		free_cpumask_var(d->this_sibling_map); /* fall through */
+	case sa_nodemask:
+		free_cpumask_var(d->nodemask); /* fall through */
+	case sa_sched_group_nodes:
 #ifdef CONFIG_NUMA
-	cpumask_var_t domainspan, covered, notcovered;
-	struct sched_group **sched_group_nodes = NULL;
-	int sd_allnodes = 0;
-
-	if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
-		goto out;
-	if (!alloc_cpumask_var(&covered, GFP_KERNEL))
-		goto free_domainspan;
-	if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
-		goto free_covered;
-#endif
-
-	if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
-		goto free_notcovered;
-	if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
-		goto free_nodemask;
-	if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
-		goto free_this_sibling_map;
-	if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
-		goto free_this_core_map;
-	if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
-		goto free_send_covered;
+	kfree(d->sched_group_nodes); /* fall through */
+	case sa_notcovered:
+		free_cpumask_var(d->notcovered); /* fall through */
+	case sa_covered:
+		free_cpumask_var(d->covered); /* fall through */
+	case sa_domainspan:
+		free_cpumask_var(d->domainspan); /* fall through */
+#endif
+	case sa_none:
+		break;
+	}
+}
 
+static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
+						   const struct cpumask *cpu_map)
+{
 #ifdef CONFIG_NUMA
-	/*
-	 * Allocate the per-node list of sched groups
-	 */
-	sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
-				    GFP_KERNEL);
-	if (!sched_group_nodes) {
+	if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
+		return sa_none;
+	if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
+		return sa_domainspan;
+	if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
+		return sa_covered;
+	/* Allocate the per-node list of sched groups */
+	d->sched_group_nodes = kcalloc(nr_node_ids,
+				       sizeof(struct sched_group *), GFP_KERNEL);
+	if (!d->sched_group_nodes) {
 		printk(KERN_WARNING "Can not alloc sched group node list\n");
-		goto free_tmpmask;
+		return sa_notcovered;
 	}
-#endif
-
-	rd = alloc_rootdomain();
-	if (!rd) {
+	sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
+#endif
+	if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
+		return sa_sched_group_nodes;
+	if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
+		return sa_nodemask;
+	if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
+		return sa_this_sibling_map;
+	if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+		return sa_this_core_map;
+	if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
+		return sa_send_covered;
+	d->rd = alloc_rootdomain();
+	if (!d->rd) {
 		printk(KERN_WARNING "Cannot alloc root domain\n");
-		goto free_sched_groups;
+		return sa_tmpmask;
 	}
+	return sa_rootdomain;
+}
 
+static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
+	const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
+{
+	struct sched_domain *sd = NULL;
 #ifdef CONFIG_NUMA
-	sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
-#endif
-
-	/*
-	 * Set up domains for cpus specified by the cpu_map.
-	 */
-	for_each_cpu(i, cpu_map) {
-		struct sched_domain *sd = NULL, *p;
-
-		cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
-
-#ifdef CONFIG_NUMA
-		if (cpumask_weight(cpu_map) >
-				SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
-			sd = &per_cpu(allnodes_domains, i).sd;
-			SD_INIT(sd, ALLNODES);
-			set_domain_attribute(sd, attr);
-			cpumask_copy(sched_domain_span(sd), cpu_map);
-			cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
-			p = sd;
-			sd_allnodes = 1;
-		} else
-			p = NULL;
+	struct sched_domain *parent;
 
-		sd = &per_cpu(node_domains, i).sd;
-		SD_INIT(sd, NODE);
+	d->sd_allnodes = 0;
+	if (cpumask_weight(cpu_map) >
+	    SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
+		sd = &per_cpu(allnodes_domains, i).sd;
+		SD_INIT(sd, ALLNODES);
 		set_domain_attribute(sd, attr);
-		sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
-		sd->parent = p;
-		if (p)
-			p->child = sd;
-		cpumask_and(sched_domain_span(sd),
-			    sched_domain_span(sd), cpu_map);
+		cpumask_copy(sched_domain_span(sd), cpu_map);
+		cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
+		d->sd_allnodes = 1;
+	}
+	parent = sd;
+
+	sd = &per_cpu(node_domains, i).sd;
+	SD_INIT(sd, NODE);
+	set_domain_attribute(sd, attr);
+	sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
+	sd->parent = parent;
+	if (parent)
+		parent->child = sd;
+	cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
 #endif
+	return sd;
+}
 
-		p = sd;
-		sd = &per_cpu(phys_domains, i).sd;
-		SD_INIT(sd, CPU);
-		set_domain_attribute(sd, attr);
-		cpumask_copy(sched_domain_span(sd), nodemask);
-		sd->parent = p;
-		if (p)
-			p->child = sd;
-		cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);
+static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
+	const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+	struct sched_domain *parent, int i)
+{
+	struct sched_domain *sd;
+	sd = &per_cpu(phys_domains, i).sd;
+	SD_INIT(sd, CPU);
+	set_domain_attribute(sd, attr);
+	cpumask_copy(sched_domain_span(sd), d->nodemask);
+	sd->parent = parent;
+	if (parent)
+		parent->child = sd;
+	cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
+	return sd;
+}
 
+static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
+	const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+	struct sched_domain *parent, int i)
+{
+	struct sched_domain *sd = parent;
 #ifdef CONFIG_SCHED_MC
-		p = sd;
-		sd = &per_cpu(core_domains, i).sd;
-		SD_INIT(sd, MC);
-		set_domain_attribute(sd, attr);
-		cpumask_and(sched_domain_span(sd), cpu_map,
-			    cpu_coregroup_mask(i));
-		sd->parent = p;
-		p->child = sd;
-		cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
+	sd = &per_cpu(core_domains, i).sd;
+	SD_INIT(sd, MC);
+	set_domain_attribute(sd, attr);
+	cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
+	sd->parent = parent;
+	parent->child = sd;
+	cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
 #endif
+	return sd;
+}
 
+static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
+	const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+	struct sched_domain *parent, int i)
+{
+	struct sched_domain *sd = parent;
 #ifdef CONFIG_SCHED_SMT
-		p = sd;
-		sd = &per_cpu(cpu_domains, i).sd;
-		SD_INIT(sd, SIBLING);
-		set_domain_attribute(sd, attr);
-		cpumask_and(sched_domain_span(sd),
-			    topology_thread_cpumask(i), cpu_map);
-		sd->parent = p;
-		p->child = sd;
-		cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
+	sd = &per_cpu(cpu_domains, i).sd;
+	SD_INIT(sd, SIBLING);
+	set_domain_attribute(sd, attr);
+	cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
+	sd->parent = parent;
+	parent->child = sd;
+	cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
 #endif
-	}
+	return sd;
+}
 
+static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
+			       const struct cpumask *cpu_map, int cpu)
+{
+	switch (l) {
 #ifdef CONFIG_SCHED_SMT
-	/* Set up CPU (sibling) groups */
-	for_each_cpu(i, cpu_map) {
-		cpumask_and(this_sibling_map,
-			    topology_thread_cpumask(i), cpu_map);
-		if (i != cpumask_first(this_sibling_map))
-			continue;
-
-		init_sched_build_groups(this_sibling_map, cpu_map,
-					&cpu_to_cpu_group,
-					send_covered, tmpmask);
-	}
+	case SD_LV_SIBLING: /* set up CPU (sibling) groups */
+		cpumask_and(d->this_sibling_map, cpu_map,
+			    topology_thread_cpumask(cpu));
+		if (cpu == cpumask_first(d->this_sibling_map))
+			init_sched_build_groups(d->this_sibling_map, cpu_map,
+						&cpu_to_cpu_group,
+						d->send_covered, d->tmpmask);
+		break;
 #endif
-
 #ifdef CONFIG_SCHED_MC
-	/* Set up multi-core groups */
-	for_each_cpu(i, cpu_map) {
-		cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map);
-		if (i != cpumask_first(this_core_map))
-			continue;
-
-		init_sched_build_groups(this_core_map, cpu_map,
-					&cpu_to_core_group,
-					send_covered, tmpmask);
-	}
+	case SD_LV_MC: /* set up multi-core groups */
+		cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
+		if (cpu == cpumask_first(d->this_core_map))
+			init_sched_build_groups(d->this_core_map, cpu_map,
+						&cpu_to_core_group,
+						d->send_covered, d->tmpmask);
+		break;
 #endif
-
-	/* Set up physical groups */
-	for (i = 0; i < nr_node_ids; i++) {
-		cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
-		if (cpumask_empty(nodemask))
-			continue;
-
-		init_sched_build_groups(nodemask, cpu_map,
-					&cpu_to_phys_group,
-					send_covered, tmpmask);
-	}
-
+	case SD_LV_CPU: /* set up physical groups */
+		cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
+		if (!cpumask_empty(d->nodemask))
+			init_sched_build_groups(d->nodemask, cpu_map,
+						&cpu_to_phys_group,
+						d->send_covered, d->tmpmask);
+		break;
 #ifdef CONFIG_NUMA
-	/* Set up node groups */
-	if (sd_allnodes) {
-		init_sched_build_groups(cpu_map, cpu_map,
-					&cpu_to_allnodes_group,
-					send_covered, tmpmask);
+	case SD_LV_ALLNODES:
+		init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
+					d->send_covered, d->tmpmask);
+		break;
+#endif
+	default:
+		break;
 	}
+}
 
-	for (i = 0; i < nr_node_ids; i++) {
-		/* Set up node groups */
-		struct sched_group *sg, *prev;
-		int j;
-
-		cpumask_clear(covered);
-		cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
-		if (cpumask_empty(nodemask)) {
-			sched_group_nodes[i] = NULL;
-			continue;
-		}
+/*
 * Build sched domains for a given set of cpus and attach the sched domains
 * to the individual cpus
 */
+static int __build_sched_domains(const struct cpumask *cpu_map,
+				 struct sched_domain_attr *attr)
+{
+	enum s_alloc alloc_state = sa_none;
+	struct s_data d;
+	struct sched_domain *sd;
+	int i;
+#ifdef CONFIG_NUMA
+	d.sd_allnodes = 0;
+#endif
 
-		sched_domain_node_span(i, domainspan);
-		cpumask_and(domainspan, domainspan, cpu_map);
+	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
+	if (alloc_state != sa_rootdomain)
+		goto error;
+	alloc_state = sa_sched_groups;
 
-		sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
-				  GFP_KERNEL, i);
-		if (!sg) {
-			printk(KERN_WARNING "Can not alloc domain group for "
-			       "node %d\n", i);
-			goto error;
-		}
-		sched_group_nodes[i] = sg;
-		for_each_cpu(j, nodemask) {
-			struct sched_domain *sd;
+	/*
+	 * Set up domains for cpus specified by the cpu_map.
+	 */
+	for_each_cpu(i, cpu_map) {
+		cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
+			    cpu_map);
 
-			sd = &per_cpu(node_domains, j).sd;
-			sd->groups = sg;
-		}
-		sg->__cpu_power = 0;
-		cpumask_copy(sched_group_cpus(sg), nodemask);
-		sg->next = sg;
-		cpumask_or(covered, covered, nodemask);
-		prev = sg;
+		sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
+		sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
+		sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
+		sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
+	}
 
-		for (j = 0; j < nr_node_ids; j++) {
-			int n = (i + j) % nr_node_ids;
+	for_each_cpu(i, cpu_map) {
+		build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
+		build_sched_groups(&d, SD_LV_MC, cpu_map, i);
+	}
 
-			cpumask_complement(notcovered, covered);
-			cpumask_and(tmpmask, notcovered, cpu_map);
-			cpumask_and(tmpmask, tmpmask, domainspan);
-			if (cpumask_empty(tmpmask))
-				break;
+	/* Set up physical groups */
+	for (i = 0; i < nr_node_ids; i++)
+		build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
 
-			cpumask_and(tmpmask, tmpmask, cpumask_of_node(n));
-			if (cpumask_empty(tmpmask))
-				continue;
+#ifdef CONFIG_NUMA
+	/* Set up node groups */
+	if (d.sd_allnodes)
+		build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
 
-			sg = kmalloc_node(sizeof(struct sched_group) +
-					  cpumask_size(),
-					  GFP_KERNEL, i);
-			if (!sg) {
-				printk(KERN_WARNING
-				       "Can not alloc domain group for node %d\n", j);
-				goto error;
-			}
-			sg->__cpu_power = 0;
-			cpumask_copy(sched_group_cpus(sg), tmpmask);
-			sg->next = prev->next;
-			cpumask_or(covered, covered, tmpmask);
-			prev->next = sg;
-			prev = sg;
-		}
-	}
+	for (i = 0; i < nr_node_ids; i++)
+		if (build_numa_sched_groups(&d, cpu_map, i))
+			goto error;
 #endif
 
 	/* Calculate CPU power for physical packages and nodes */
 #ifdef CONFIG_SCHED_SMT
 	for_each_cpu(i, cpu_map) {
-		struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
-
+		sd = &per_cpu(cpu_domains, i).sd;
 		init_sched_groups_power(i, sd);
 	}
 #endif
 #ifdef CONFIG_SCHED_MC
 	for_each_cpu(i, cpu_map) {
-		struct sched_domain *sd = &per_cpu(core_domains, i).sd;
-
+		sd = &per_cpu(core_domains, i).sd;
 		init_sched_groups_power(i, sd);
 	}
 #endif
 
 	for_each_cpu(i, cpu_map) {
-		struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
-
+		sd = &per_cpu(phys_domains, i).sd;
 		init_sched_groups_power(i, sd);
 	}
 
 #ifdef CONFIG_NUMA
 	for (i = 0; i < nr_node_ids; i++)
-		init_numa_sched_groups_power(sched_group_nodes[i]);
+		init_numa_sched_groups_power(d.sched_group_nodes[i]);
 
-	if (sd_allnodes) {
+	if (d.sd_allnodes) {
 		struct sched_group *sg;
 
 		cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
-				       tmpmask);
+				       d.tmpmask);
 		init_numa_sched_groups_power(sg);
 	}
 #endif
 
 	/* Attach the domains */
 	for_each_cpu(i, cpu_map) {
-		struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
 		sd = &per_cpu(cpu_domains, i).sd;
 #elif defined(CONFIG_SCHED_MC)
@@ -8659,44 +8862,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 #else
 		sd = &per_cpu(phys_domains, i).sd;
 #endif
-		cpu_attach_domain(sd, rd, i);
+		cpu_attach_domain(sd, d.rd, i);
 	}
 
-	err = 0;
-
-free_tmpmask:
-	free_cpumask_var(tmpmask);
-free_send_covered:
-	free_cpumask_var(send_covered);
-free_this_core_map:
-	free_cpumask_var(this_core_map);
-free_this_sibling_map:
-	free_cpumask_var(this_sibling_map);
-free_nodemask:
-	free_cpumask_var(nodemask);
-free_notcovered:
-#ifdef CONFIG_NUMA
-	free_cpumask_var(notcovered);
-free_covered:
-	free_cpumask_var(covered);
-free_domainspan:
-	free_cpumask_var(domainspan);
-out:
-#endif
-	return err;
-
-free_sched_groups:
-#ifdef CONFIG_NUMA
-	kfree(sched_group_nodes);
-#endif
-	goto free_tmpmask;
+	d.sched_group_nodes = NULL; /* don't free this we still need it */
+	__free_domain_allocs(&d, sa_tmpmask, cpu_map);
+	return 0;
 
-#ifdef CONFIG_NUMA
 error:
-	free_sched_groups(cpu_map, tmpmask);
-	free_rootdomain(rd);
-	goto free_tmpmask;
-#endif
+	__free_domain_allocs(&d, alloc_state, cpu_map);
+	return -ENOMEM;
 }
 
 static int build_sched_domains(const struct cpumask *cpu_map)
@@ -9304,11 +9479,11 @@ void __init sched_init(void)
 		 * system cpu resource, based on the weight assigned to root
 		 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
 		 * by letting tasks of init_task_group sit in a separate cfs_rq
-		 * (init_cfs_rq) and having one entity represent this group of
+		 * (init_tg_cfs_rq) and having one entity represent this group of
 		 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
 		 */
 		init_tg_cfs_entry(&init_task_group,
				&per_cpu(init_cfs_rq, i),
-				&per_cpu(init_cfs_rq, i),
+				&per_cpu(init_tg_cfs_rq, i),
 				&per_cpu(init_sched_entity, i), i, 1,
 				root_task_group.se[i]);
 
@@ -9334,6 +9509,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
 		rq->sd = NULL;
 		rq->rd = NULL;
+		rq->post_schedule = 0;
 		rq->active_balance = 0;
 		rq->next_balance = jiffies;
 		rq->push_cpu = 0;
@@ -9398,13 +9574,20 @@ void __init sched_init(void)
 }
 
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-void __might_sleep(char *file, int line)
+static inline int preempt_count_equals(int preempt_offset)
+{
+	int nested = preempt_count() & ~PREEMPT_ACTIVE;
+
+	return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
+}
+
+void __might_sleep(char *file, int line, int preempt_offset)
 {
 #ifdef in_atomic
 	static unsigned long prev_jiffy;	/* ratelimiting */
 
-	if ((!in_atomic() && !irqs_disabled()) ||
+	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
 	    system_state != SYSTEM_RUNNING || oops_in_progress)
 		return;
 	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
 		return;