Diffstat (limited to 'kernel/sched.c'):
 kernel/sched.c | 2122
 1 file changed, 1005 insertions(+), 1117 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 18d38e4ec7ba..5e43e9dc65d1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -32,7 +32,6 @@
 #include <linux/init.h>
 #include <linux/uaccess.h>
 #include <linux/highmem.h>
-#include <linux/smp_lock.h>
 #include <asm/mmu_context.h>
 #include <linux/interrupt.h>
 #include <linux/capability.h>
@@ -232,7 +231,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
 #endif
 
 /*
- * sched_domains_mutex serializes calls to arch_init_sched_domains,
+ * sched_domains_mutex serializes calls to init_sched_domains,
  * detach_destroy_domains and partition_sched_domains.
  */
 static DEFINE_MUTEX(sched_domains_mutex);
@@ -294,7 +293,7 @@ static DEFINE_SPINLOCK(task_group_lock);
  * limitation from this.)
  */
 #define MIN_SHARES	2
-#define MAX_SHARES	(1UL << 18)
+#define MAX_SHARES	(1UL << (18 + SCHED_LOAD_RESOLUTION))
 
 static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
 #endif
@@ -313,6 +312,9 @@ struct cfs_rq {
 
 	u64 exec_clock;
 	u64 min_vruntime;
+#ifndef CONFIG_64BIT
+	u64 min_vruntime_copy;
+#endif
 
 	struct rb_root tasks_timeline;
 	struct rb_node *rb_leftmost;
@@ -324,9 +326,11 @@ struct cfs_rq {
 	 * 'curr' points to currently running entity on this cfs_rq.
 	 * It is set to NULL otherwise (i.e when none are currently running).
 	 */
-	struct sched_entity *curr, *next, *last;
+	struct sched_entity *curr, *next, *last, *skip;
 
+#ifdef CONFIG_SCHED_DEBUG
 	unsigned int nr_spread_over;
+#endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
@@ -418,6 +422,7 @@ struct rt_rq {
  */
 struct root_domain {
 	atomic_t refcount;
+	struct rcu_head rcu;
 	cpumask_var_t span;
 	cpumask_var_t online;
 
@@ -461,7 +466,7 @@ struct rq {
 	u64 nohz_stamp;
 	unsigned char nohz_balance_kick;
 #endif
-	unsigned int skip_clock_update;
+	int skip_clock_update;
 
 	/* capture load from *all* tasks on this cpu: */
 	struct load_weight load;
@@ -554,6 +559,10 @@ struct rq {
 	unsigned int ttwu_count;
 	unsigned int ttwu_local;
 #endif
+
+#ifdef CONFIG_SMP
+	struct task_struct *wake_list;
+#endif
 };
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -572,7 +581,7 @@ static inline int cpu_of(struct rq *rq)
 
 #define rcu_dereference_check_sched_domain(p) \
 	rcu_dereference_check((p), \
-			      rcu_read_lock_sched_held() || \
+			      rcu_read_lock_held() || \
			      lockdep_is_held(&sched_domains_mutex))
 
 /*
@@ -597,7 +606,7 @@ static inline int cpu_of(struct rq *rq)
  * Return the group to which this tasks belongs.
  *
  * We use task_subsys_state_check() and extend the RCU verification
- * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
+ * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
  * holds that lock for each task it moves into the cgroup. Therefore
  * by holding that lock, we pin the task to the current cgroup.
  */
@@ -606,11 +615,8 @@ static inline struct task_group *task_group(struct task_struct *p)
 	struct task_group *tg;
 	struct cgroup_subsys_state *css;
 
-	if (p->flags & PF_EXITING)
-		return &root_task_group;
-
 	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
-			lockdep_is_held(&task_rq(p)->lock));
+			lockdep_is_held(&p->pi_lock));
 	tg = container_of(css, struct task_group, css);
 
 	return autogroup_task_group(p, tg);
@@ -646,7 +652,7 @@ static void update_rq_clock(struct rq *rq)
 {
 	s64 delta;
 
-	if (rq->skip_clock_update)
+	if (rq->skip_clock_update > 0)
 		return;
 
 	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -664,10 +670,9 @@ static void update_rq_clock(struct rq *rq)
 #endif
 
 /**
- * runqueue_is_locked
+ * runqueue_is_locked - Returns true if the current cpu runqueue is locked
  * @cpu: the processor in question.
  *
- * Returns true if the current cpu runqueue is locked.
  * This interface allows printk to be called with the runqueue lock
  * held and know whether or not it is OK to wake up the klogd.
  */
@@ -843,18 +848,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
 	return rq->curr == p;
 }
 
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
 static inline int task_running(struct rq *rq, struct task_struct *p)
 {
+#ifdef CONFIG_SMP
+	return p->on_cpu;
+#else
 	return task_current(rq, p);
+#endif
 }
 
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 {
+#ifdef CONFIG_SMP
+	/*
+	 * We can optimise this out completely for !SMP, because the
+	 * SMP rebalancing from interrupt is the only thing that cares
+	 * here.
+	 */
+	next->on_cpu = 1;
+#endif
 }
 
 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 {
+#ifdef CONFIG_SMP
+	/*
+	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
+	 * We must ensure this doesn't happen until the switch is completely
+	 * finished.
+	 */
+	smp_wmb();
+	prev->on_cpu = 0;
+#endif
 #ifdef CONFIG_DEBUG_SPINLOCK
 	/* this is a valid case when another task releases the spinlock */
 	rq->lock.owner = current;
@@ -870,15 +896,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 }
 
 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
-static inline int task_running(struct rq *rq, struct task_struct *p)
-{
-#ifdef CONFIG_SMP
-	return p->oncpu;
-#else
-	return task_current(rq, p);
-#endif
-}
-
 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 {
 #ifdef CONFIG_SMP
@@ -887,7 +904,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 	 * SMP rebalancing from interrupt is the only thing that cares
 	 * here.
 	 */
-	next->oncpu = 1;
+	next->on_cpu = 1;
 #endif
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 	raw_spin_unlock_irq(&rq->lock);
@@ -900,12 +917,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 {
 #ifdef CONFIG_SMP
 	/*
-	 * After ->oncpu is cleared, the task can be moved to a different CPU.
+	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
 	 * We must ensure this doesn't happen until the switch is completely
 	 * finished.
 	 */
 	smp_wmb();
-	prev->oncpu = 0;
+	prev->on_cpu = 0;
 #endif
 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 	local_irq_enable();
@@ -914,23 +931,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
 /*
- * Check whether the task is waking, we use this to synchronize ->cpus_allowed
- * against ttwu().
- */
-static inline int task_is_waking(struct task_struct *p)
-{
-	return unlikely(p->state == TASK_WAKING);
-}
-
-/*
- * __task_rq_lock - lock the runqueue a given task resides on.
- * Must be called interrupts disabled.
+ * __task_rq_lock - lock the rq @p resides on.
 */
 static inline struct rq *__task_rq_lock(struct task_struct *p)
 	__acquires(rq->lock)
 {
 	struct rq *rq;
 
+	lockdep_assert_held(&p->pi_lock);
+
 	for (;;) {
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
@@ -941,22 +950,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
 }
 
 /*
- * task_rq_lock - lock the runqueue a given task resides on and disable
- * interrupts. Note the ordering: we can safely lookup the task_rq without
- * explicitly disabling preemption.
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */
 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+	__acquires(p->pi_lock)
 	__acquires(rq->lock)
 {
 	struct rq *rq;
 
 	for (;;) {
-		local_irq_save(*flags);
+		raw_spin_lock_irqsave(&p->pi_lock, *flags);
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
 		if (likely(rq == task_rq(p)))
 			return rq;
-		raw_spin_unlock_irqrestore(&rq->lock, *flags);
+		raw_spin_unlock(&rq->lock);
+		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 	}
 }
 
@@ -966,10 +975,13 @@ static void __task_rq_unlock(struct rq *rq)
 	raw_spin_unlock(&rq->lock);
 }
 
-static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
 	__releases(rq->lock)
+	__releases(p->pi_lock)
 {
-	raw_spin_unlock_irqrestore(&rq->lock, *flags);
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 }
 
 /*
@@ -1198,11 +1210,17 @@ int get_nohz_timer_target(void)
 	int i;
 	struct sched_domain *sd;
 
+	rcu_read_lock();
 	for_each_domain(cpu, sd) {
-		for_each_cpu(i, sched_domain_span(sd))
-			if (!idle_cpu(i))
-				return i;
+		for_each_cpu(i, sched_domain_span(sd)) {
+			if (!idle_cpu(i)) {
+				cpu = i;
+				goto unlock;
+			}
+		}
 	}
+unlock:
+	rcu_read_unlock();
 	return cpu;
 }
 /*
@@ -1312,15 +1330,27 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 {
 	u64 tmp;
 
+	/*
+	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
+	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
+	 * 2^SCHED_LOAD_RESOLUTION.
+	 */
+	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
+		tmp = (u64)delta_exec * scale_load_down(weight);
+	else
+		tmp = (u64)delta_exec;
+
 	if (!lw->inv_weight) {
-		if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
+		unsigned long w = scale_load_down(lw->weight);
+
+		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
 			lw->inv_weight = 1;
+		else if (unlikely(!w))
+			lw->inv_weight = WMULT_CONST;
 		else
-			lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
-				/ (lw->weight+1);
+			lw->inv_weight = WMULT_CONST / w;
 	}
 
-	tmp = (u64)delta_exec * weight;
 	/*
 	 * Check whether we'd overflow the 64-bit multiplication:
 	 */
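Note: the calc_delta_mine() hunk above keeps the reciprocal of the scaled-down weight so that delta_exec * weight / lw->weight becomes a multiply and a shift. A rough, self-contained C model of that arithmetic is sketched below; it assumes WMULT_CONST is ~0U and SCHED_LOAD_RESOLUTION is 10 as in the hunk, and it omits the 64-bit overflow handling the real code performs.

#include <stdint.h>
#include <stdio.h>

#define SCHED_LOAD_RESOLUTION	10
#define WMULT_CONST		(~0U)	/* close to 2^32 */

static unsigned long scale_load_down(unsigned long w)
{
	return w >> SCHED_LOAD_RESOLUTION;
}

/* Approximates delta_exec * weight / lw_weight via a precomputed reciprocal. */
static uint64_t calc_delta_sketch(uint64_t delta_exec, unsigned long weight,
				  unsigned long lw_weight)
{
	unsigned long w = scale_load_down(lw_weight);
	uint32_t inv = w ? WMULT_CONST / w : WMULT_CONST;
	uint64_t tmp;

	/* group entities can have a weight below one full unit */
	if (weight > (1UL << SCHED_LOAD_RESOLUTION))
		tmp = delta_exec * (uint64_t)scale_load_down(weight);
	else
		tmp = delta_exec;

	return (tmp * inv) >> 32;	/* tmp / w, done as multiply + shift */
}

int main(void)
{
	/* 1ms of runtime for a unit-weight task against a doubly loaded queue */
	printf("%llu\n", (unsigned long long)
	       calc_delta_sketch(1000000, 1024UL << SCHED_LOAD_RESOLUTION,
				 2048UL << SCHED_LOAD_RESOLUTION));
	return 0;
}

Keeping weights at a higher internal resolution (the new scale_load()/scale_load_down() pair) reduces rounding error for deep cgroup hierarchies while the division itself stays a cheap reciprocal multiply.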
@@ -1686,6 +1716,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 	__release(rq2->lock);
 }
 
+#else /* CONFIG_SMP */
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static void double_rq_lock(struct rq *rq1, struct rq *rq2)
+	__acquires(rq1->lock)
+	__acquires(rq2->lock)
+{
+	BUG_ON(!irqs_disabled());
+	BUG_ON(rq1 != rq2);
+	raw_spin_lock(&rq1->lock);
+	__acquire(rq2->lock);	/* Fake it out ;) */
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+	__releases(rq1->lock)
+	__releases(rq2->lock)
+{
+	BUG_ON(rq1 != rq2);
+	raw_spin_unlock(&rq1->lock);
+	__release(rq2->lock);
+}
+
 #endif
 
 static void calc_load_account_idle(struct rq *this_rq);
@@ -1727,17 +1790,20 @@ static void dec_nr_running(struct rq *rq)
 
 static void set_load_weight(struct task_struct *p)
 {
+	int prio = p->static_prio - MAX_RT_PRIO;
+	struct load_weight *load = &p->se.load;
+
 	/*
 	 * SCHED_IDLE tasks get minimal weight:
 	 */
 	if (p->policy == SCHED_IDLE) {
-		p->se.load.weight = WEIGHT_IDLEPRIO;
-		p->se.load.inv_weight = WMULT_IDLEPRIO;
+		load->weight = scale_load(WEIGHT_IDLEPRIO);
+		load->inv_weight = WMULT_IDLEPRIO;
 		return;
 	}
 
-	p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
-	p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
+	load->weight = scale_load(prio_to_weight[prio]);
+	load->inv_weight = prio_to_wmult[prio];
 }
 
 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1745,7 +1811,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	update_rq_clock(rq);
 	sched_info_queued(p);
 	p->sched_class->enqueue_task(rq, p, flags);
-	p->se.on_rq = 1;
 }
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1753,7 +1818,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 	update_rq_clock(rq);
 	sched_info_dequeued(p);
 	p->sched_class->dequeue_task(rq, p, flags);
-	p->se.on_rq = 0;
 }
 
 /*
@@ -1880,7 +1944,7 @@ void account_system_vtime(struct task_struct *curr)
 	 */
 	if (hardirq_count())
 		__this_cpu_add(cpu_hardirq_time, delta);
-	else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
 		__this_cpu_add(cpu_softirq_time, delta);
 
 	irq_time_write_end();
@@ -1920,8 +1984,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 	sched_rt_avg_update(rq, irq_delta);
 }
 
+static int irqtime_account_hi_update(void)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	unsigned long flags;
+	u64 latest_ns;
+	int ret = 0;
+
+	local_irq_save(flags);
+	latest_ns = this_cpu_read(cpu_hardirq_time);
+	if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
+		ret = 1;
+	local_irq_restore(flags);
+	return ret;
+}
+
+static int irqtime_account_si_update(void)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	unsigned long flags;
+	u64 latest_ns;
+	int ret = 0;
+
+	local_irq_save(flags);
+	latest_ns = this_cpu_read(cpu_softirq_time);
+	if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
+		ret = 1;
+	local_irq_restore(flags);
+	return ret;
+}
+
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 
+#define sched_clock_irqtime	(0)
+
 static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
 	rq->clock_task += delta;
@@ -2025,14 +2121,14 @@ inline int task_curr(const struct task_struct *p)
 
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 				       const struct sched_class *prev_class,
-				       int oldprio, int running)
+				       int oldprio)
 {
 	if (prev_class != p->sched_class) {
 		if (prev_class->switched_from)
-			prev_class->switched_from(rq, p, running);
-		p->sched_class->switched_to(rq, p, running);
-	} else
-		p->sched_class->prio_changed(rq, p, oldprio, running);
+			prev_class->switched_from(rq, p);
+		p->sched_class->switched_to(rq, p);
+	} else if (oldprio != p->prio)
+		p->sched_class->prio_changed(rq, p, oldprio);
 }
 
 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
@@ -2056,7 +2152,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 	 * A queue event has occurred, and we're going to schedule. In
 	 * this case, we can save a useless back to back clock update.
 	 */
-	if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
+	if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
 		rq->skip_clock_update = 1;
 }
 
@@ -2102,6 +2198,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	 */
 	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
 			!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+
+#ifdef CONFIG_LOCKDEP
+	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+				      lockdep_is_held(&task_rq(p)->lock)));
+#endif
 #endif
 
 	trace_sched_migrate_task(p, new_cpu);
@@ -2122,19 +2223,6 @@ struct migration_arg {
 static int migration_cpu_stop(void *data);
 
 /*
- * The task's runqueue lock must be held.
- * Returns true if you have to wait for migration thread.
- */
-static bool migrate_task(struct task_struct *p, struct rq *rq)
-{
-	/*
-	 * If the task is not on a runqueue (and not running), then
-	 * the next wake-up will properly place the task.
-	 */
-	return p->se.on_rq || task_running(rq, p);
-}
-
-/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is nonzero, it's the @p->state value just checked and
@@ -2191,11 +2279,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		rq = task_rq_lock(p, &flags);
 		trace_sched_wait_task(p);
 		running = task_running(rq, p);
-		on_rq = p->se.on_rq;
+		on_rq = p->on_rq;
 		ncsw = 0;
 		if (!match_state || p->state == match_state)
 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
-		task_rq_unlock(rq, &flags);
+		task_rq_unlock(rq, p, &flags);
 
 		/*
 		 * If it changed from the expected state, bail out now.
@@ -2224,7 +2312,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		 * yield - it could be a while.
 		 */
 		if (unlikely(on_rq)) {
-			schedule_timeout_uninterruptible(1);
+			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
+
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
 			continue;
 		}
 
@@ -2246,7 +2337,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
- * NOTE: this function doesnt have to take the runqueue lock,
+ * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
@@ -2265,30 +2356,9 @@ void kick_process(struct task_struct *p)
 EXPORT_SYMBOL_GPL(kick_process);
 #endif /* CONFIG_SMP */
 
-/**
- * task_oncpu_function_call - call a function on the cpu on which a task runs
- * @p: the task to evaluate
- * @func: the function to be called
- * @info: the function call argument
- *
- * Calls the function @func when the task is currently running. This might
- * be on the current CPU, which just calls the function directly
- */
-void task_oncpu_function_call(struct task_struct *p,
-			      void (*func) (void *info), void *info)
-{
-	int cpu;
-
-	preempt_disable();
-	cpu = task_cpu(p);
-	if (task_curr(p))
-		smp_call_function_single(cpu, func, info, 1);
-	preempt_enable();
-}
-
 #ifdef CONFIG_SMP
 /*
- * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
+ * ->cpus_allowed is protected by both rq->lock and p->pi_lock
 */
 static int select_fallback_rq(int cpu, struct task_struct *p)
 {
@@ -2321,12 +2391,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 }
 
 /*
- * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
 */
 static inline
-int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
 {
-	int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
+	int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
 
 	/*
 	 * In order not to call set_task_cpu() on a blocking task we need
@@ -2352,27 +2422,62 @@ static void update_avg(u64 *avg, u64 sample)
 }
 #endif
 
-static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
-				 bool is_sync, bool is_migrate, bool is_local,
-				 unsigned long en_flags)
+static void
+ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 {
+#ifdef CONFIG_SCHEDSTATS
+	struct rq *rq = this_rq();
+
+#ifdef CONFIG_SMP
+	int this_cpu = smp_processor_id();
+
+	if (cpu == this_cpu) {
+		schedstat_inc(rq, ttwu_local);
+		schedstat_inc(p, se.statistics.nr_wakeups_local);
+	} else {
+		struct sched_domain *sd;
+
+		schedstat_inc(p, se.statistics.nr_wakeups_remote);
+		rcu_read_lock();
+		for_each_domain(this_cpu, sd) {
+			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+				schedstat_inc(sd, ttwu_wake_remote);
+				break;
+			}
+		}
+		rcu_read_unlock();
+	}
+#endif /* CONFIG_SMP */
+
+	schedstat_inc(rq, ttwu_count);
 	schedstat_inc(p, se.statistics.nr_wakeups);
-	if (is_sync)
+
+	if (wake_flags & WF_SYNC)
 		schedstat_inc(p, se.statistics.nr_wakeups_sync);
-	if (is_migrate)
+
+	if (cpu != task_cpu(p))
 		schedstat_inc(p, se.statistics.nr_wakeups_migrate);
-	if (is_local)
-		schedstat_inc(p, se.statistics.nr_wakeups_local);
-	else
-		schedstat_inc(p, se.statistics.nr_wakeups_remote);
 
+#endif /* CONFIG_SCHEDSTATS */
+}
+
+static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+{
 	activate_task(rq, p, en_flags);
+	p->on_rq = 1;
+
+	/* if a worker is waking up, notify workqueue */
+	if (p->flags & PF_WQ_WORKER)
+		wq_worker_waking_up(p, cpu_of(rq));
 }
 
-static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
-					int wake_flags, bool success)
+/*
+ * Mark the task runnable and perform wakeup-preemption.
+ */
+static void
+ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 {
-	trace_sched_wakeup(p, success);
+	trace_sched_wakeup(p, true);
 	check_preempt_curr(rq, p, wake_flags);
 
 	p->state = TASK_RUNNING;
@@ -2391,9 +2496,99 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
 		rq->idle_stamp = 0;
 	}
 #endif
-	/* if a worker is waking up, notify workqueue */
-	if ((p->flags & PF_WQ_WORKER) && success)
-		wq_worker_waking_up(p, cpu_of(rq));
+}
+
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+#ifdef CONFIG_SMP
+	if (p->sched_contributes_to_load)
+		rq->nr_uninterruptible--;
+#endif
+
+	ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
+	ttwu_do_wakeup(rq, p, wake_flags);
+}
+
+/*
+ * Called in case the task @p isn't fully descheduled from its runqueue,
+ * in this case we must do a remote wakeup. Its a 'light' wakeup though,
+ * since all we need to do is flip p->state to TASK_RUNNING, since
+ * the task is still ->on_rq.
+ */
+static int ttwu_remote(struct task_struct *p, int wake_flags)
+{
+	struct rq *rq;
+	int ret = 0;
+
+	rq = __task_rq_lock(p);
+	if (p->on_rq) {
+		ttwu_do_wakeup(rq, p, wake_flags);
+		ret = 1;
+	}
+	__task_rq_unlock(rq);
+
+	return ret;
+}
+
+#ifdef CONFIG_SMP
+static void sched_ttwu_pending(void)
+{
+	struct rq *rq = this_rq();
+	struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+	if (!list)
+		return;
+
+	raw_spin_lock(&rq->lock);
+
+	while (list) {
+		struct task_struct *p = list;
+		list = list->wake_entry;
+		ttwu_do_activate(rq, p, 0);
+	}
+
+	raw_spin_unlock(&rq->lock);
+}
+
+void scheduler_ipi(void)
+{
+	sched_ttwu_pending();
+}
+
+static void ttwu_queue_remote(struct task_struct *p, int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct task_struct *next = rq->wake_list;
+
+	for (;;) {
+		struct task_struct *old = next;
+
+		p->wake_entry = next;
+		next = cmpxchg(&rq->wake_list, old, p);
+		if (next == old)
+			break;
+	}
+
+	if (!next)
+		smp_send_reschedule(cpu);
+}
+#endif
+
+static void ttwu_queue(struct task_struct *p, int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+#if defined(CONFIG_SMP)
+	if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
+		ttwu_queue_remote(p, cpu);
+		return;
+	}
+#endif
+
+	raw_spin_lock(&rq->lock);
+	ttwu_do_activate(rq, p, 0);
+	raw_spin_unlock(&rq->lock);
 }
 
 /**
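Note: the wake-list code added above lets a remote wakeup push a task onto the target runqueue's wake_list with cmpxchg() and have scheduler_ipi() drain the whole list with a single xchg(). The sketch below models that producer/consumer protocol in userspace with C11 atomics; the struct and function names are invented for illustration and are not kernel APIs.

#include <stdatomic.h>
#include <stddef.h>

struct node {
	struct node *next;
	/* per-task wakeup state would live here */
};

struct wake_queue {
	_Atomic(struct node *) head;
};

/* Like ttwu_queue_remote(): push one entry, report whether a kick is needed. */
int wake_queue_push(struct wake_queue *q, struct node *n)
{
	struct node *old = atomic_load(&q->head);

	do {
		n->next = old;		/* link to the current head */
	} while (!atomic_compare_exchange_weak(&q->head, &old, n));

	return old == NULL;	/* list was empty: send the "IPI" */
}

/* Like sched_ttwu_pending(): detach the whole list at once, then walk it. */
void wake_queue_drain(struct wake_queue *q, void (*activate)(struct node *))
{
	struct node *n = atomic_exchange(&q->head, NULL);

	while (n) {
		struct node *next = n->next;
		activate(n);
		n = next;
	}
}

The single exchange in the drain path means the waking CPU never has to take the remote rq->lock; only the IPI handler on the owning CPU does, which is the contention the TTWU_QUEUE feature is meant to remove.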
@@ -2411,92 +2606,64 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
 * Returns %true if @p was woken up, %false if it was already running
 * or @state didn't match @p's state.
 */
-static int try_to_wake_up(struct task_struct *p, unsigned int state,
-			  int wake_flags)
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 {
-	int cpu, orig_cpu, this_cpu, success = 0;
 	unsigned long flags;
-	unsigned long en_flags = ENQUEUE_WAKEUP;
-	struct rq *rq;
-
-	this_cpu = get_cpu();
+	int cpu, success = 0;
 
 	smp_wmb();
-	rq = task_rq_lock(p, &flags);
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	if (!(p->state & state))
 		goto out;
 
-	if (p->se.on_rq)
-		goto out_running;
-
+	success = 1; /* we're going to change ->state */
 	cpu = task_cpu(p);
-	orig_cpu = cpu;
 
-#ifdef CONFIG_SMP
-	if (unlikely(task_running(rq, p)))
-		goto out_activate;
+	if (p->on_rq && ttwu_remote(p, wake_flags))
+		goto stat;
 
+#ifdef CONFIG_SMP
 	/*
-	 * In order to handle concurrent wakeups and release the rq->lock
-	 * we put the task in TASK_WAKING state.
-	 *
-	 * First fix up the nr_uninterruptible count:
+	 * If the owning (remote) cpu is still in the middle of schedule() with
+	 * this task as prev, wait until its done referencing the task.
 	 */
-	if (task_contributes_to_load(p)) {
-		if (likely(cpu_online(orig_cpu)))
-			rq->nr_uninterruptible--;
-		else
-			this_rq()->nr_uninterruptible--;
-	}
-	p->state = TASK_WAKING;
-
-	if (p->sched_class->task_waking) {
-		p->sched_class->task_waking(rq, p);
-		en_flags |= ENQUEUE_WAKING;
+	while (p->on_cpu) {
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+		/*
+		 * If called from interrupt context we could have landed in the
+		 * middle of schedule(), in this case we should take care not
+		 * to spin on ->on_cpu if p is current, since that would
+		 * deadlock.
+		 */
+		if (p == current) {
+			ttwu_queue(p, cpu);
+			goto stat;
+		}
+#endif
+		cpu_relax();
 	}
-
-	cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
-	if (cpu != orig_cpu)
-		set_task_cpu(p, cpu);
-	__task_rq_unlock(rq);
-
-	rq = cpu_rq(cpu);
-	raw_spin_lock(&rq->lock);
-
 	/*
-	 * We migrated the task without holding either rq->lock, however
-	 * since the task is not on the task list itself, nobody else
-	 * will try and migrate the task, hence the rq should match the
-	 * cpu we just moved it to.
+	 * Pairs with the smp_wmb() in finish_lock_switch().
 	 */
-	WARN_ON(task_cpu(p) != cpu);
-	WARN_ON(p->state != TASK_WAKING);
+	smp_rmb();
 
-#ifdef CONFIG_SCHEDSTATS
-	schedstat_inc(rq, ttwu_count);
-	if (cpu == this_cpu)
-		schedstat_inc(rq, ttwu_local);
-	else {
-		struct sched_domain *sd;
-		for_each_domain(this_cpu, sd) {
-			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-				schedstat_inc(sd, ttwu_wake_remote);
-				break;
-			}
-		}
-	}
-#endif /* CONFIG_SCHEDSTATS */
+	p->sched_contributes_to_load = !!task_contributes_to_load(p);
+	p->state = TASK_WAKING;
 
-out_activate:
+	if (p->sched_class->task_waking)
+		p->sched_class->task_waking(p);
+
+	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+	if (task_cpu(p) != cpu)
+		set_task_cpu(p, cpu);
 #endif /* CONFIG_SMP */
-	ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
-		      cpu == this_cpu, en_flags);
-	success = 1;
-out_running:
-	ttwu_post_activation(p, rq, wake_flags, success);
+
+	ttwu_queue(p, cpu);
+stat:
+	ttwu_stat(p, cpu, wake_flags);
 out:
-	task_rq_unlock(rq, &flags);
-	put_cpu();
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 	return success;
 }
@@ -2505,31 +2672,34 @@ out:
 * try_to_wake_up_local - try to wake up a local task with rq lock held
 * @p: the thread to be awakened
 *
 * Put @p on the run-queue if it's not already there. The caller must
 * ensure that this_rq() is locked, @p is bound to this_rq() and not
- * the current task. this_rq() stays locked over invocation.
+ * the current task.
 */
 static void try_to_wake_up_local(struct task_struct *p)
 {
 	struct rq *rq = task_rq(p);
-	bool success = false;
 
 	BUG_ON(rq != this_rq());
 	BUG_ON(p == current);
 	lockdep_assert_held(&rq->lock);
 
+	if (!raw_spin_trylock(&p->pi_lock)) {
+		raw_spin_unlock(&rq->lock);
+		raw_spin_lock(&p->pi_lock);
+		raw_spin_lock(&rq->lock);
+	}
+
 	if (!(p->state & TASK_NORMAL))
-		return;
+		goto out;
 
-	if (!p->se.on_rq) {
-		if (likely(!task_running(rq, p))) {
-			schedstat_inc(rq, ttwu_count);
-			schedstat_inc(rq, ttwu_local);
-		}
-		ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
-		success = true;
-	}
-	ttwu_post_activation(p, rq, 0, success);
+	if (!p->on_rq)
+		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+
+	ttwu_do_wakeup(rq, p, 0);
+	ttwu_stat(p, smp_processor_id(), 0);
+out:
+	raw_spin_unlock(&p->pi_lock);
 }
 
 /**
@@ -2562,18 +2732,21 @@ int wake_up_state(struct task_struct *p, unsigned int state)
 */
 static void __sched_fork(struct task_struct *p)
 {
+	p->on_rq = 0;
+
+	p->se.on_rq = 0;
 	p->se.exec_start = 0;
 	p->se.sum_exec_runtime = 0;
 	p->se.prev_sum_exec_runtime = 0;
 	p->se.nr_migrations = 0;
+	p->se.vruntime = 0;
+	INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 
 	INIT_LIST_HEAD(&p->rt.run_list);
-	p->se.on_rq = 0;
-	INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2583,8 +2756,9 @@ static void __sched_fork(struct task_struct *p)
 /*
 * fork()/clone()-time setup:
 */
-void sched_fork(struct task_struct *p, int clone_flags)
+void sched_fork(struct task_struct *p)
 {
+	unsigned long flags;
 	int cpu = get_cpu();
 
 	__sched_fork(p);
@@ -2635,16 +2809,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
 	 *
 	 * Silence PROVE_RCU.
 	 */
-	rcu_read_lock();
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	set_task_cpu(p, cpu);
-	rcu_read_unlock();
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	if (likely(sched_info_on()))
 		memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
-	p->oncpu = 0;
+#if defined(CONFIG_SMP)
+	p->on_cpu = 0;
 #endif
 #ifdef CONFIG_PREEMPT
 	/* Want to start with kernel preemption disabled. */
@@ -2664,41 +2838,31 @@ void sched_fork(struct task_struct *p, int clone_flags)
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */
-void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
+void wake_up_new_task(struct task_struct *p)
 {
 	unsigned long flags;
 	struct rq *rq;
-	int cpu __maybe_unused = get_cpu();
 
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
 #ifdef CONFIG_SMP
-	rq = task_rq_lock(p, &flags);
-	p->state = TASK_WAKING;
-
 	/*
 	 * Fork balancing, do it here and not earlier because:
 	 * - cpus_allowed can change in the fork path
 	 * - any previously selected cpu might disappear through hotplug
-	 *
-	 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
-	 * without people poking at ->cpus_allowed.
 	 */
-	cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
-	set_task_cpu(p, cpu);
-
-	p->state = TASK_RUNNING;
-	task_rq_unlock(rq, &flags);
+	set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
 #endif
 
-	rq = task_rq_lock(p, &flags);
+	rq = __task_rq_lock(p);
 	activate_task(rq, p, 0);
-	trace_sched_wakeup_new(p, 1);
+	p->on_rq = 1;
+	trace_sched_wakeup_new(p, true);
 	check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_woken)
 		p->sched_class->task_woken(rq, p);
 #endif
-	task_rq_unlock(rq, &flags);
-	put_cpu();
+	task_rq_unlock(rq, p, &flags);
 }
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2776,9 +2940,12 @@ static inline void
 prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
 {
+	sched_info_switch(prev, next);
+	perf_event_task_sched_out(prev, next);
 	fire_sched_out_preempt_notifiers(prev, next);
 	prepare_lock_switch(rq, next);
 	prepare_arch_switch(next);
+	trace_sched_switch(prev, next);
 }
 
 /**
@@ -2911,7 +3078,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	struct mm_struct *mm, *oldmm;
 
 	prepare_task_switch(rq, prev, next);
-	trace_sched_switch(prev, next);
+
 	mm = next->mm;
 	oldmm = prev->active_mm;
 	/*
@@ -3404,27 +3571,22 @@ void sched_exec(void)
 {
 	struct task_struct *p = current;
 	unsigned long flags;
-	struct rq *rq;
 	int dest_cpu;
 
-	rq = task_rq_lock(p, &flags);
-	dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
 	if (dest_cpu == smp_processor_id())
 		goto unlock;
 
-	/*
-	 * select_task_rq() can race against ->cpus_allowed
-	 */
-	if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
-	    likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
+	if (likely(cpu_active(dest_cpu))) {
 		struct migration_arg arg = { p, dest_cpu };
 
-		task_rq_unlock(rq, &flags);
-		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+		stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
 		return;
 	}
 unlock:
-	task_rq_unlock(rq, &flags);
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 }
 
 #endif
@@ -3461,7 +3623,7 @@ unsigned long long task_delta_exec(struct task_struct *p)
 
 	rq = task_rq_lock(p, &flags);
 	ns = do_task_delta_exec(p, rq);
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, p, &flags);
 
 	return ns;
 }
@@ -3479,7 +3641,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 
 	rq = task_rq_lock(p, &flags);
 	ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, p, &flags);
 
 	return ns;
 }
@@ -3503,7 +3665,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
 	rq = task_rq_lock(p, &flags);
 	thread_group_cputime(p, &totals);
 	ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, p, &flags);
 
 	return ns;
 }
@@ -3568,6 +3730,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
 }
 
 /*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+			cputime_t cputime_scaled, cputime64_t *target_cputime64)
+{
+	cputime64_t tmp = cputime_to_cputime64(cputime);
+
+	/* Add system time to process. */
+	p->stime = cputime_add(p->stime, cputime);
+	p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
+	account_group_system_time(p, cputime);
+
+	/* Add system time to cpustat. */
+	*target_cputime64 = cputime64_add(*target_cputime64, tmp);
+	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
+	/* Account for system time used */
+	acct_update_integrals(p);
+}
+
+/*
 * Account system cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3578,36 +3766,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
			 cputime_t cputime, cputime_t cputime_scaled)
 {
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-	cputime64_t tmp;
+	cputime64_t *target_cputime64;
 
 	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
 		account_guest_time(p, cputime, cputime_scaled);
 		return;
 	}
 
-	/* Add system time to process. */
-	p->stime = cputime_add(p->stime, cputime);
-	p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
-	account_group_system_time(p, cputime);
-
-	/* Add system time to cpustat. */
-	tmp = cputime_to_cputime64(cputime);
 	if (hardirq_count() - hardirq_offset)
-		cpustat->irq = cputime64_add(cpustat->irq, tmp);
+		target_cputime64 = &cpustat->irq;
 	else if (in_serving_softirq())
-		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+		target_cputime64 = &cpustat->softirq;
 	else
-		cpustat->system = cputime64_add(cpustat->system, tmp);
+		target_cputime64 = &cpustat->system;
 
-	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
-
-	/* Account for system time used */
-	acct_update_integrals(p);
+	__account_system_time(p, cputime, cputime_scaled, target_cputime64);
 }
 
 /*
 * Account for involuntary wait time.
- * @steal: the cpu time spent in involuntary wait
+ * @cputime: the cpu time spent in involuntary wait
 */
 void account_steal_time(cputime_t cputime)
 {
@@ -3635,6 +3813,73 @@ void account_idle_time(cputime_t cputime)
 
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Account a tick to a process and cpustat
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: is the tick from userspace
+ * @rq: the pointer to rq
+ *
+ * Tick demultiplexing follows the order
+ * - pending hardirq update
+ * - pending softirq update
+ * - user_time
+ * - idle_time
+ * - system time
+ *   - check for guest_time
+ *   - else account as system_time
+ *
+ * Check for hardirq is done both for system and user time as there is
+ * no timer going off while we are on hardirq and hence we may never get an
+ * opportunity to update it solely in system time.
+ * p->stime and friends are only updated on system time and not on irq
+ * softirq as those do not count in task exec_runtime any more.
+ */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+						struct rq *rq)
+{
+	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+	cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+
+	if (irqtime_account_hi_update()) {
+		cpustat->irq = cputime64_add(cpustat->irq, tmp);
+	} else if (irqtime_account_si_update()) {
+		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+	} else if (this_cpu_ksoftirqd() == p) {
+		/*
+		 * ksoftirqd time do not get accounted in cpu_softirq_time.
+		 * So, we have to handle it separately here.
+		 * Also, p->stime needs to be updated for ksoftirqd.
+		 */
+		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+					&cpustat->softirq);
+	} else if (user_tick) {
+		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+	} else if (p == rq->idle) {
+		account_idle_time(cputime_one_jiffy);
+	} else if (p->flags & PF_VCPU) { /* System time or guest time */
+		account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+	} else {
+		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+					&cpustat->system);
+	}
+}
+
+static void irqtime_account_idle_ticks(int ticks)
+{
+	int i;
+	struct rq *rq = this_rq();
+
+	for (i = 0; i < ticks; i++)
+		irqtime_account_process_tick(current, 0, rq);
+}
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+static void irqtime_account_idle_ticks(int ticks) {}
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+						struct rq *rq) {}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
 /*
 * Account a single tick of cpu time.
 * @p: the process that the cpu time gets accounted to
@@ -3645,6 +3890,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
 	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
 	struct rq *rq = this_rq();
 
+	if (sched_clock_irqtime) {
+		irqtime_account_process_tick(p, user_tick, rq);
+		return;
+	}
+
 	if (user_tick)
 		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
 	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3670,6 +3920,12 @@ void account_steal_ticks(unsigned long ticks)
 */
 void account_idle_ticks(unsigned long ticks)
 {
+
+	if (sched_clock_irqtime) {
+		irqtime_account_idle_ticks(ticks);
+		return;
+	}
+
 	account_idle_time(jiffies_to_cputime(ticks));
 }
 
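Note: the CONFIG_IRQ_TIME_ACCOUNTING hunks above demultiplex each tick in a fixed priority order (pending hardirq, pending softirq or ksoftirqd, user, idle, guest, then system). The sketch below restates that decision order as plain C; the enum, struct and helper names are invented for illustration, not kernel APIs.

enum tick_class {
	TICK_HARDIRQ,
	TICK_SOFTIRQ,
	TICK_USER,
	TICK_IDLE,
	TICK_GUEST,
	TICK_SYSTEM,
};

struct tick_info {
	int pending_hardirq;	/* models irqtime_account_hi_update() */
	int pending_softirq;	/* models irqtime_account_si_update() */
	int is_ksoftirqd;	/* ksoftirqd is charged as softirq time */
	int user_tick;
	int is_idle_task;
	int is_vcpu;		/* PF_VCPU: guest time */
};

enum tick_class classify_tick(const struct tick_info *t)
{
	if (t->pending_hardirq)
		return TICK_HARDIRQ;
	if (t->pending_softirq || t->is_ksoftirqd)
		return TICK_SOFTIRQ;
	if (t->user_tick)
		return TICK_USER;
	if (t->is_idle_task)
		return TICK_IDLE;
	if (t->is_vcpu)
		return TICK_GUEST;
	return TICK_SYSTEM;
}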
@@ -3763,9 +4019,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3763/* 4019/*
3764 * This function gets called by the timer code, with HZ frequency. 4020 * This function gets called by the timer code, with HZ frequency.
3765 * We call it with interrupts disabled. 4021 * We call it with interrupts disabled.
3766 *
3767 * It also gets called by the fork code, when changing the parent's
3768 * timeslices.
3769 */ 4022 */
3770void scheduler_tick(void) 4023void scheduler_tick(void)
3771{ 4024{
@@ -3885,17 +4138,11 @@ static inline void schedule_debug(struct task_struct *prev)
3885 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4138 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3886 4139
3887 schedstat_inc(this_rq(), sched_count); 4140 schedstat_inc(this_rq(), sched_count);
3888#ifdef CONFIG_SCHEDSTATS
3889 if (unlikely(prev->lock_depth >= 0)) {
3890 schedstat_inc(this_rq(), rq_sched_info.bkl_count);
3891 schedstat_inc(prev, sched_info.bkl_count);
3892 }
3893#endif
3894} 4141}
3895 4142
3896static void put_prev_task(struct rq *rq, struct task_struct *prev) 4143static void put_prev_task(struct rq *rq, struct task_struct *prev)
3897{ 4144{
3898 if (prev->se.on_rq) 4145 if (prev->on_rq || rq->skip_clock_update < 0)
3899 update_rq_clock(rq); 4146 update_rq_clock(rq);
3900 prev->sched_class->put_prev_task(rq, prev); 4147 prev->sched_class->put_prev_task(rq, prev);
3901} 4148}
@@ -3945,9 +4192,6 @@ need_resched:
3945 rcu_note_context_switch(cpu); 4192 rcu_note_context_switch(cpu);
3946 prev = rq->curr; 4193 prev = rq->curr;
3947 4194
3948 release_kernel_lock(prev);
3949need_resched_nonpreemptible:
3950
3951 schedule_debug(prev); 4195 schedule_debug(prev);
3952 4196
3953 if (sched_feat(HRTICK)) 4197 if (sched_feat(HRTICK))
@@ -3960,11 +4204,13 @@ need_resched_nonpreemptible:
3960 if (unlikely(signal_pending_state(prev->state, prev))) { 4204 if (unlikely(signal_pending_state(prev->state, prev))) {
3961 prev->state = TASK_RUNNING; 4205 prev->state = TASK_RUNNING;
3962 } else { 4206 } else {
4207 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4208 prev->on_rq = 0;
4209
3963 /* 4210 /*
3964 * If a worker is going to sleep, notify and 4211 * If a worker went to sleep, notify and ask workqueue
3965 * ask workqueue whether it wants to wake up a 4212 * whether it wants to wake up a task to maintain
3966 * task to maintain concurrency. If so, wake 4213 * concurrency.
3967 * up the task.
3968 */ 4214 */
3969 if (prev->flags & PF_WQ_WORKER) { 4215 if (prev->flags & PF_WQ_WORKER) {
3970 struct task_struct *to_wakeup; 4216 struct task_struct *to_wakeup;
@@ -3973,7 +4219,16 @@ need_resched_nonpreemptible:
3973 if (to_wakeup) 4219 if (to_wakeup)
3974 try_to_wake_up_local(to_wakeup); 4220 try_to_wake_up_local(to_wakeup);
3975 } 4221 }
3976 deactivate_task(rq, prev, DEQUEUE_SLEEP); 4222
4223 /*
4224 * If we are going to sleep and we have plugged IO
4225 * queued, make sure to submit it to avoid deadlocks.
4226 */
4227 if (blk_needs_flush_plug(prev)) {
4228 raw_spin_unlock(&rq->lock);
4229 blk_schedule_flush_plug(prev);
4230 raw_spin_lock(&rq->lock);
4231 }
3977 } 4232 }
3978 switch_count = &prev->nvcsw; 4233 switch_count = &prev->nvcsw;
3979 } 4234 }
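
The hunk above drops rq->lock around blk_schedule_flush_plug() because submitting the plugged block requests may itself block, which is not allowed under a raw spinlock. A minimal standalone C sketch of that drop-and-retake pattern, using pthread mutexes and hypothetical names rather than the kernel API:

#include <pthread.h>

struct runqueue_like { pthread_mutex_t lock; };

static void flush_pending_io(void)
{
	/* stands in for blk_schedule_flush_plug(): submit queued block I/O */
}

static void sleep_prepare(struct runqueue_like *rq, int has_plugged_io)
{
	/* caller holds rq->lock */
	if (has_plugged_io) {
		pthread_mutex_unlock(&rq->lock);
		flush_pending_io();		/* may block; must not run under rq->lock */
		pthread_mutex_lock(&rq->lock);
		/* any state read before the unlock must be re-validated here */
	}
}

The last comment is the important part of the idiom: once the lock has been dropped, nothing derived from it before the call can be trusted without being checked again.
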
@@ -3989,9 +4244,6 @@ need_resched_nonpreemptible:
3989 rq->skip_clock_update = 0; 4244 rq->skip_clock_update = 0;
3990 4245
3991 if (likely(prev != next)) { 4246 if (likely(prev != next)) {
3992 sched_info_switch(prev, next);
3993 perf_event_task_sched_out(prev, next);
3994
3995 rq->nr_switches++; 4247 rq->nr_switches++;
3996 rq->curr = next; 4248 rq->curr = next;
3997 ++*switch_count; 4249 ++*switch_count;
@@ -4010,9 +4262,6 @@ need_resched_nonpreemptible:
4010 4262
4011 post_schedule(rq); 4263 post_schedule(rq);
4012 4264
4013 if (unlikely(reacquire_kernel_lock(prev)))
4014 goto need_resched_nonpreemptible;
4015
4016 preempt_enable_no_resched(); 4265 preempt_enable_no_resched();
4017 if (need_resched()) 4266 if (need_resched())
4018 goto need_resched; 4267 goto need_resched;
@@ -4020,70 +4269,53 @@ need_resched_nonpreemptible:
4020EXPORT_SYMBOL(schedule); 4269EXPORT_SYMBOL(schedule);
4021 4270
4022#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4271#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4023/*
4024 * Look out! "owner" is an entirely speculative pointer
4025 * access and not reliable.
4026 */
4027int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4028{
4029 unsigned int cpu;
4030 struct rq *rq;
4031 4272
4032 if (!sched_feat(OWNER_SPIN)) 4273static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4033 return 0; 4274{
4275 bool ret = false;
4034 4276
4035#ifdef CONFIG_DEBUG_PAGEALLOC 4277 rcu_read_lock();
4036 /* 4278 if (lock->owner != owner)
4037 * Need to access the cpu field knowing that 4279 goto fail;
4038 * DEBUG_PAGEALLOC could have unmapped it if
4039 * the mutex owner just released it and exited.
4040 */
4041 if (probe_kernel_address(&owner->cpu, cpu))
4042 return 0;
4043#else
4044 cpu = owner->cpu;
4045#endif
4046 4280
4047 /* 4281 /*
4048 * Even if the access succeeded (likely case), 4282 * Ensure we emit the owner->on_cpu, dereference _after_ checking
4049 * the cpu field may no longer be valid. 4283 * lock->owner still matches owner, if that fails, owner might
4284 * point to free()d memory, if it still matches, the rcu_read_lock()
4285 * ensures the memory stays valid.
4050 */ 4286 */
4051 if (cpu >= nr_cpumask_bits) 4287 barrier();
4052 return 0;
4053 4288
4054 /* 4289 ret = owner->on_cpu;
4055 * We need to validate that we can do a 4290fail:
4056 * get_cpu() and that we have the percpu area. 4291 rcu_read_unlock();
4057 */
4058 if (!cpu_online(cpu))
4059 return 0;
4060 4292
4061 rq = cpu_rq(cpu); 4293 return ret;
4294}
4062 4295
4063 for (;;) { 4296/*
4064 /* 4297 * Look out! "owner" is an entirely speculative pointer
4065 * Owner changed, break to re-assess state. 4298 * access and not reliable.
4066 */ 4299 */
4067 if (lock->owner != owner) { 4300int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
4068 /* 4301{
4069 * If the lock has switched to a different owner, 4302 if (!sched_feat(OWNER_SPIN))
4070 * we likely have heavy contention. Return 0 to quit 4303 return 0;
4071 * optimistic spinning and not contend further:
4072 */
4073 if (lock->owner)
4074 return 0;
4075 break;
4076 }
4077 4304
4078 /* 4305 while (owner_running(lock, owner)) {
4079 * Is that owner really running on that cpu? 4306 if (need_resched())
4080 */
4081 if (task_thread_info(rq->curr) != owner || need_resched())
4082 return 0; 4307 return 0;
4083 4308
4084 arch_mutex_cpu_relax(); 4309 arch_mutex_cpu_relax();
4085 } 4310 }
4086 4311
4312 /*
4313 * If the owner changed to another task there is likely
4314 * heavy contention, stop spinning.
4315 */
4316 if (lock->owner)
4317 return 0;
4318
4087 return 1; 4319 return 1;
4088} 4320}
4089#endif 4321#endif
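
The rewrite above splits the optimistic-spin path into owner_running(), which re-checks lock->owner before dereferencing it (under rcu_read_lock(), with barrier() forcing that order), and a much simpler mutex_spin_on_owner() loop. A rough userspace sketch of that shape follows, with C11 atomics standing in for the kernel's barrier() and RCU; every name is hypothetical and the use-after-free protection RCU provides is not reproduced here:

/*
 * Spin only while the lock still records the same owner and that owner
 * is on a CPU. In the kernel, RCU keeps the owner's task_struct from
 * being freed while it is inspected; this sketch omits that.
 */
#include <stdatomic.h>
#include <stdbool.h>

struct owner_info {
	atomic_bool on_cpu;			/* is the owner currently running? */
};

struct spinnable_lock {
	_Atomic(struct owner_info *) owner;	/* NULL when unlocked */
};

static bool owner_running(struct spinnable_lock *lock, struct owner_info *owner)
{
	/* Re-check the owner first; if it changed, 'owner' may be stale. */
	if (atomic_load_explicit(&lock->owner, memory_order_acquire) != owner)
		return false;

	return atomic_load_explicit(&owner->on_cpu, memory_order_relaxed);
}

static int spin_on_owner(struct spinnable_lock *lock, struct owner_info *owner)
{
	while (owner_running(lock, owner))
		;	/* the kernel also checks need_resched() and relaxes the cpu */

	/*
	 * Owner changed to another task: likely heavy contention,
	 * so give up optimistic spinning (mirrors the tail of the new code).
	 */
	if (atomic_load_explicit(&lock->owner, memory_order_acquire))
		return 0;

	return 1;	/* lock looks free: worth trying to acquire it */
}

The design point is that the owner pointer is trusted only for as long as lock->owner still matches it; the moment it changes, the spinner either quits or attempts the acquisition itself.
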
@@ -4213,6 +4445,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
4213{ 4445{
4214 __wake_up_common(q, mode, 1, 0, key); 4446 __wake_up_common(q, mode, 1, 0, key);
4215} 4447}
4448EXPORT_SYMBOL_GPL(__wake_up_locked_key);
4216 4449
4217/** 4450/**
4218 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 4451 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
@@ -4542,19 +4775,18 @@ EXPORT_SYMBOL(sleep_on_timeout);
4542 */ 4775 */
4543void rt_mutex_setprio(struct task_struct *p, int prio) 4776void rt_mutex_setprio(struct task_struct *p, int prio)
4544{ 4777{
4545 unsigned long flags;
4546 int oldprio, on_rq, running; 4778 int oldprio, on_rq, running;
4547 struct rq *rq; 4779 struct rq *rq;
4548 const struct sched_class *prev_class; 4780 const struct sched_class *prev_class;
4549 4781
4550 BUG_ON(prio < 0 || prio > MAX_PRIO); 4782 BUG_ON(prio < 0 || prio > MAX_PRIO);
4551 4783
4552 rq = task_rq_lock(p, &flags); 4784 rq = __task_rq_lock(p);
4553 4785
4554 trace_sched_pi_setprio(p, prio); 4786 trace_sched_pi_setprio(p, prio);
4555 oldprio = p->prio; 4787 oldprio = p->prio;
4556 prev_class = p->sched_class; 4788 prev_class = p->sched_class;
4557 on_rq = p->se.on_rq; 4789 on_rq = p->on_rq;
4558 running = task_current(rq, p); 4790 running = task_current(rq, p);
4559 if (on_rq) 4791 if (on_rq)
4560 dequeue_task(rq, p, 0); 4792 dequeue_task(rq, p, 0);
@@ -4570,12 +4802,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4570 4802
4571 if (running) 4803 if (running)
4572 p->sched_class->set_curr_task(rq); 4804 p->sched_class->set_curr_task(rq);
4573 if (on_rq) { 4805 if (on_rq)
4574 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4806 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4575 4807
4576 check_class_changed(rq, p, prev_class, oldprio, running); 4808 check_class_changed(rq, p, prev_class, oldprio);
4577 } 4809 __task_rq_unlock(rq);
4578 task_rq_unlock(rq, &flags);
4579} 4810}
4580 4811
4581#endif 4812#endif
@@ -4603,7 +4834,7 @@ void set_user_nice(struct task_struct *p, long nice)
4603 p->static_prio = NICE_TO_PRIO(nice); 4834 p->static_prio = NICE_TO_PRIO(nice);
4604 goto out_unlock; 4835 goto out_unlock;
4605 } 4836 }
4606 on_rq = p->se.on_rq; 4837 on_rq = p->on_rq;
4607 if (on_rq) 4838 if (on_rq)
4608 dequeue_task(rq, p, 0); 4839 dequeue_task(rq, p, 0);
4609 4840
@@ -4623,7 +4854,7 @@ void set_user_nice(struct task_struct *p, long nice)
4623 resched_task(rq->curr); 4854 resched_task(rq->curr);
4624 } 4855 }
4625out_unlock: 4856out_unlock:
4626 task_rq_unlock(rq, &flags); 4857 task_rq_unlock(rq, p, &flags);
4627} 4858}
4628EXPORT_SYMBOL(set_user_nice); 4859EXPORT_SYMBOL(set_user_nice);
4629 4860
@@ -4737,8 +4968,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
4737static void 4968static void
4738__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 4969__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4739{ 4970{
4740 BUG_ON(p->se.on_rq);
4741
4742 p->policy = policy; 4971 p->policy = policy;
4743 p->rt_priority = prio; 4972 p->rt_priority = prio;
4744 p->normal_prio = normal_prio(p); 4973 p->normal_prio = normal_prio(p);
@@ -4761,8 +4990,11 @@ static bool check_same_owner(struct task_struct *p)
4761 4990
4762 rcu_read_lock(); 4991 rcu_read_lock();
4763 pcred = __task_cred(p); 4992 pcred = __task_cred(p);
4764 match = (cred->euid == pcred->euid || 4993 if (cred->user->user_ns == pcred->user->user_ns)
4765 cred->euid == pcred->uid); 4994 match = (cred->euid == pcred->euid ||
4995 cred->euid == pcred->uid);
4996 else
4997 match = false;
4766 rcu_read_unlock(); 4998 rcu_read_unlock();
4767 return match; 4999 return match;
4768} 5000}
@@ -4822,12 +5054,15 @@ recheck:
4822 param->sched_priority > rlim_rtprio) 5054 param->sched_priority > rlim_rtprio)
4823 return -EPERM; 5055 return -EPERM;
4824 } 5056 }
5057
4825 /* 5058 /*
4826 * Like positive nice levels, dont allow tasks to 5059 * Treat SCHED_IDLE as nice 20. Only allow a switch to
4827 * move out of SCHED_IDLE either: 5060 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
4828 */ 5061 */
4829 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) 5062 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
4830 return -EPERM; 5063 if (!can_nice(p, TASK_NICE(p)))
5064 return -EPERM;
5065 }
4831 5066
4832 /* can't change other user's priorities */ 5067 /* can't change other user's priorities */
4833 if (!check_same_owner(p)) 5068 if (!check_same_owner(p))
@@ -4847,21 +5082,29 @@ recheck:
4847 /* 5082 /*
4848 * make sure no PI-waiters arrive (or leave) while we are 5083 * make sure no PI-waiters arrive (or leave) while we are
4849 * changing the priority of the task: 5084 * changing the priority of the task:
4850 */ 5085 *
4851 raw_spin_lock_irqsave(&p->pi_lock, flags); 5086 * To be able to change p->policy safely, the appropriate
4852 /*
4853 * To be able to change p->policy safely, the apropriate
4854 * runqueue lock must be held. 5087 * runqueue lock must be held.
4855 */ 5088 */
4856 rq = __task_rq_lock(p); 5089 rq = task_rq_lock(p, &flags);
4857 5090
4858 /* 5091 /*
4859 * Changing the policy of the stop threads its a very bad idea 5092 * Changing the policy of the stop threads its a very bad idea
4860 */ 5093 */
4861 if (p == rq->stop) { 5094 if (p == rq->stop) {
5095 task_rq_unlock(rq, p, &flags);
5096 return -EINVAL;
5097 }
5098
5099 /*
5100 * If not changing anything there's no need to proceed further:
5101 */
5102 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
5103 param->sched_priority == p->rt_priority))) {
5104
4862 __task_rq_unlock(rq); 5105 __task_rq_unlock(rq);
4863 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5106 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4864 return -EINVAL; 5107 return 0;
4865 } 5108 }
4866 5109
4867#ifdef CONFIG_RT_GROUP_SCHED 5110#ifdef CONFIG_RT_GROUP_SCHED
@@ -4873,8 +5116,7 @@ recheck:
4873 if (rt_bandwidth_enabled() && rt_policy(policy) && 5116 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4874 task_group(p)->rt_bandwidth.rt_runtime == 0 && 5117 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4875 !task_group_is_autogroup(task_group(p))) { 5118 !task_group_is_autogroup(task_group(p))) {
4876 __task_rq_unlock(rq); 5119 task_rq_unlock(rq, p, &flags);
4877 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4878 return -EPERM; 5120 return -EPERM;
4879 } 5121 }
4880 } 5122 }
@@ -4883,11 +5125,10 @@ recheck:
4883 /* recheck policy now with rq lock held */ 5125 /* recheck policy now with rq lock held */
4884 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 5126 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4885 policy = oldpolicy = -1; 5127 policy = oldpolicy = -1;
4886 __task_rq_unlock(rq); 5128 task_rq_unlock(rq, p, &flags);
4887 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4888 goto recheck; 5129 goto recheck;
4889 } 5130 }
4890 on_rq = p->se.on_rq; 5131 on_rq = p->on_rq;
4891 running = task_current(rq, p); 5132 running = task_current(rq, p);
4892 if (on_rq) 5133 if (on_rq)
4893 deactivate_task(rq, p, 0); 5134 deactivate_task(rq, p, 0);
@@ -4902,13 +5143,11 @@ recheck:
4902 5143
4903 if (running) 5144 if (running)
4904 p->sched_class->set_curr_task(rq); 5145 p->sched_class->set_curr_task(rq);
4905 if (on_rq) { 5146 if (on_rq)
4906 activate_task(rq, p, 0); 5147 activate_task(rq, p, 0);
4907 5148
4908 check_class_changed(rq, p, prev_class, oldprio, running); 5149 check_class_changed(rq, p, prev_class, oldprio);
4909 } 5150 task_rq_unlock(rq, p, &flags);
4910 __task_rq_unlock(rq);
4911 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4912 5151
4913 rt_mutex_adjust_pi(p); 5152 rt_mutex_adjust_pi(p);
4914 5153
@@ -5088,7 +5327,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5088 goto out_free_cpus_allowed; 5327 goto out_free_cpus_allowed;
5089 } 5328 }
5090 retval = -EPERM; 5329 retval = -EPERM;
5091 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5330 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
5092 goto out_unlock; 5331 goto out_unlock;
5093 5332
5094 retval = security_task_setscheduler(p); 5333 retval = security_task_setscheduler(p);
@@ -5159,7 +5398,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5159{ 5398{
5160 struct task_struct *p; 5399 struct task_struct *p;
5161 unsigned long flags; 5400 unsigned long flags;
5162 struct rq *rq;
5163 int retval; 5401 int retval;
5164 5402
5165 get_online_cpus(); 5403 get_online_cpus();
@@ -5174,9 +5412,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5174 if (retval) 5412 if (retval)
5175 goto out_unlock; 5413 goto out_unlock;
5176 5414
5177 rq = task_rq_lock(p, &flags); 5415 raw_spin_lock_irqsave(&p->pi_lock, flags);
5178 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 5416 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5179 task_rq_unlock(rq, &flags); 5417 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5180 5418
5181out_unlock: 5419out_unlock:
5182 rcu_read_unlock(); 5420 rcu_read_unlock();
@@ -5323,6 +5561,67 @@ void __sched yield(void)
5323} 5561}
5324EXPORT_SYMBOL(yield); 5562EXPORT_SYMBOL(yield);
5325 5563
5564/**
5565 * yield_to - yield the current processor to another thread in
5566 * your thread group, or accelerate that thread toward the
5567 * processor it's on.
5568 * @p: target task
5569 * @preempt: whether task preemption is allowed or not
5570 *
5571 * It's the caller's job to ensure that the target task struct
5572 * can't go away on us before we can do any checks.
5573 *
5574 * Returns true if we indeed boosted the target task.
5575 */
5576bool __sched yield_to(struct task_struct *p, bool preempt)
5577{
5578 struct task_struct *curr = current;
5579 struct rq *rq, *p_rq;
5580 unsigned long flags;
5581 bool yielded = 0;
5582
5583 local_irq_save(flags);
5584 rq = this_rq();
5585
5586again:
5587 p_rq = task_rq(p);
5588 double_rq_lock(rq, p_rq);
5589 while (task_rq(p) != p_rq) {
5590 double_rq_unlock(rq, p_rq);
5591 goto again;
5592 }
5593
5594 if (!curr->sched_class->yield_to_task)
5595 goto out;
5596
5597 if (curr->sched_class != p->sched_class)
5598 goto out;
5599
5600 if (task_running(p_rq, p) || p->state)
5601 goto out;
5602
5603 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5604 if (yielded) {
5605 schedstat_inc(rq, yld_count);
5606 /*
5607 * Make p's CPU reschedule; pick_next_entity takes care of
5608 * fairness.
5609 */
5610 if (preempt && rq != p_rq)
5611 resched_task(p_rq->curr);
5612 }
5613
5614out:
5615 double_rq_unlock(rq, p_rq);
5616 local_irq_restore(flags);
5617
5618 if (yielded)
5619 schedule();
5620
5621 return yielded;
5622}
5623EXPORT_SYMBOL_GPL(yield_to);
5624
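
The again: loop at the top of yield_to() reads task_rq(p), takes both runqueue locks with double_rq_lock(), and then re-checks task_rq(p), retrying if the task migrated in between; only once both locks are held is p pinned to p_rq. A small userspace sketch of that lock-and-revalidate idiom, with pthread mutexes and hypothetical types (the kernel's double_rq_lock() takes the two locks in a fixed order to avoid ABBA deadlock, mimicked here by address ordering):

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>

struct runqueue { pthread_mutex_t lock; };
struct task { struct runqueue *_Atomic rq; };	/* may change under migration */

static void lock_two(struct runqueue *a, struct runqueue *b)
{
	if (a == b) {
		pthread_mutex_lock(&a->lock);
	} else if ((uintptr_t)a < (uintptr_t)b) {
		pthread_mutex_lock(&a->lock);
		pthread_mutex_lock(&b->lock);
	} else {
		pthread_mutex_lock(&b->lock);
		pthread_mutex_lock(&a->lock);
	}
}

static void unlock_two(struct runqueue *a, struct runqueue *b)
{
	pthread_mutex_unlock(&a->lock);
	if (a != b)
		pthread_mutex_unlock(&b->lock);
}

/* Returns with this_rq and the task's runqueue locked; *the* runqueue is
 * then stable because a migration would need the lock we now hold. */
static struct runqueue *lock_task_rq(struct runqueue *this_rq, struct task *p)
{
	struct runqueue *p_rq;

	for (;;) {
		p_rq = atomic_load(&p->rq);
		lock_two(this_rq, p_rq);
		if (atomic_load(&p->rq) == p_rq)
			return p_rq;
		unlock_two(this_rq, p_rq);	/* task moved meanwhile: retry */
	}
}
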
5326/* 5625/*
5327 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 5626 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
5328 * that process accounting knows that this is a task in IO wait state. 5627 * that process accounting knows that this is a task in IO wait state.
@@ -5333,6 +5632,7 @@ void __sched io_schedule(void)
5333 5632
5334 delayacct_blkio_start(); 5633 delayacct_blkio_start();
5335 atomic_inc(&rq->nr_iowait); 5634 atomic_inc(&rq->nr_iowait);
5635 blk_flush_plug(current);
5336 current->in_iowait = 1; 5636 current->in_iowait = 1;
5337 schedule(); 5637 schedule();
5338 current->in_iowait = 0; 5638 current->in_iowait = 0;
@@ -5348,6 +5648,7 @@ long __sched io_schedule_timeout(long timeout)
5348 5648
5349 delayacct_blkio_start(); 5649 delayacct_blkio_start();
5350 atomic_inc(&rq->nr_iowait); 5650 atomic_inc(&rq->nr_iowait);
5651 blk_flush_plug(current);
5351 current->in_iowait = 1; 5652 current->in_iowait = 1;
5352 ret = schedule_timeout(timeout); 5653 ret = schedule_timeout(timeout);
5353 current->in_iowait = 0; 5654 current->in_iowait = 0;
@@ -5438,7 +5739,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5438 5739
5439 rq = task_rq_lock(p, &flags); 5740 rq = task_rq_lock(p, &flags);
5440 time_slice = p->sched_class->get_rr_interval(rq, p); 5741 time_slice = p->sched_class->get_rr_interval(rq, p);
5441 task_rq_unlock(rq, &flags); 5742 task_rq_unlock(rq, p, &flags);
5442 5743
5443 rcu_read_unlock(); 5744 rcu_read_unlock();
5444 jiffies_to_timespec(time_slice, &t); 5745 jiffies_to_timespec(time_slice, &t);
@@ -5496,7 +5797,7 @@ void show_state_filter(unsigned long state_filter)
5496 do_each_thread(g, p) { 5797 do_each_thread(g, p) {
5497 /* 5798 /*
5498 * reset the NMI-timeout, listing all files on a slow 5799 * reset the NMI-timeout, listing all files on a slow
5499 * console might take alot of time: 5800 * console might take a lot of time:
5500 */ 5801 */
5501 touch_nmi_watchdog(); 5802 touch_nmi_watchdog();
5502 if (!state_filter || (p->state & state_filter)) 5803 if (!state_filter || (p->state & state_filter))
@@ -5556,22 +5857,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5556 rcu_read_unlock(); 5857 rcu_read_unlock();
5557 5858
5558 rq->curr = rq->idle = idle; 5859 rq->curr = rq->idle = idle;
5559#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5860#if defined(CONFIG_SMP)
5560 idle->oncpu = 1; 5861 idle->on_cpu = 1;
5561#endif 5862#endif
5562 raw_spin_unlock_irqrestore(&rq->lock, flags); 5863 raw_spin_unlock_irqrestore(&rq->lock, flags);
5563 5864
5564 /* Set the preempt count _outside_ the spinlocks! */ 5865 /* Set the preempt count _outside_ the spinlocks! */
5565#if defined(CONFIG_PREEMPT)
5566 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5567#else
5568 task_thread_info(idle)->preempt_count = 0; 5866 task_thread_info(idle)->preempt_count = 0;
5569#endif 5867
5570 /* 5868 /*
5571 * The idle tasks have their own, simple scheduling class: 5869 * The idle tasks have their own, simple scheduling class:
5572 */ 5870 */
5573 idle->sched_class = &idle_sched_class; 5871 idle->sched_class = &idle_sched_class;
5574 ftrace_graph_init_task(idle); 5872 ftrace_graph_init_idle_task(idle, cpu);
5575} 5873}
5576 5874
5577/* 5875/*
@@ -5661,26 +5959,17 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5661 unsigned int dest_cpu; 5959 unsigned int dest_cpu;
5662 int ret = 0; 5960 int ret = 0;
5663 5961
5664 /*
5665 * Serialize against TASK_WAKING so that ttwu() and wunt() can
5666 * drop the rq->lock and still rely on ->cpus_allowed.
5667 */
5668again:
5669 while (task_is_waking(p))
5670 cpu_relax();
5671 rq = task_rq_lock(p, &flags); 5962 rq = task_rq_lock(p, &flags);
5672 if (task_is_waking(p)) { 5963
5673 task_rq_unlock(rq, &flags); 5964 if (cpumask_equal(&p->cpus_allowed, new_mask))
5674 goto again; 5965 goto out;
5675 }
5676 5966
5677 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5967 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5678 ret = -EINVAL; 5968 ret = -EINVAL;
5679 goto out; 5969 goto out;
5680 } 5970 }
5681 5971
5682 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 5972 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
5683 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5684 ret = -EINVAL; 5973 ret = -EINVAL;
5685 goto out; 5974 goto out;
5686 } 5975 }
@@ -5697,16 +5986,16 @@ again:
5697 goto out; 5986 goto out;
5698 5987
5699 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5988 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5700 if (migrate_task(p, rq)) { 5989 if (p->on_rq) {
5701 struct migration_arg arg = { p, dest_cpu }; 5990 struct migration_arg arg = { p, dest_cpu };
5702 /* Need help from migration thread: drop lock and wait. */ 5991 /* Need help from migration thread: drop lock and wait. */
5703 task_rq_unlock(rq, &flags); 5992 task_rq_unlock(rq, p, &flags);
5704 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 5993 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5705 tlb_migrate_finish(p->mm); 5994 tlb_migrate_finish(p->mm);
5706 return 0; 5995 return 0;
5707 } 5996 }
5708out: 5997out:
5709 task_rq_unlock(rq, &flags); 5998 task_rq_unlock(rq, p, &flags);
5710 5999
5711 return ret; 6000 return ret;
5712} 6001}
@@ -5734,6 +6023,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5734 rq_src = cpu_rq(src_cpu); 6023 rq_src = cpu_rq(src_cpu);
5735 rq_dest = cpu_rq(dest_cpu); 6024 rq_dest = cpu_rq(dest_cpu);
5736 6025
6026 raw_spin_lock(&p->pi_lock);
5737 double_rq_lock(rq_src, rq_dest); 6027 double_rq_lock(rq_src, rq_dest);
5738 /* Already moved. */ 6028 /* Already moved. */
5739 if (task_cpu(p) != src_cpu) 6029 if (task_cpu(p) != src_cpu)
@@ -5746,7 +6036,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5746 * If we're not on a rq, the next wake-up will ensure we're 6036 * If we're not on a rq, the next wake-up will ensure we're
5747 * placed properly. 6037 * placed properly.
5748 */ 6038 */
5749 if (p->se.on_rq) { 6039 if (p->on_rq) {
5750 deactivate_task(rq_src, p, 0); 6040 deactivate_task(rq_src, p, 0);
5751 set_task_cpu(p, dest_cpu); 6041 set_task_cpu(p, dest_cpu);
5752 activate_task(rq_dest, p, 0); 6042 activate_task(rq_dest, p, 0);
@@ -5756,6 +6046,7 @@ done:
5756 ret = 1; 6046 ret = 1;
5757fail: 6047fail:
5758 double_rq_unlock(rq_src, rq_dest); 6048 double_rq_unlock(rq_src, rq_dest);
6049 raw_spin_unlock(&p->pi_lock);
5759 return ret; 6050 return ret;
5760} 6051}
5761 6052
@@ -6096,6 +6387,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6096 6387
6097#ifdef CONFIG_HOTPLUG_CPU 6388#ifdef CONFIG_HOTPLUG_CPU
6098 case CPU_DYING: 6389 case CPU_DYING:
6390 sched_ttwu_pending();
6099 /* Update our root-domain */ 6391 /* Update our root-domain */
6100 raw_spin_lock_irqsave(&rq->lock, flags); 6392 raw_spin_lock_irqsave(&rq->lock, flags);
6101 if (rq->rd) { 6393 if (rq->rd) {
@@ -6111,6 +6403,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6111 break; 6403 break;
6112#endif 6404#endif
6113 } 6405 }
6406
6407 update_max_interval();
6408
6114 return NOTIFY_OK; 6409 return NOTIFY_OK;
6115} 6410}
6116 6411
@@ -6171,6 +6466,8 @@ early_initcall(migration_init);
6171 6466
6172#ifdef CONFIG_SMP 6467#ifdef CONFIG_SMP
6173 6468
6469static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
6470
6174#ifdef CONFIG_SCHED_DEBUG 6471#ifdef CONFIG_SCHED_DEBUG
6175 6472
6176static __read_mostly int sched_domain_debug_enabled; 6473static __read_mostly int sched_domain_debug_enabled;
@@ -6245,7 +6542,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6245 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 6542 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6246 6543
6247 printk(KERN_CONT " %s", str); 6544 printk(KERN_CONT " %s", str);
6248 if (group->cpu_power != SCHED_LOAD_SCALE) { 6545 if (group->cpu_power != SCHED_POWER_SCALE) {
6249 printk(KERN_CONT " (cpu_power = %d)", 6546 printk(KERN_CONT " (cpu_power = %d)",
6250 group->cpu_power); 6547 group->cpu_power);
6251 } 6548 }
@@ -6266,7 +6563,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6266 6563
6267static void sched_domain_debug(struct sched_domain *sd, int cpu) 6564static void sched_domain_debug(struct sched_domain *sd, int cpu)
6268{ 6565{
6269 cpumask_var_t groupmask;
6270 int level = 0; 6566 int level = 0;
6271 6567
6272 if (!sched_domain_debug_enabled) 6568 if (!sched_domain_debug_enabled)
@@ -6279,20 +6575,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6279 6575
6280 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6576 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6281 6577
6282 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6283 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6284 return;
6285 }
6286
6287 for (;;) { 6578 for (;;) {
6288 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 6579 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
6289 break; 6580 break;
6290 level++; 6581 level++;
6291 sd = sd->parent; 6582 sd = sd->parent;
6292 if (!sd) 6583 if (!sd)
6293 break; 6584 break;
6294 } 6585 }
6295 free_cpumask_var(groupmask);
6296} 6586}
6297#else /* !CONFIG_SCHED_DEBUG */ 6587#else /* !CONFIG_SCHED_DEBUG */
6298# define sched_domain_debug(sd, cpu) do { } while (0) 6588# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6349,12 +6639,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6349 return 1; 6639 return 1;
6350} 6640}
6351 6641
6352static void free_rootdomain(struct root_domain *rd) 6642static void free_rootdomain(struct rcu_head *rcu)
6353{ 6643{
6354 synchronize_sched(); 6644 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
6355 6645
6356 cpupri_cleanup(&rd->cpupri); 6646 cpupri_cleanup(&rd->cpupri);
6357
6358 free_cpumask_var(rd->rto_mask); 6647 free_cpumask_var(rd->rto_mask);
6359 free_cpumask_var(rd->online); 6648 free_cpumask_var(rd->online);
6360 free_cpumask_var(rd->span); 6649 free_cpumask_var(rd->span);
@@ -6395,7 +6684,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6395 raw_spin_unlock_irqrestore(&rq->lock, flags); 6684 raw_spin_unlock_irqrestore(&rq->lock, flags);
6396 6685
6397 if (old_rd) 6686 if (old_rd)
6398 free_rootdomain(old_rd); 6687 call_rcu_sched(&old_rd->rcu, free_rootdomain);
6399} 6688}
6400 6689
6401static int init_rootdomain(struct root_domain *rd) 6690static int init_rootdomain(struct root_domain *rd)
@@ -6446,6 +6735,25 @@ static struct root_domain *alloc_rootdomain(void)
6446 return rd; 6735 return rd;
6447} 6736}
6448 6737
6738static void free_sched_domain(struct rcu_head *rcu)
6739{
6740 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
6741 if (atomic_dec_and_test(&sd->groups->ref))
6742 kfree(sd->groups);
6743 kfree(sd);
6744}
6745
6746static void destroy_sched_domain(struct sched_domain *sd, int cpu)
6747{
6748 call_rcu(&sd->rcu, free_sched_domain);
6749}
6750
6751static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6752{
6753 for (; sd; sd = sd->parent)
6754 destroy_sched_domain(sd, cpu);
6755}
6756
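
free_rootdomain() is now an RCU callback (queued with call_rcu_sched() in rq_attach_root()) and the new free_sched_domain() follows the same pattern: an rcu_head is embedded in the object, and the callback recovers the enclosing structure with container_of() once the grace period has elapsed, which is why the old synchronize_sched() call could be dropped. A stripped-down plain-C illustration of that embed-and-recover pattern (the *_like types are stand-ins, not kernel definitions):

#include <stddef.h>
#include <stdlib.h>

struct rcu_head_like {
	void (*func)(struct rcu_head_like *head);
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct root_domain_like {
	int refcount;
	struct rcu_head_like rcu;	/* embedded callback node */
};

static void free_rootdomain_cb(struct rcu_head_like *rcu)
{
	struct root_domain_like *rd =
		container_of(rcu, struct root_domain_like, rcu);

	free(rd);	/* runs only after all pre-existing readers are done */
}

In the kernel, call_rcu()/call_rcu_sched() queue the callback to run after a grace period, so lockless readers that still hold a pointer to the old root_domain or sched_domain finish before the memory is returned.
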
6449/* 6757/*
6450 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6758 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6451 * hold the hotplug lock. 6759 * hold the hotplug lock.
@@ -6456,9 +6764,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6456 struct rq *rq = cpu_rq(cpu); 6764 struct rq *rq = cpu_rq(cpu);
6457 struct sched_domain *tmp; 6765 struct sched_domain *tmp;
6458 6766
6459 for (tmp = sd; tmp; tmp = tmp->parent)
6460 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6461
6462 /* Remove the sched domains which do not contribute to scheduling. */ 6767 /* Remove the sched domains which do not contribute to scheduling. */
6463 for (tmp = sd; tmp; ) { 6768 for (tmp = sd; tmp; ) {
6464 struct sched_domain *parent = tmp->parent; 6769 struct sched_domain *parent = tmp->parent;
@@ -6469,12 +6774,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6469 tmp->parent = parent->parent; 6774 tmp->parent = parent->parent;
6470 if (parent->parent) 6775 if (parent->parent)
6471 parent->parent->child = tmp; 6776 parent->parent->child = tmp;
6777 destroy_sched_domain(parent, cpu);
6472 } else 6778 } else
6473 tmp = tmp->parent; 6779 tmp = tmp->parent;
6474 } 6780 }
6475 6781
6476 if (sd && sd_degenerate(sd)) { 6782 if (sd && sd_degenerate(sd)) {
6783 tmp = sd;
6477 sd = sd->parent; 6784 sd = sd->parent;
6785 destroy_sched_domain(tmp, cpu);
6478 if (sd) 6786 if (sd)
6479 sd->child = NULL; 6787 sd->child = NULL;
6480 } 6788 }
@@ -6482,7 +6790,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6482 sched_domain_debug(sd, cpu); 6790 sched_domain_debug(sd, cpu);
6483 6791
6484 rq_attach_root(rq, rd); 6792 rq_attach_root(rq, rd);
6793 tmp = rq->sd;
6485 rcu_assign_pointer(rq->sd, sd); 6794 rcu_assign_pointer(rq->sd, sd);
6795 destroy_sched_domains(tmp, cpu);
6486} 6796}
6487 6797
6488/* cpus with isolated domains */ 6798/* cpus with isolated domains */
@@ -6498,56 +6808,6 @@ static int __init isolated_cpu_setup(char *str)
6498 6808
6499__setup("isolcpus=", isolated_cpu_setup); 6809__setup("isolcpus=", isolated_cpu_setup);
6500 6810
6501/*
6502 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6503 * to a function which identifies what group(along with sched group) a CPU
6504 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6505 * (due to the fact that we keep track of groups covered with a struct cpumask).
6506 *
6507 * init_sched_build_groups will build a circular linked list of the groups
6508 * covered by the given span, and will set each group's ->cpumask correctly,
6509 * and ->cpu_power to 0.
6510 */
6511static void
6512init_sched_build_groups(const struct cpumask *span,
6513 const struct cpumask *cpu_map,
6514 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6515 struct sched_group **sg,
6516 struct cpumask *tmpmask),
6517 struct cpumask *covered, struct cpumask *tmpmask)
6518{
6519 struct sched_group *first = NULL, *last = NULL;
6520 int i;
6521
6522 cpumask_clear(covered);
6523
6524 for_each_cpu(i, span) {
6525 struct sched_group *sg;
6526 int group = group_fn(i, cpu_map, &sg, tmpmask);
6527 int j;
6528
6529 if (cpumask_test_cpu(i, covered))
6530 continue;
6531
6532 cpumask_clear(sched_group_cpus(sg));
6533 sg->cpu_power = 0;
6534
6535 for_each_cpu(j, span) {
6536 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6537 continue;
6538
6539 cpumask_set_cpu(j, covered);
6540 cpumask_set_cpu(j, sched_group_cpus(sg));
6541 }
6542 if (!first)
6543 first = sg;
6544 if (last)
6545 last->next = sg;
6546 last = sg;
6547 }
6548 last->next = first;
6549}
6550
6551#define SD_NODES_PER_DOMAIN 16 6811#define SD_NODES_PER_DOMAIN 16
6552 6812
6553#ifdef CONFIG_NUMA 6813#ifdef CONFIG_NUMA
@@ -6564,7 +6824,7 @@ init_sched_build_groups(const struct cpumask *span,
6564 */ 6824 */
6565static int find_next_best_node(int node, nodemask_t *used_nodes) 6825static int find_next_best_node(int node, nodemask_t *used_nodes)
6566{ 6826{
6567 int i, n, val, min_val, best_node = 0; 6827 int i, n, val, min_val, best_node = -1;
6568 6828
6569 min_val = INT_MAX; 6829 min_val = INT_MAX;
6570 6830
@@ -6588,7 +6848,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
6588 } 6848 }
6589 } 6849 }
6590 6850
6591 node_set(best_node, *used_nodes); 6851 if (best_node != -1)
6852 node_set(best_node, *used_nodes);
6592 return best_node; 6853 return best_node;
6593} 6854}
6594 6855
@@ -6614,315 +6875,130 @@ static void sched_domain_node_span(int node, struct cpumask *span)
6614 6875
6615 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 6876 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6616 int next_node = find_next_best_node(node, &used_nodes); 6877 int next_node = find_next_best_node(node, &used_nodes);
6617 6878 if (next_node < 0)
6879 break;
6618 cpumask_or(span, span, cpumask_of_node(next_node)); 6880 cpumask_or(span, span, cpumask_of_node(next_node));
6619 } 6881 }
6620} 6882}
6883
6884static const struct cpumask *cpu_node_mask(int cpu)
6885{
6886 lockdep_assert_held(&sched_domains_mutex);
6887
6888 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
6889
6890 return sched_domains_tmpmask;
6891}
6892
6893static const struct cpumask *cpu_allnodes_mask(int cpu)
6894{
6895 return cpu_possible_mask;
6896}
6621#endif /* CONFIG_NUMA */ 6897#endif /* CONFIG_NUMA */
6622 6898
6623int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6899static const struct cpumask *cpu_cpu_mask(int cpu)
6900{
6901 return cpumask_of_node(cpu_to_node(cpu));
6902}
6624 6903
6625/* 6904int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6626 * The cpus mask in sched_group and sched_domain hangs off the end.
6627 *
6628 * ( See the the comments in include/linux/sched.h:struct sched_group
6629 * and struct sched_domain. )
6630 */
6631struct static_sched_group {
6632 struct sched_group sg;
6633 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
6634};
6635 6905
6636struct static_sched_domain { 6906struct sd_data {
6637 struct sched_domain sd; 6907 struct sched_domain **__percpu sd;
6638 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 6908 struct sched_group **__percpu sg;
6639}; 6909};
6640 6910
6641struct s_data { 6911struct s_data {
6642#ifdef CONFIG_NUMA 6912 struct sched_domain ** __percpu sd;
6643 int sd_allnodes;
6644 cpumask_var_t domainspan;
6645 cpumask_var_t covered;
6646 cpumask_var_t notcovered;
6647#endif
6648 cpumask_var_t nodemask;
6649 cpumask_var_t this_sibling_map;
6650 cpumask_var_t this_core_map;
6651 cpumask_var_t this_book_map;
6652 cpumask_var_t send_covered;
6653 cpumask_var_t tmpmask;
6654 struct sched_group **sched_group_nodes;
6655 struct root_domain *rd; 6913 struct root_domain *rd;
6656}; 6914};
6657 6915
6658enum s_alloc { 6916enum s_alloc {
6659 sa_sched_groups = 0,
6660 sa_rootdomain, 6917 sa_rootdomain,
6661 sa_tmpmask, 6918 sa_sd,
6662 sa_send_covered, 6919 sa_sd_storage,
6663 sa_this_book_map,
6664 sa_this_core_map,
6665 sa_this_sibling_map,
6666 sa_nodemask,
6667 sa_sched_group_nodes,
6668#ifdef CONFIG_NUMA
6669 sa_notcovered,
6670 sa_covered,
6671 sa_domainspan,
6672#endif
6673 sa_none, 6920 sa_none,
6674}; 6921};
6675 6922
6676/* 6923struct sched_domain_topology_level;
6677 * SMT sched-domains:
6678 */
6679#ifdef CONFIG_SCHED_SMT
6680static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
6681static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
6682 6924
6683static int 6925typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
6684cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 6926typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
6685 struct sched_group **sg, struct cpumask *unused)
6686{
6687 if (sg)
6688 *sg = &per_cpu(sched_groups, cpu).sg;
6689 return cpu;
6690}
6691#endif /* CONFIG_SCHED_SMT */
6692 6927
6693/* 6928struct sched_domain_topology_level {
6694 * multi-core sched-domains: 6929 sched_domain_init_f init;
6695 */ 6930 sched_domain_mask_f mask;
6696#ifdef CONFIG_SCHED_MC 6931 struct sd_data data;
6697static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 6932};
6698static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6699
6700static int
6701cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6702 struct sched_group **sg, struct cpumask *mask)
6703{
6704 int group;
6705#ifdef CONFIG_SCHED_SMT
6706 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6707 group = cpumask_first(mask);
6708#else
6709 group = cpu;
6710#endif
6711 if (sg)
6712 *sg = &per_cpu(sched_group_core, group).sg;
6713 return group;
6714}
6715#endif /* CONFIG_SCHED_MC */
6716 6933
6717/* 6934/*
6718 * book sched-domains: 6935 * Assumes the sched_domain tree is fully constructed
6719 */ 6936 */
6720#ifdef CONFIG_SCHED_BOOK 6937static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6721static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
6722static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
6723
6724static int
6725cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
6726 struct sched_group **sg, struct cpumask *mask)
6727{ 6938{
6728 int group = cpu; 6939 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6729#ifdef CONFIG_SCHED_MC 6940 struct sched_domain *child = sd->child;
6730 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6731 group = cpumask_first(mask);
6732#elif defined(CONFIG_SCHED_SMT)
6733 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6734 group = cpumask_first(mask);
6735#endif
6736 if (sg)
6737 *sg = &per_cpu(sched_group_book, group).sg;
6738 return group;
6739}
6740#endif /* CONFIG_SCHED_BOOK */
6741 6941
6742static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 6942 if (child)
6743static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 6943 cpu = cpumask_first(sched_domain_span(child));
6744 6944
6745static int
6746cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
6747 struct sched_group **sg, struct cpumask *mask)
6748{
6749 int group;
6750#ifdef CONFIG_SCHED_BOOK
6751 cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
6752 group = cpumask_first(mask);
6753#elif defined(CONFIG_SCHED_MC)
6754 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6755 group = cpumask_first(mask);
6756#elif defined(CONFIG_SCHED_SMT)
6757 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6758 group = cpumask_first(mask);
6759#else
6760 group = cpu;
6761#endif
6762 if (sg) 6945 if (sg)
6763 *sg = &per_cpu(sched_group_phys, group).sg; 6946 *sg = *per_cpu_ptr(sdd->sg, cpu);
6764 return group; 6947
6948 return cpu;
6765} 6949}
6766 6950
6767#ifdef CONFIG_NUMA
6768/* 6951/*
6769 * The init_sched_build_groups can't handle what we want to do with node 6952 * build_sched_groups takes the cpumask we wish to span, and a pointer
6770 * groups, so roll our own. Now each node has its own list of groups which 6953 * to a function which identifies what group(along with sched group) a CPU
6771 * gets dynamically allocated. 6954 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6955 * (due to the fact that we keep track of groups covered with a struct cpumask).
6956 *
6957 * build_sched_groups will build a circular linked list of the groups
6958 * covered by the given span, and will set each group's ->cpumask correctly,
6959 * and ->cpu_power to 0.
6772 */ 6960 */
6773static DEFINE_PER_CPU(struct static_sched_domain, node_domains); 6961static void
6774static struct sched_group ***sched_group_nodes_bycpu; 6962build_sched_groups(struct sched_domain *sd)
6775
6776static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
6777static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
6778
6779static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
6780 struct sched_group **sg,
6781 struct cpumask *nodemask)
6782{
6783 int group;
6784
6785 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
6786 group = cpumask_first(nodemask);
6787
6788 if (sg)
6789 *sg = &per_cpu(sched_group_allnodes, group).sg;
6790 return group;
6791}
6792
6793static void init_numa_sched_groups_power(struct sched_group *group_head)
6794{
6795 struct sched_group *sg = group_head;
6796 int j;
6797
6798 if (!sg)
6799 return;
6800 do {
6801 for_each_cpu(j, sched_group_cpus(sg)) {
6802 struct sched_domain *sd;
6803
6804 sd = &per_cpu(phys_domains, j).sd;
6805 if (j != group_first_cpu(sd->groups)) {
6806 /*
6807 * Only add "power" once for each
6808 * physical package.
6809 */
6810 continue;
6811 }
6812
6813 sg->cpu_power += sd->groups->cpu_power;
6814 }
6815 sg = sg->next;
6816 } while (sg != group_head);
6817}
6818
6819static int build_numa_sched_groups(struct s_data *d,
6820 const struct cpumask *cpu_map, int num)
6821{ 6963{
6822 struct sched_domain *sd; 6964 struct sched_group *first = NULL, *last = NULL;
6823 struct sched_group *sg, *prev; 6965 struct sd_data *sdd = sd->private;
6824 int n, j; 6966 const struct cpumask *span = sched_domain_span(sd);
6825 6967 struct cpumask *covered;
6826 cpumask_clear(d->covered); 6968 int i;
6827 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
6828 if (cpumask_empty(d->nodemask)) {
6829 d->sched_group_nodes[num] = NULL;
6830 goto out;
6831 }
6832
6833 sched_domain_node_span(num, d->domainspan);
6834 cpumask_and(d->domainspan, d->domainspan, cpu_map);
6835
6836 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
6837 GFP_KERNEL, num);
6838 if (!sg) {
6839 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
6840 num);
6841 return -ENOMEM;
6842 }
6843 d->sched_group_nodes[num] = sg;
6844
6845 for_each_cpu(j, d->nodemask) {
6846 sd = &per_cpu(node_domains, j).sd;
6847 sd->groups = sg;
6848 }
6849
6850 sg->cpu_power = 0;
6851 cpumask_copy(sched_group_cpus(sg), d->nodemask);
6852 sg->next = sg;
6853 cpumask_or(d->covered, d->covered, d->nodemask);
6854 6969
6855 prev = sg; 6970 lockdep_assert_held(&sched_domains_mutex);
6856 for (j = 0; j < nr_node_ids; j++) { 6971 covered = sched_domains_tmpmask;
6857 n = (num + j) % nr_node_ids;
6858 cpumask_complement(d->notcovered, d->covered);
6859 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
6860 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
6861 if (cpumask_empty(d->tmpmask))
6862 break;
6863 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
6864 if (cpumask_empty(d->tmpmask))
6865 continue;
6866 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
6867 GFP_KERNEL, num);
6868 if (!sg) {
6869 printk(KERN_WARNING
6870 "Can not alloc domain group for node %d\n", j);
6871 return -ENOMEM;
6872 }
6873 sg->cpu_power = 0;
6874 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
6875 sg->next = prev->next;
6876 cpumask_or(d->covered, d->covered, d->tmpmask);
6877 prev->next = sg;
6878 prev = sg;
6879 }
6880out:
6881 return 0;
6882}
6883#endif /* CONFIG_NUMA */
6884 6972
6885#ifdef CONFIG_NUMA 6973 cpumask_clear(covered);
6886/* Free memory allocated for various sched_group structures */
6887static void free_sched_groups(const struct cpumask *cpu_map,
6888 struct cpumask *nodemask)
6889{
6890 int cpu, i;
6891 6974
6892 for_each_cpu(cpu, cpu_map) { 6975 for_each_cpu(i, span) {
6893 struct sched_group **sched_group_nodes 6976 struct sched_group *sg;
6894 = sched_group_nodes_bycpu[cpu]; 6977 int group = get_group(i, sdd, &sg);
6978 int j;
6895 6979
6896 if (!sched_group_nodes) 6980 if (cpumask_test_cpu(i, covered))
6897 continue; 6981 continue;
6898 6982
6899 for (i = 0; i < nr_node_ids; i++) { 6983 cpumask_clear(sched_group_cpus(sg));
6900 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 6984 sg->cpu_power = 0;
6901 6985
6902 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 6986 for_each_cpu(j, span) {
6903 if (cpumask_empty(nodemask)) 6987 if (get_group(j, sdd, NULL) != group)
6904 continue; 6988 continue;
6905 6989
6906 if (sg == NULL) 6990 cpumask_set_cpu(j, covered);
6907 continue; 6991 cpumask_set_cpu(j, sched_group_cpus(sg));
6908 sg = sg->next;
6909next_sg:
6910 oldsg = sg;
6911 sg = sg->next;
6912 kfree(oldsg);
6913 if (oldsg != sched_group_nodes[i])
6914 goto next_sg;
6915 } 6992 }
6916 kfree(sched_group_nodes); 6993
6917 sched_group_nodes_bycpu[cpu] = NULL; 6994 if (!first)
6995 first = sg;
6996 if (last)
6997 last->next = sg;
6998 last = sg;
6918 } 6999 }
7000 last->next = first;
6919} 7001}
6920#else /* !CONFIG_NUMA */
6921static void free_sched_groups(const struct cpumask *cpu_map,
6922 struct cpumask *nodemask)
6923{
6924}
6925#endif /* CONFIG_NUMA */
6926 7002
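
The new build_sched_groups() keeps the old convention: the groups covering a domain's span are linked into a circular singly-linked list via first/last pointers and closed with last->next = first, so callers can keep iterating with the usual do/while over sg->next. A tiny self-contained version of just that list construction, with hypothetical types:

#include <stdio.h>

struct group {
	int id;
	struct group *next;
};

static struct group *link_groups(struct group *pool, int n)
{
	struct group *first = NULL, *last = NULL;
	int i;

	for (i = 0; i < n; i++) {
		struct group *sg = &pool[i];

		sg->id = i;
		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
	}
	last->next = first;		/* close the ring, as the kernel code does */
	return first;
}

int main(void)
{
	struct group pool[3];
	struct group *sg, *first = link_groups(pool, 3);

	sg = first;
	do {				/* the usual traversal over sg->next */
		printf("group %d\n", sg->id);
		sg = sg->next;
	} while (sg != first);
	return 0;
}
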
6927/* 7003/*
6928 * Initialize sched groups cpu_power. 7004 * Initialize sched groups cpu_power.
@@ -6936,11 +7012,6 @@ static void free_sched_groups(const struct cpumask *cpu_map,
6936 */ 7012 */
6937static void init_sched_groups_power(int cpu, struct sched_domain *sd) 7013static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6938{ 7014{
6939 struct sched_domain *child;
6940 struct sched_group *group;
6941 long power;
6942 int weight;
6943
6944 WARN_ON(!sd || !sd->groups); 7015 WARN_ON(!sd || !sd->groups);
6945 7016
6946 if (cpu != group_first_cpu(sd->groups)) 7017 if (cpu != group_first_cpu(sd->groups))
@@ -6948,36 +7019,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6948 7019
6949 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); 7020 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
6950 7021
6951 child = sd->child; 7022 update_group_power(sd, cpu);
6952
6953 sd->groups->cpu_power = 0;
6954
6955 if (!child) {
6956 power = SCHED_LOAD_SCALE;
6957 weight = cpumask_weight(sched_domain_span(sd));
6958 /*
6959 * SMT siblings share the power of a single core.
6960 * Usually multiple threads get a better yield out of
6961 * that one core than a single thread would have,
6962 * reflect that in sd->smt_gain.
6963 */
6964 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
6965 power *= sd->smt_gain;
6966 power /= weight;
6967 power >>= SCHED_LOAD_SHIFT;
6968 }
6969 sd->groups->cpu_power += power;
6970 return;
6971 }
6972
6973 /*
6974 * Add cpu_power of each child group to this groups cpu_power.
6975 */
6976 group = child->groups;
6977 do {
6978 sd->groups->cpu_power += group->cpu_power;
6979 group = group->next;
6980 } while (group != child->groups);
6981} 7023}
6982 7024
6983/* 7025/*
@@ -6991,15 +7033,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6991# define SD_INIT_NAME(sd, type) do { } while (0) 7033# define SD_INIT_NAME(sd, type) do { } while (0)
6992#endif 7034#endif
6993 7035
6994#define SD_INIT(sd, type) sd_init_##type(sd) 7036#define SD_INIT_FUNC(type) \
6995 7037static noinline struct sched_domain * \
6996#define SD_INIT_FUNC(type) \ 7038sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
6997static noinline void sd_init_##type(struct sched_domain *sd) \ 7039{ \
6998{ \ 7040 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
6999 memset(sd, 0, sizeof(*sd)); \ 7041 *sd = SD_##type##_INIT; \
7000 *sd = SD_##type##_INIT; \ 7042 SD_INIT_NAME(sd, type); \
7001 sd->level = SD_LV_##type; \ 7043 sd->private = &tl->data; \
7002 SD_INIT_NAME(sd, type); \ 7044 return sd; \
7003} 7045}
7004 7046
7005SD_INIT_FUNC(CPU) 7047SD_INIT_FUNC(CPU)
@@ -7018,13 +7060,14 @@ SD_INIT_FUNC(CPU)
7018#endif 7060#endif
7019 7061
7020static int default_relax_domain_level = -1; 7062static int default_relax_domain_level = -1;
7063int sched_domain_level_max;
7021 7064
7022static int __init setup_relax_domain_level(char *str) 7065static int __init setup_relax_domain_level(char *str)
7023{ 7066{
7024 unsigned long val; 7067 unsigned long val;
7025 7068
7026 val = simple_strtoul(str, NULL, 0); 7069 val = simple_strtoul(str, NULL, 0);
7027 if (val < SD_LV_MAX) 7070 if (val < sched_domain_level_max)
7028 default_relax_domain_level = val; 7071 default_relax_domain_level = val;
7029 7072
7030 return 1; 7073 return 1;
@@ -7052,37 +7095,20 @@ static void set_domain_attribute(struct sched_domain *sd,
7052 } 7095 }
7053} 7096}
7054 7097
7098static void __sdt_free(const struct cpumask *cpu_map);
7099static int __sdt_alloc(const struct cpumask *cpu_map);
7100
7055static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 7101static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7056 const struct cpumask *cpu_map) 7102 const struct cpumask *cpu_map)
7057{ 7103{
7058 switch (what) { 7104 switch (what) {
7059 case sa_sched_groups:
7060 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
7061 d->sched_group_nodes = NULL;
7062 case sa_rootdomain: 7105 case sa_rootdomain:
7063 free_rootdomain(d->rd); /* fall through */ 7106 if (!atomic_read(&d->rd->refcount))
7064 case sa_tmpmask: 7107 free_rootdomain(&d->rd->rcu); /* fall through */
7065 free_cpumask_var(d->tmpmask); /* fall through */ 7108 case sa_sd:
7066 case sa_send_covered: 7109 free_percpu(d->sd); /* fall through */
7067 free_cpumask_var(d->send_covered); /* fall through */ 7110 case sa_sd_storage:
7068 case sa_this_book_map: 7111 __sdt_free(cpu_map); /* fall through */
7069 free_cpumask_var(d->this_book_map); /* fall through */
7070 case sa_this_core_map:
7071 free_cpumask_var(d->this_core_map); /* fall through */
7072 case sa_this_sibling_map:
7073 free_cpumask_var(d->this_sibling_map); /* fall through */
7074 case sa_nodemask:
7075 free_cpumask_var(d->nodemask); /* fall through */
7076 case sa_sched_group_nodes:
7077#ifdef CONFIG_NUMA
7078 kfree(d->sched_group_nodes); /* fall through */
7079 case sa_notcovered:
7080 free_cpumask_var(d->notcovered); /* fall through */
7081 case sa_covered:
7082 free_cpumask_var(d->covered); /* fall through */
7083 case sa_domainspan:
7084 free_cpumask_var(d->domainspan); /* fall through */
7085#endif
7086 case sa_none: 7112 case sa_none:
7087 break; 7113 break;
7088 } 7114 }
@@ -7091,308 +7117,212 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7091static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 7117static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7092 const struct cpumask *cpu_map) 7118 const struct cpumask *cpu_map)
7093{ 7119{
7094#ifdef CONFIG_NUMA 7120 memset(d, 0, sizeof(*d));
7095 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 7121
7096 return sa_none; 7122 if (__sdt_alloc(cpu_map))
7097 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 7123 return sa_sd_storage;
7098 return sa_domainspan; 7124 d->sd = alloc_percpu(struct sched_domain *);
7099 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 7125 if (!d->sd)
7100 return sa_covered; 7126 return sa_sd_storage;
7101 /* Allocate the per-node list of sched groups */
7102 d->sched_group_nodes = kcalloc(nr_node_ids,
7103 sizeof(struct sched_group *), GFP_KERNEL);
7104 if (!d->sched_group_nodes) {
7105 printk(KERN_WARNING "Can not alloc sched group node list\n");
7106 return sa_notcovered;
7107 }
7108 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
7109#endif
7110 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
7111 return sa_sched_group_nodes;
7112 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
7113 return sa_nodemask;
7114 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
7115 return sa_this_sibling_map;
7116 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
7117 return sa_this_core_map;
7118 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7119 return sa_this_book_map;
7120 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
7121 return sa_send_covered;
7122 d->rd = alloc_rootdomain(); 7127 d->rd = alloc_rootdomain();
7123 if (!d->rd) { 7128 if (!d->rd)
7124 printk(KERN_WARNING "Cannot alloc root domain\n"); 7129 return sa_sd;
7125 return sa_tmpmask;
7126 }
7127 return sa_rootdomain; 7130 return sa_rootdomain;
7128} 7131}
7129 7132
7130static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 7133/*
7131 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 7134 * NULL the sd_data elements we've used to build the sched_domain and
7135 * sched_group structure so that the subsequent __free_domain_allocs()
7136 * will not free the data we're using.
7137 */
7138static void claim_allocations(int cpu, struct sched_domain *sd)
7132{ 7139{
7133 struct sched_domain *sd = NULL; 7140 struct sd_data *sdd = sd->private;
7134#ifdef CONFIG_NUMA 7141 struct sched_group *sg = sd->groups;
7135 struct sched_domain *parent;
7136
7137 d->sd_allnodes = 0;
7138 if (cpumask_weight(cpu_map) >
7139 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
7140 sd = &per_cpu(allnodes_domains, i).sd;
7141 SD_INIT(sd, ALLNODES);
7142 set_domain_attribute(sd, attr);
7143 cpumask_copy(sched_domain_span(sd), cpu_map);
7144 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
7145 d->sd_allnodes = 1;
7146 }
7147 parent = sd;
7148
7149 sd = &per_cpu(node_domains, i).sd;
7150 SD_INIT(sd, NODE);
7151 set_domain_attribute(sd, attr);
7152 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7153 sd->parent = parent;
7154 if (parent)
7155 parent->child = sd;
7156 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
7157#endif
7158 return sd;
7159}
7160 7142
7161static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 7143 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
7162 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7144 *per_cpu_ptr(sdd->sd, cpu) = NULL;
7163 struct sched_domain *parent, int i)
7164{
7165 struct sched_domain *sd;
7166 sd = &per_cpu(phys_domains, i).sd;
7167 SD_INIT(sd, CPU);
7168 set_domain_attribute(sd, attr);
7169 cpumask_copy(sched_domain_span(sd), d->nodemask);
7170 sd->parent = parent;
7171 if (parent)
7172 parent->child = sd;
7173 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
7174 return sd;
7175}
7176 7145
7177static struct sched_domain *__build_book_sched_domain(struct s_data *d, 7146 if (cpu == cpumask_first(sched_group_cpus(sg))) {
7178 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7147 WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
7179 struct sched_domain *parent, int i) 7148 *per_cpu_ptr(sdd->sg, cpu) = NULL;
7180{ 7149 }
7181 struct sched_domain *sd = parent;
7182#ifdef CONFIG_SCHED_BOOK
7183 sd = &per_cpu(book_domains, i).sd;
7184 SD_INIT(sd, BOOK);
7185 set_domain_attribute(sd, attr);
7186 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
7187 sd->parent = parent;
7188 parent->child = sd;
7189 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
7190#endif
7191 return sd;
7192} 7150}
7193 7151
7194static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7152#ifdef CONFIG_SCHED_SMT
7195 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7153static const struct cpumask *cpu_smt_mask(int cpu)
7196 struct sched_domain *parent, int i)
7197{ 7154{
7198 struct sched_domain *sd = parent; 7155 return topology_thread_cpumask(cpu);
7199#ifdef CONFIG_SCHED_MC
7200 sd = &per_cpu(core_domains, i).sd;
7201 SD_INIT(sd, MC);
7202 set_domain_attribute(sd, attr);
7203 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
7204 sd->parent = parent;
7205 parent->child = sd;
7206 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
7207#endif
7208 return sd;
7209} 7156}
7210
7211static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
7212 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7213 struct sched_domain *parent, int i)
7214{
7215 struct sched_domain *sd = parent;
7216#ifdef CONFIG_SCHED_SMT
7217 sd = &per_cpu(cpu_domains, i).sd;
7218 SD_INIT(sd, SIBLING);
7219 set_domain_attribute(sd, attr);
7220 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
7221 sd->parent = parent;
7222 parent->child = sd;
7223 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
7224#endif 7157#endif
7225 return sd;
7226}
7227 7158
7228static void build_sched_groups(struct s_data *d, enum sched_domain_level l, 7159/*
7229 const struct cpumask *cpu_map, int cpu) 7160 * Topology list, bottom-up.
7230{ 7161 */
7231 switch (l) { 7162static struct sched_domain_topology_level default_topology[] = {
7232#ifdef CONFIG_SCHED_SMT 7163#ifdef CONFIG_SCHED_SMT
7233 case SD_LV_SIBLING: /* set up CPU (sibling) groups */ 7164 { sd_init_SIBLING, cpu_smt_mask, },
7234 cpumask_and(d->this_sibling_map, cpu_map,
7235 topology_thread_cpumask(cpu));
7236 if (cpu == cpumask_first(d->this_sibling_map))
7237 init_sched_build_groups(d->this_sibling_map, cpu_map,
7238 &cpu_to_cpu_group,
7239 d->send_covered, d->tmpmask);
7240 break;
7241#endif 7165#endif
7242#ifdef CONFIG_SCHED_MC 7166#ifdef CONFIG_SCHED_MC
7243 case SD_LV_MC: /* set up multi-core groups */ 7167 { sd_init_MC, cpu_coregroup_mask, },
7244 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
7245 if (cpu == cpumask_first(d->this_core_map))
7246 init_sched_build_groups(d->this_core_map, cpu_map,
7247 &cpu_to_core_group,
7248 d->send_covered, d->tmpmask);
7249 break;
7250#endif 7168#endif
7251#ifdef CONFIG_SCHED_BOOK 7169#ifdef CONFIG_SCHED_BOOK
7252 case SD_LV_BOOK: /* set up book groups */ 7170 { sd_init_BOOK, cpu_book_mask, },
7253 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
7254 if (cpu == cpumask_first(d->this_book_map))
7255 init_sched_build_groups(d->this_book_map, cpu_map,
7256 &cpu_to_book_group,
7257 d->send_covered, d->tmpmask);
7258 break;
7259#endif 7171#endif
7260 case SD_LV_CPU: /* set up physical groups */ 7172 { sd_init_CPU, cpu_cpu_mask, },
7261 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
7262 if (!cpumask_empty(d->nodemask))
7263 init_sched_build_groups(d->nodemask, cpu_map,
7264 &cpu_to_phys_group,
7265 d->send_covered, d->tmpmask);
7266 break;
7267#ifdef CONFIG_NUMA 7173#ifdef CONFIG_NUMA
7268 case SD_LV_ALLNODES: 7174 { sd_init_NODE, cpu_node_mask, },
7269 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, 7175 { sd_init_ALLNODES, cpu_allnodes_mask, },
7270 d->send_covered, d->tmpmask);
7271 break;
7272#endif 7176#endif
7273 default: 7177 { NULL, },
7274 break; 7178};
7179
7180static struct sched_domain_topology_level *sched_domain_topology = default_topology;
7181
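
default_topology[] and the per-level init/mask callbacks are the heart of this rework: instead of one hand-written #ifdef block per domain level, build_sched_domains() walks the { NULL, }-terminated table bottom-up and stacks each level on top of the one below it (see build_sched_domain() further down). A self-contained sketch of that table-driven, bottom-up construction; the level names and types are illustrative only:

#include <stdio.h>

struct level {
	const char *name;
	struct level *parent, *child;
};

static const char *default_topology_names[] = {
	"SIBLING",	/* SMT threads (bottom level)          */
	"MC",		/* cores sharing a cache               */
	"BOOK",		/* s390 books, when configured         */
	"CPU",		/* all cpus in a package/node          */
	"NODE",		/* NUMA (top level)                    */
	NULL,		/* terminator, like the { NULL, } row  */
};

int main(void)
{
	static struct level levels[8];
	struct level *sd = NULL;	/* the growing per-cpu domain chain */
	int i;

	/* Mirrors: for (tl = sched_domain_topology; tl->init; tl++)
	 *                  sd = build_sched_domain(tl, ..., sd, cpu);  */
	for (i = 0; default_topology_names[i]; i++) {
		struct level *lvl = &levels[i];

		lvl->name = default_topology_names[i];
		lvl->child = sd;
		if (sd)
			sd->parent = lvl;
		sd = lvl;
	}

	for (; sd; sd = sd->child)	/* walk the result top-down */
		printf("%s\n", sd->name);

	return 0;
}

Adding a new level then means adding one row to the table rather than another hand-rolled __build_*_sched_domain() helper.
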
7182static int __sdt_alloc(const struct cpumask *cpu_map)
7183{
7184 struct sched_domain_topology_level *tl;
7185 int j;
7186
7187 for (tl = sched_domain_topology; tl->init; tl++) {
7188 struct sd_data *sdd = &tl->data;
7189
7190 sdd->sd = alloc_percpu(struct sched_domain *);
7191 if (!sdd->sd)
7192 return -ENOMEM;
7193
7194 sdd->sg = alloc_percpu(struct sched_group *);
7195 if (!sdd->sg)
7196 return -ENOMEM;
7197
7198 for_each_cpu(j, cpu_map) {
7199 struct sched_domain *sd;
7200 struct sched_group *sg;
7201
7202 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
7203 GFP_KERNEL, cpu_to_node(j));
7204 if (!sd)
7205 return -ENOMEM;
7206
7207 *per_cpu_ptr(sdd->sd, j) = sd;
7208
7209 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7210 GFP_KERNEL, cpu_to_node(j));
7211 if (!sg)
7212 return -ENOMEM;
7213
7214 *per_cpu_ptr(sdd->sg, j) = sg;
7215 }
7216 }
7217
7218 return 0;
7219}
7220
7221static void __sdt_free(const struct cpumask *cpu_map)
7222{
7223 struct sched_domain_topology_level *tl;
7224 int j;
7225
7226 for (tl = sched_domain_topology; tl->init; tl++) {
7227 struct sd_data *sdd = &tl->data;
7228
7229 for_each_cpu(j, cpu_map) {
7230 kfree(*per_cpu_ptr(sdd->sd, j));
7231 kfree(*per_cpu_ptr(sdd->sg, j));
7232 }
7233 free_percpu(sdd->sd);
7234 free_percpu(sdd->sg);
7275 } 7235 }
7276} 7236}
7277 7237
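__sdt_alloc() above bails out with -ENOMEM at the first failed allocation, and the intent is that the caller's free path, __sdt_free(), unwinds whatever was already set up by revisiting the same per-CPU slots. A small libc-only model of that alloc/teardown pairing (structure, sizes and names are placeholders for illustration):

#include <stdlib.h>
#include <string.h>

#define NCPU 4

struct slot { void *sd; void *sg; };

/* Mirrors the shape of __sdt_alloc(): stop at the first failure and
 * let the caller run the free path over the partial result. */
static int sdt_alloc_model(struct slot *slots)
{
	int j;

	for (j = 0; j < NCPU; j++) {
		slots[j].sd = calloc(1, 64);
		if (!slots[j].sd)
			return -1;
		slots[j].sg = calloc(1, 64);
		if (!slots[j].sg)
			return -1;
	}
	return 0;
}

/* Mirrors the shape of __sdt_free(): walk every slot; free() tolerates
 * NULL, so this model is safe even after a partial allocation. */
static void sdt_free_model(struct slot *slots)
{
	int j;

	for (j = 0; j < NCPU; j++) {
		free(slots[j].sd);
		free(slots[j].sg);
	}
}

int main(void)
{
	struct slot slots[NCPU];

	memset(slots, 0, sizeof(slots));
	(void)sdt_alloc_model(slots);	/* even a partial result is freeable */
	sdt_free_model(slots);
	return 0;
}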
7238struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
7239 struct s_data *d, const struct cpumask *cpu_map,
7240 struct sched_domain_attr *attr, struct sched_domain *child,
7241 int cpu)
7242{
7243 struct sched_domain *sd = tl->init(tl, cpu);
7244 if (!sd)
7245 return child;
7246
7247 set_domain_attribute(sd, attr);
7248 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
7249 if (child) {
7250 sd->level = child->level + 1;
7251 sched_domain_level_max = max(sched_domain_level_max, sd->level);
7252 child->parent = sd;
7253 }
7254 sd->child = child;
7255
7256 return sd;
7257}
7258
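build_sched_domain() above stitches each new level onto the previous one: the freshly built domain becomes the parent of the level below, the child pointer is set the other way, and the running maximum level is recorded. A toy model of just that wiring (struct fields and names invented for illustration; the throwaway program does not bother freeing):

#include <stdio.h>
#include <stdlib.h>

/* Only the linkage and level fields touched by build_sched_domain()
 * are modelled here. */
struct domain {
	struct domain *parent, *child;
	int level;
};

static int level_max;

/* Mirrors the child/parent wiring: on allocation failure fall back to
 * returning the child unchanged, as the kernel code does. */
static struct domain *build_domain(struct domain *child)
{
	struct domain *d = calloc(1, sizeof(*d));

	if (!d)
		return child;

	if (child) {
		d->level = child->level + 1;
		if (d->level > level_max)
			level_max = d->level;
		child->parent = d;
	}
	d->child = child;
	return d;
}

int main(void)
{
	struct domain *d = NULL;
	int i;

	/* Three topology levels, bottom-up (think SMT -> MC -> CPU). */
	for (i = 0; i < 3; i++)
		d = build_domain(d);
	if (!d)
		return 1;

	/* Walk back to the base level, like "while (sd->child) sd = sd->child;". */
	while (d->child)
		d = d->child;
	printf("base level %d, highest level %d\n", d->level, level_max);
	return 0;
}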
7278/* 7259/*
7279 * Build sched domains for a given set of cpus and attach the sched domains 7260 * Build sched domains for a given set of cpus and attach the sched domains
7280 * to the individual cpus 7261 * to the individual cpus
7281 */ 7262 */
7282static int __build_sched_domains(const struct cpumask *cpu_map, 7263static int build_sched_domains(const struct cpumask *cpu_map,
7283 struct sched_domain_attr *attr) 7264 struct sched_domain_attr *attr)
7284{ 7265{
7285 enum s_alloc alloc_state = sa_none; 7266 enum s_alloc alloc_state = sa_none;
7286 struct s_data d;
7287 struct sched_domain *sd; 7267 struct sched_domain *sd;
7288 int i; 7268 struct s_data d;
7289#ifdef CONFIG_NUMA 7269 int i, ret = -ENOMEM;
7290 d.sd_allnodes = 0;
7291#endif
7292 7270
7293 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7271 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7294 if (alloc_state != sa_rootdomain) 7272 if (alloc_state != sa_rootdomain)
7295 goto error; 7273 goto error;
7296 alloc_state = sa_sched_groups;
7297 7274
7298 /* 7275 /* Set up domains for cpus specified by the cpu_map. */
7299 * Set up domains for cpus specified by the cpu_map.
7300 */
7301 for_each_cpu(i, cpu_map) { 7276 for_each_cpu(i, cpu_map) {
7302 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), 7277 struct sched_domain_topology_level *tl;
7303 cpu_map);
7304
7305 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
7306 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
7307 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
7308 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7309 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7310 }
7311
7312 for_each_cpu(i, cpu_map) {
7313 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
7314 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
7315 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
7316 }
7317 7278
7318 /* Set up physical groups */ 7279 sd = NULL;
7319 for (i = 0; i < nr_node_ids; i++) 7280 for (tl = sched_domain_topology; tl->init; tl++)
7320 build_sched_groups(&d, SD_LV_CPU, cpu_map, i); 7281 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
7321 7282
7322#ifdef CONFIG_NUMA 7283 while (sd->child)
7323 /* Set up node groups */ 7284 sd = sd->child;
7324 if (d.sd_allnodes)
7325 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
7326
7327 for (i = 0; i < nr_node_ids; i++)
7328 if (build_numa_sched_groups(&d, cpu_map, i))
7329 goto error;
7330#endif
7331 7285
7332 /* Calculate CPU power for physical packages and nodes */ 7286 *per_cpu_ptr(d.sd, i) = sd;
7333#ifdef CONFIG_SCHED_SMT
7334 for_each_cpu(i, cpu_map) {
7335 sd = &per_cpu(cpu_domains, i).sd;
7336 init_sched_groups_power(i, sd);
7337 }
7338#endif
7339#ifdef CONFIG_SCHED_MC
7340 for_each_cpu(i, cpu_map) {
7341 sd = &per_cpu(core_domains, i).sd;
7342 init_sched_groups_power(i, sd);
7343 }
7344#endif
7345#ifdef CONFIG_SCHED_BOOK
7346 for_each_cpu(i, cpu_map) {
7347 sd = &per_cpu(book_domains, i).sd;
7348 init_sched_groups_power(i, sd);
7349 } 7287 }
7350#endif
7351 7288
7289 /* Build the groups for the domains */
7352 for_each_cpu(i, cpu_map) { 7290 for_each_cpu(i, cpu_map) {
7353 sd = &per_cpu(phys_domains, i).sd; 7291 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7354 init_sched_groups_power(i, sd); 7292 sd->span_weight = cpumask_weight(sched_domain_span(sd));
7355 } 7293 get_group(i, sd->private, &sd->groups);
7294 atomic_inc(&sd->groups->ref);
7356 7295
7357#ifdef CONFIG_NUMA 7296 if (i != cpumask_first(sched_domain_span(sd)))
7358 for (i = 0; i < nr_node_ids; i++) 7297 continue;
7359 init_numa_sched_groups_power(d.sched_group_nodes[i]);
7360 7298
7361 if (d.sd_allnodes) { 7299 build_sched_groups(sd);
7362 struct sched_group *sg; 7300 }
7301 }
7363 7302
7364 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 7303 /* Calculate CPU power for physical packages and nodes */
7365 d.tmpmask); 7304 for (i = nr_cpumask_bits-1; i >= 0; i--) {
7366 init_numa_sched_groups_power(sg); 7305 if (!cpumask_test_cpu(i, cpu_map))
7306 continue;
7307
7308 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7309 claim_allocations(i, sd);
7310 init_sched_groups_power(i, sd);
7311 }
7367 } 7312 }
7368#endif
7369 7313
7370 /* Attach the domains */ 7314 /* Attach the domains */
7315 rcu_read_lock();
7371 for_each_cpu(i, cpu_map) { 7316 for_each_cpu(i, cpu_map) {
7372#ifdef CONFIG_SCHED_SMT 7317 sd = *per_cpu_ptr(d.sd, i);
7373 sd = &per_cpu(cpu_domains, i).sd;
7374#elif defined(CONFIG_SCHED_MC)
7375 sd = &per_cpu(core_domains, i).sd;
7376#elif defined(CONFIG_SCHED_BOOK)
7377 sd = &per_cpu(book_domains, i).sd;
7378#else
7379 sd = &per_cpu(phys_domains, i).sd;
7380#endif
7381 cpu_attach_domain(sd, d.rd, i); 7318 cpu_attach_domain(sd, d.rd, i);
7382 } 7319 }
7320 rcu_read_unlock();
7383 7321
7384 d.sched_group_nodes = NULL; /* don't free this we still need it */ 7322 ret = 0;
7385 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
7386 return 0;
7387
7388error: 7323error:
7389 __free_domain_allocs(&d, alloc_state, cpu_map); 7324 __free_domain_allocs(&d, alloc_state, cpu_map);
7390 return -ENOMEM; 7325 return ret;
7391}
7392
7393static int build_sched_domains(const struct cpumask *cpu_map)
7394{
7395 return __build_sched_domains(cpu_map, NULL);
7396} 7326}
7397 7327
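In the new build_sched_domains() above, ret starts out as -ENOMEM, every failure jumps to the error label, and the success path simply sets ret = 0 and falls into the same label, so __free_domain_allocs() runs exactly once on either outcome. A compact standalone model of that shared-exit idiom (names and the fake allocator are invented for the sketch):

#include <stdio.h>
#include <stdlib.h>

/* Pretend allocation step that can be told to fail. */
static void *grab(int fail)
{
	return fail ? NULL : malloc(16);
}

static int build(int fail)
{
	int ret = -1;		/* assume failure until proven otherwise */
	void *state;

	state = grab(fail);
	if (!state)
		goto error;

	/* ... real work would go here ... */

	ret = 0;		/* success: fall through to the shared exit */
error:
	free(state ? state : NULL);	/* cleanup runs on both paths */
	return ret;
}

int main(void)
{
	printf("ok path: %d, failure path: %d\n", build(0), build(1));
	return 0;
}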
7398static cpumask_var_t *doms_cur; /* current sched domains */ 7328static cpumask_var_t *doms_cur; /* current sched domains */
@@ -7447,7 +7377,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7447 * For now this just excludes isolated cpus, but could be used to 7377 * For now this just excludes isolated cpus, but could be used to
7448 * exclude other special cases in the future. 7378 * exclude other special cases in the future.
7449 */ 7379 */
7450static int arch_init_sched_domains(const struct cpumask *cpu_map) 7380static int init_sched_domains(const struct cpumask *cpu_map)
7451{ 7381{
7452 int err; 7382 int err;
7453 7383
@@ -7458,32 +7388,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
7458 doms_cur = &fallback_doms; 7388 doms_cur = &fallback_doms;
7459 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 7389 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7460 dattr_cur = NULL; 7390 dattr_cur = NULL;
7461 err = build_sched_domains(doms_cur[0]); 7391 err = build_sched_domains(doms_cur[0], NULL);
7462 register_sched_domain_sysctl(); 7392 register_sched_domain_sysctl();
7463 7393
7464 return err; 7394 return err;
7465} 7395}
7466 7396
7467static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7468 struct cpumask *tmpmask)
7469{
7470 free_sched_groups(cpu_map, tmpmask);
7471}
7472
7473/* 7397/*
7474 * Detach sched domains from a group of cpus specified in cpu_map 7398 * Detach sched domains from a group of cpus specified in cpu_map
7475 * These cpus will now be attached to the NULL domain 7399 * These cpus will now be attached to the NULL domain
7476 */ 7400 */
7477static void detach_destroy_domains(const struct cpumask *cpu_map) 7401static void detach_destroy_domains(const struct cpumask *cpu_map)
7478{ 7402{
7479 /* Save because hotplug lock held. */
7480 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7481 int i; 7403 int i;
7482 7404
7405 rcu_read_lock();
7483 for_each_cpu(i, cpu_map) 7406 for_each_cpu(i, cpu_map)
7484 cpu_attach_domain(NULL, &def_root_domain, i); 7407 cpu_attach_domain(NULL, &def_root_domain, i);
7485 synchronize_sched(); 7408 rcu_read_unlock();
7486 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7487} 7409}
7488 7410
7489/* handle null as "default" */ 7411/* handle null as "default" */
@@ -7572,8 +7494,7 @@ match1:
7572 goto match2; 7494 goto match2;
7573 } 7495 }
7574 /* no match - add a new doms_new */ 7496 /* no match - add a new doms_new */
7575 __build_sched_domains(doms_new[i], 7497 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
7576 dattr_new ? dattr_new + i : NULL);
7577match2: 7498match2:
7578 ; 7499 ;
7579 } 7500 }
@@ -7592,7 +7513,7 @@ match2:
7592} 7513}
7593 7514
7594#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7515#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7595static void arch_reinit_sched_domains(void) 7516static void reinit_sched_domains(void)
7596{ 7517{
7597 get_online_cpus(); 7518 get_online_cpus();
7598 7519
@@ -7625,7 +7546,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7625 else 7546 else
7626 sched_mc_power_savings = level; 7547 sched_mc_power_savings = level;
7627 7548
7628 arch_reinit_sched_domains(); 7549 reinit_sched_domains();
7629 7550
7630 return count; 7551 return count;
7631} 7552}
@@ -7744,14 +7665,9 @@ void __init sched_init_smp(void)
7744 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7665 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7745 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7666 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7746 7667
7747#if defined(CONFIG_NUMA)
7748 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7749 GFP_KERNEL);
7750 BUG_ON(sched_group_nodes_bycpu == NULL);
7751#endif
7752 get_online_cpus(); 7668 get_online_cpus();
7753 mutex_lock(&sched_domains_mutex); 7669 mutex_lock(&sched_domains_mutex);
7754 arch_init_sched_domains(cpu_active_mask); 7670 init_sched_domains(cpu_active_mask);
7755 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7671 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7756 if (cpumask_empty(non_isolated_cpus)) 7672 if (cpumask_empty(non_isolated_cpus))
7757 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7673 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -7796,6 +7712,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7796 INIT_LIST_HEAD(&cfs_rq->tasks); 7712 INIT_LIST_HEAD(&cfs_rq->tasks);
7797#ifdef CONFIG_FAIR_GROUP_SCHED 7713#ifdef CONFIG_FAIR_GROUP_SCHED
7798 cfs_rq->rq = rq; 7714 cfs_rq->rq = rq;
7715 /* allow initial update_cfs_load() to truncate */
7716#ifdef CONFIG_SMP
7717 cfs_rq->load_stamp = 1;
7718#endif
7799#endif 7719#endif
7800 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 7720 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7801} 7721}
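The unchanged initializer at the end of this hunk starts min_vruntime roughly a millisecond's worth of nanoseconds below the 64-bit wrap point, which is generally read as a way to make non-wrap-safe vruntime comparisons fail almost immediately after boot. A quick standalone check of the signed-delta comparison this relies on (illustrative only):

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int main(void)
{
	/* Same starting point as init_cfs_rq(): just below the u64 wrap. */
	uint64_t min_vruntime = (uint64_t)(-(1LL << 20));
	uint64_t vruntime = min_vruntime + (1 << 21);	/* has wrapped past zero */

	/* Comparing through a signed delta stays correct across the wrap. */
	int64_t delta = (int64_t)(vruntime - min_vruntime);

	printf("delta = %" PRId64 ", so vruntime still orders %s min_vruntime\n",
	       delta, delta > 0 ? "after" : "before");
	return 0;
}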
@@ -7997,7 +7917,7 @@ void __init sched_init(void)
7997#ifdef CONFIG_SMP 7917#ifdef CONFIG_SMP
7998 rq->sd = NULL; 7918 rq->sd = NULL;
7999 rq->rd = NULL; 7919 rq->rd = NULL;
8000 rq->cpu_power = SCHED_LOAD_SCALE; 7920 rq->cpu_power = SCHED_POWER_SCALE;
8001 rq->post_schedule = 0; 7921 rq->post_schedule = 0;
8002 rq->active_balance = 0; 7922 rq->active_balance = 0;
8003 rq->next_balance = jiffies; 7923 rq->next_balance = jiffies;
@@ -8054,6 +7974,7 @@ void __init sched_init(void)
8054 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 7974 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8055 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 7975 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
8056#ifdef CONFIG_SMP 7976#ifdef CONFIG_SMP
7977 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8057#ifdef CONFIG_NO_HZ 7978#ifdef CONFIG_NO_HZ
8058 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 7979 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8059 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); 7980 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
@@ -8074,7 +7995,7 @@ static inline int preempt_count_equals(int preempt_offset)
8074{ 7995{
8075 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 7996 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
8076 7997
8077 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7998 return (nested == preempt_offset);
8078} 7999}
8079 8000
8080void __might_sleep(const char *file, int line, int preempt_offset) 8001void __might_sleep(const char *file, int line, int preempt_offset)
@@ -8109,9 +8030,11 @@ EXPORT_SYMBOL(__might_sleep);
8109#ifdef CONFIG_MAGIC_SYSRQ 8030#ifdef CONFIG_MAGIC_SYSRQ
8110static void normalize_task(struct rq *rq, struct task_struct *p) 8031static void normalize_task(struct rq *rq, struct task_struct *p)
8111{ 8032{
8033 const struct sched_class *prev_class = p->sched_class;
8034 int old_prio = p->prio;
8112 int on_rq; 8035 int on_rq;
8113 8036
8114 on_rq = p->se.on_rq; 8037 on_rq = p->on_rq;
8115 if (on_rq) 8038 if (on_rq)
8116 deactivate_task(rq, p, 0); 8039 deactivate_task(rq, p, 0);
8117 __setscheduler(rq, p, SCHED_NORMAL, 0); 8040 __setscheduler(rq, p, SCHED_NORMAL, 0);
@@ -8119,6 +8042,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
8119 activate_task(rq, p, 0); 8042 activate_task(rq, p, 0);
8120 resched_task(rq->curr); 8043 resched_task(rq->curr);
8121 } 8044 }
8045
8046 check_class_changed(rq, p, prev_class, old_prio);
8122} 8047}
8123 8048
8124void normalize_rt_tasks(void) 8049void normalize_rt_tasks(void)
@@ -8234,7 +8159,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8234{ 8159{
8235 struct cfs_rq *cfs_rq; 8160 struct cfs_rq *cfs_rq;
8236 struct sched_entity *se; 8161 struct sched_entity *se;
8237 struct rq *rq;
8238 int i; 8162 int i;
8239 8163
8240 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 8164 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8247,8 +8171,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8247 tg->shares = NICE_0_LOAD; 8171 tg->shares = NICE_0_LOAD;
8248 8172
8249 for_each_possible_cpu(i) { 8173 for_each_possible_cpu(i) {
8250 rq = cpu_rq(i);
8251
8252 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8174 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8253 GFP_KERNEL, cpu_to_node(i)); 8175 GFP_KERNEL, cpu_to_node(i));
8254 if (!cfs_rq) 8176 if (!cfs_rq)
@@ -8325,7 +8247,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8325{ 8247{
8326 struct rt_rq *rt_rq; 8248 struct rt_rq *rt_rq;
8327 struct sched_rt_entity *rt_se; 8249 struct sched_rt_entity *rt_se;
8328 struct rq *rq;
8329 int i; 8250 int i;
8330 8251
8331 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 8252 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8339,8 +8260,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8339 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8260 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8340 8261
8341 for_each_possible_cpu(i) { 8262 for_each_possible_cpu(i) {
8342 rq = cpu_rq(i);
8343
8344 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8263 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8345 GFP_KERNEL, cpu_to_node(i)); 8264 GFP_KERNEL, cpu_to_node(i));
8346 if (!rt_rq) 8265 if (!rt_rq)
@@ -8455,7 +8374,7 @@ void sched_move_task(struct task_struct *tsk)
8455 rq = task_rq_lock(tsk, &flags); 8374 rq = task_rq_lock(tsk, &flags);
8456 8375
8457 running = task_current(rq, tsk); 8376 running = task_current(rq, tsk);
8458 on_rq = tsk->se.on_rq; 8377 on_rq = tsk->on_rq;
8459 8378
8460 if (on_rq) 8379 if (on_rq)
8461 dequeue_task(rq, tsk, 0); 8380 dequeue_task(rq, tsk, 0);
@@ -8474,7 +8393,7 @@ void sched_move_task(struct task_struct *tsk)
8474 if (on_rq) 8393 if (on_rq)
8475 enqueue_task(rq, tsk, 0); 8394 enqueue_task(rq, tsk, 0);
8476 8395
8477 task_rq_unlock(rq, &flags); 8396 task_rq_unlock(rq, tsk, &flags);
8478} 8397}
8479#endif /* CONFIG_CGROUP_SCHED */ 8398#endif /* CONFIG_CGROUP_SCHED */
8480 8399
@@ -8510,7 +8429,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8510 /* Propagate contribution to hierarchy */ 8429 /* Propagate contribution to hierarchy */
8511 raw_spin_lock_irqsave(&rq->lock, flags); 8430 raw_spin_lock_irqsave(&rq->lock, flags);
8512 for_each_sched_entity(se) 8431 for_each_sched_entity(se)
8513 update_cfs_shares(group_cfs_rq(se), 0); 8432 update_cfs_shares(group_cfs_rq(se));
8514 raw_spin_unlock_irqrestore(&rq->lock, flags); 8433 raw_spin_unlock_irqrestore(&rq->lock, flags);
8515 } 8434 }
8516 8435
@@ -8845,46 +8764,15 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8845 return 0; 8764 return 0;
8846} 8765}
8847 8766
8848static int
8849cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8850 struct task_struct *tsk, bool threadgroup)
8851{
8852 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
8853 if (retval)
8854 return retval;
8855 if (threadgroup) {
8856 struct task_struct *c;
8857 rcu_read_lock();
8858 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
8859 retval = cpu_cgroup_can_attach_task(cgrp, c);
8860 if (retval) {
8861 rcu_read_unlock();
8862 return retval;
8863 }
8864 }
8865 rcu_read_unlock();
8866 }
8867 return 0;
8868}
8869
8870static void 8767static void
8871cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 8768cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8872 struct cgroup *old_cont, struct task_struct *tsk,
8873 bool threadgroup)
8874{ 8769{
8875 sched_move_task(tsk); 8770 sched_move_task(tsk);
8876 if (threadgroup) {
8877 struct task_struct *c;
8878 rcu_read_lock();
8879 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
8880 sched_move_task(c);
8881 }
8882 rcu_read_unlock();
8883 }
8884} 8771}
8885 8772
8886static void 8773static void
8887cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) 8774cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
8775 struct cgroup *old_cgrp, struct task_struct *task)
8888{ 8776{
8889 /* 8777 /*
8890 * cgroup_exit() is called in the copy_process() failure path. 8778 * cgroup_exit() is called in the copy_process() failure path.
@@ -8901,14 +8789,14 @@ cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
8901static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 8789static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
8902 u64 shareval) 8790 u64 shareval)
8903{ 8791{
8904 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 8792 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
8905} 8793}
8906 8794
8907static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 8795static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8908{ 8796{
8909 struct task_group *tg = cgroup_tg(cgrp); 8797 struct task_group *tg = cgroup_tg(cgrp);
8910 8798
8911 return (u64) tg->shares; 8799 return (u64) scale_load_down(tg->shares);
8912} 8800}
8913#endif /* CONFIG_FAIR_GROUP_SCHED */ 8801#endif /* CONFIG_FAIR_GROUP_SCHED */
8914 8802
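The cpu.shares read and write paths in the hunk above now convert between the user-visible unit and a higher-resolution internal fixed-point unit: values are shifted up on the way in and shifted back down on the way out, so userspace round-trips the number it wrote. A standalone illustration of that conversion (the shift width here is an assumption for the sketch; the kernel picks the resolution per configuration):

#include <stdio.h>

#define LOAD_RESOLUTION 10	/* assumed extra fixed-point bits */

static unsigned long scale_up(unsigned long w)   { return w << LOAD_RESOLUTION; }
static unsigned long scale_down(unsigned long w) { return w >> LOAD_RESOLUTION; }

int main(void)
{
	unsigned long user_shares = 1024;	/* value written to cpu.shares */
	unsigned long internal = scale_up(user_shares);

	/* Write path stores the scaled value; read path undoes it. */
	printf("stored internally: %lu, reported back: %lu\n",
	       internal, scale_down(internal));
	return 0;
}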
@@ -8967,8 +8855,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8967 .name = "cpu", 8855 .name = "cpu",
8968 .create = cpu_cgroup_create, 8856 .create = cpu_cgroup_create,
8969 .destroy = cpu_cgroup_destroy, 8857 .destroy = cpu_cgroup_destroy,
8970 .can_attach = cpu_cgroup_can_attach, 8858 .can_attach_task = cpu_cgroup_can_attach_task,
8971 .attach = cpu_cgroup_attach, 8859 .attach_task = cpu_cgroup_attach_task,
8972 .exit = cpu_cgroup_exit, 8860 .exit = cpu_cgroup_exit,
8973 .populate = cpu_cgroup_populate, 8861 .populate = cpu_cgroup_populate,
8974 .subsys_id = cpu_cgroup_subsys_id, 8862 .subsys_id = cpu_cgroup_subsys_id,