Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 2122 |
1 file changed, 1005 insertions, 1117 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 18d38e4ec7ba..5e43e9dc65d1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -32,7 +32,6 @@ | |||
32 | #include <linux/init.h> | 32 | #include <linux/init.h> |
33 | #include <linux/uaccess.h> | 33 | #include <linux/uaccess.h> |
34 | #include <linux/highmem.h> | 34 | #include <linux/highmem.h> |
35 | #include <linux/smp_lock.h> | ||
36 | #include <asm/mmu_context.h> | 35 | #include <asm/mmu_context.h> |
37 | #include <linux/interrupt.h> | 36 | #include <linux/interrupt.h> |
38 | #include <linux/capability.h> | 37 | #include <linux/capability.h> |
@@ -232,7 +231,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
232 | #endif | 231 | #endif |
233 | 232 | ||
234 | /* | 233 | /* |
235 | * sched_domains_mutex serializes calls to arch_init_sched_domains, | 234 | * sched_domains_mutex serializes calls to init_sched_domains, |
236 | * detach_destroy_domains and partition_sched_domains. | 235 | * detach_destroy_domains and partition_sched_domains. |
237 | */ | 236 | */ |
238 | static DEFINE_MUTEX(sched_domains_mutex); | 237 | static DEFINE_MUTEX(sched_domains_mutex); |
@@ -294,7 +293,7 @@ static DEFINE_SPINLOCK(task_group_lock); | |||
294 | * limitation from this.) | 293 | * limitation from this.) |
295 | */ | 294 | */ |
296 | #define MIN_SHARES 2 | 295 | #define MIN_SHARES 2 |
297 | #define MAX_SHARES (1UL << 18) | 296 | #define MAX_SHARES (1UL << (18 + SCHED_LOAD_RESOLUTION)) |
298 | 297 | ||
299 | static int root_task_group_load = ROOT_TASK_GROUP_LOAD; | 298 | static int root_task_group_load = ROOT_TASK_GROUP_LOAD; |
300 | #endif | 299 | #endif |
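
The widened MAX_SHARES only makes sense together with the load-resolution helpers introduced by the same series. A minimal sketch of that fixed-point convention, assuming the companion sched.h change defines SCHED_LOAD_RESOLUTION as 10 bits on 64-bit and 0 on 32-bit (sketch only, not part of this hunk):

#if BITS_PER_LONG > 32
# define SCHED_LOAD_RESOLUTION	10
#else
# define SCHED_LOAD_RESOLUTION	0
#endif
/* user-visible weights <-> high-resolution weights */
#define scale_load(w)		((w) << SCHED_LOAD_RESOLUTION)
#define scale_load_down(w)	((w) >> SCHED_LOAD_RESOLUTION)

With 10 extra bits, NICE_0 weight (1024) is carried internally as 1024 << 10, and MAX_SHARES grows by the same factor, so the range visible through the cgroup interface (2 .. 1 << 18) is unchanged after scale_load_down().
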
@@ -313,6 +312,9 @@ struct cfs_rq { | |||
313 | 312 | ||
314 | u64 exec_clock; | 313 | u64 exec_clock; |
315 | u64 min_vruntime; | 314 | u64 min_vruntime; |
315 | #ifndef CONFIG_64BIT | ||
316 | u64 min_vruntime_copy; | ||
317 | #endif | ||
316 | 318 | ||
317 | struct rb_root tasks_timeline; | 319 | struct rb_root tasks_timeline; |
318 | struct rb_node *rb_leftmost; | 320 | struct rb_node *rb_leftmost; |
@@ -324,9 +326,11 @@ struct cfs_rq { | |||
324 | * 'curr' points to currently running entity on this cfs_rq. | 326 | * 'curr' points to currently running entity on this cfs_rq. |
325 | * It is set to NULL otherwise (i.e when none are currently running). | 327 | * It is set to NULL otherwise (i.e when none are currently running). |
326 | */ | 328 | */ |
327 | struct sched_entity *curr, *next, *last; | 329 | struct sched_entity *curr, *next, *last, *skip; |
328 | 330 | ||
331 | #ifdef CONFIG_SCHED_DEBUG | ||
329 | unsigned int nr_spread_over; | 332 | unsigned int nr_spread_over; |
333 | #endif | ||
330 | 334 | ||
331 | #ifdef CONFIG_FAIR_GROUP_SCHED | 335 | #ifdef CONFIG_FAIR_GROUP_SCHED |
332 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | 336 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ |
@@ -418,6 +422,7 @@ struct rt_rq { | |||
418 | */ | 422 | */ |
419 | struct root_domain { | 423 | struct root_domain { |
420 | atomic_t refcount; | 424 | atomic_t refcount; |
425 | struct rcu_head rcu; | ||
421 | cpumask_var_t span; | 426 | cpumask_var_t span; |
422 | cpumask_var_t online; | 427 | cpumask_var_t online; |
423 | 428 | ||
@@ -461,7 +466,7 @@ struct rq { | |||
461 | u64 nohz_stamp; | 466 | u64 nohz_stamp; |
462 | unsigned char nohz_balance_kick; | 467 | unsigned char nohz_balance_kick; |
463 | #endif | 468 | #endif |
464 | unsigned int skip_clock_update; | 469 | int skip_clock_update; |
465 | 470 | ||
466 | /* capture load from *all* tasks on this cpu: */ | 471 | /* capture load from *all* tasks on this cpu: */ |
467 | struct load_weight load; | 472 | struct load_weight load; |
@@ -554,6 +559,10 @@ struct rq { | |||
554 | unsigned int ttwu_count; | 559 | unsigned int ttwu_count; |
555 | unsigned int ttwu_local; | 560 | unsigned int ttwu_local; |
556 | #endif | 561 | #endif |
562 | |||
563 | #ifdef CONFIG_SMP | ||
564 | struct task_struct *wake_list; | ||
565 | #endif | ||
557 | }; | 566 | }; |
558 | 567 | ||
559 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 568 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
@@ -572,7 +581,7 @@ static inline int cpu_of(struct rq *rq) | |||
572 | 581 | ||
573 | #define rcu_dereference_check_sched_domain(p) \ | 582 | #define rcu_dereference_check_sched_domain(p) \ |
574 | rcu_dereference_check((p), \ | 583 | rcu_dereference_check((p), \ |
575 | rcu_read_lock_sched_held() || \ | 584 | rcu_read_lock_held() || \ |
576 | lockdep_is_held(&sched_domains_mutex)) | 585 | lockdep_is_held(&sched_domains_mutex)) |
577 | 586 | ||
578 | /* | 587 | /* |
@@ -597,7 +606,7 @@ static inline int cpu_of(struct rq *rq) | |||
597 | * Return the group to which this tasks belongs. | 606 | * Return the group to which this tasks belongs. |
598 | * | 607 | * |
599 | * We use task_subsys_state_check() and extend the RCU verification | 608 | * We use task_subsys_state_check() and extend the RCU verification |
600 | * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() | 609 | * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach() |
601 | * holds that lock for each task it moves into the cgroup. Therefore | 610 | * holds that lock for each task it moves into the cgroup. Therefore |
602 | * by holding that lock, we pin the task to the current cgroup. | 611 | * by holding that lock, we pin the task to the current cgroup. |
603 | */ | 612 | */ |
@@ -606,11 +615,8 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
606 | struct task_group *tg; | 615 | struct task_group *tg; |
607 | struct cgroup_subsys_state *css; | 616 | struct cgroup_subsys_state *css; |
608 | 617 | ||
609 | if (p->flags & PF_EXITING) | ||
610 | return &root_task_group; | ||
611 | |||
612 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, | 618 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, |
613 | lockdep_is_held(&task_rq(p)->lock)); | 619 | lockdep_is_held(&p->pi_lock)); |
614 | tg = container_of(css, struct task_group, css); | 620 | tg = container_of(css, struct task_group, css); |
615 | 621 | ||
616 | return autogroup_task_group(p, tg); | 622 | return autogroup_task_group(p, tg); |
@@ -646,7 +652,7 @@ static void update_rq_clock(struct rq *rq) | |||
646 | { | 652 | { |
647 | s64 delta; | 653 | s64 delta; |
648 | 654 | ||
649 | if (rq->skip_clock_update) | 655 | if (rq->skip_clock_update > 0) |
650 | return; | 656 | return; |
651 | 657 | ||
652 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; | 658 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; |
@@ -664,10 +670,9 @@ static void update_rq_clock(struct rq *rq) | |||
664 | #endif | 670 | #endif |
665 | 671 | ||
666 | /** | 672 | /** |
667 | * runqueue_is_locked | 673 | * runqueue_is_locked - Returns true if the current cpu runqueue is locked |
668 | * @cpu: the processor in question. | 674 | * @cpu: the processor in question. |
669 | * | 675 | * |
670 | * Returns true if the current cpu runqueue is locked. | ||
671 | * This interface allows printk to be called with the runqueue lock | 676 | * This interface allows printk to be called with the runqueue lock |
672 | * held and know whether or not it is OK to wake up the klogd. | 677 | * held and know whether or not it is OK to wake up the klogd. |
673 | */ | 678 | */ |
@@ -843,18 +848,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p) | |||
843 | return rq->curr == p; | 848 | return rq->curr == p; |
844 | } | 849 | } |
845 | 850 | ||
846 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
847 | static inline int task_running(struct rq *rq, struct task_struct *p) | 851 | static inline int task_running(struct rq *rq, struct task_struct *p) |
848 | { | 852 | { |
853 | #ifdef CONFIG_SMP | ||
854 | return p->on_cpu; | ||
855 | #else | ||
849 | return task_current(rq, p); | 856 | return task_current(rq, p); |
857 | #endif | ||
850 | } | 858 | } |
851 | 859 | ||
860 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
852 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 861 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
853 | { | 862 | { |
863 | #ifdef CONFIG_SMP | ||
864 | /* | ||
865 | * We can optimise this out completely for !SMP, because the | ||
866 | * SMP rebalancing from interrupt is the only thing that cares | ||
867 | * here. | ||
868 | */ | ||
869 | next->on_cpu = 1; | ||
870 | #endif | ||
854 | } | 871 | } |
855 | 872 | ||
856 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | 873 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
857 | { | 874 | { |
875 | #ifdef CONFIG_SMP | ||
876 | /* | ||
877 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | ||
878 | * We must ensure this doesn't happen until the switch is completely | ||
879 | * finished. | ||
880 | */ | ||
881 | smp_wmb(); | ||
882 | prev->on_cpu = 0; | ||
883 | #endif | ||
858 | #ifdef CONFIG_DEBUG_SPINLOCK | 884 | #ifdef CONFIG_DEBUG_SPINLOCK |
859 | /* this is a valid case when another task releases the spinlock */ | 885 | /* this is a valid case when another task releases the spinlock */ |
860 | rq->lock.owner = current; | 886 | rq->lock.owner = current; |
@@ -870,15 +896,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
870 | } | 896 | } |
871 | 897 | ||
872 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | 898 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ |
873 | static inline int task_running(struct rq *rq, struct task_struct *p) | ||
874 | { | ||
875 | #ifdef CONFIG_SMP | ||
876 | return p->oncpu; | ||
877 | #else | ||
878 | return task_current(rq, p); | ||
879 | #endif | ||
880 | } | ||
881 | |||
882 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 899 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
883 | { | 900 | { |
884 | #ifdef CONFIG_SMP | 901 | #ifdef CONFIG_SMP |
@@ -887,7 +904,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | |||
887 | * SMP rebalancing from interrupt is the only thing that cares | 904 | * SMP rebalancing from interrupt is the only thing that cares |
888 | * here. | 905 | * here. |
889 | */ | 906 | */ |
890 | next->oncpu = 1; | 907 | next->on_cpu = 1; |
891 | #endif | 908 | #endif |
892 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 909 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
893 | raw_spin_unlock_irq(&rq->lock); | 910 | raw_spin_unlock_irq(&rq->lock); |
@@ -900,12 +917,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
900 | { | 917 | { |
901 | #ifdef CONFIG_SMP | 918 | #ifdef CONFIG_SMP |
902 | /* | 919 | /* |
903 | * After ->oncpu is cleared, the task can be moved to a different CPU. | 920 | * After ->on_cpu is cleared, the task can be moved to a different CPU. |
904 | * We must ensure this doesn't happen until the switch is completely | 921 | * We must ensure this doesn't happen until the switch is completely |
905 | * finished. | 922 | * finished. |
906 | */ | 923 | */ |
907 | smp_wmb(); | 924 | smp_wmb(); |
908 | prev->oncpu = 0; | 925 | prev->on_cpu = 0; |
909 | #endif | 926 | #endif |
910 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 927 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
911 | local_irq_enable(); | 928 | local_irq_enable(); |
@@ -914,23 +931,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
914 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 931 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
915 | 932 | ||
916 | /* | 933 | /* |
917 | * Check whether the task is waking, we use this to synchronize ->cpus_allowed | 934 | * __task_rq_lock - lock the rq @p resides on. |
918 | * against ttwu(). | ||
919 | */ | ||
920 | static inline int task_is_waking(struct task_struct *p) | ||
921 | { | ||
922 | return unlikely(p->state == TASK_WAKING); | ||
923 | } | ||
924 | |||
925 | /* | ||
926 | * __task_rq_lock - lock the runqueue a given task resides on. | ||
927 | * Must be called interrupts disabled. | ||
928 | */ | 935 | */ |
929 | static inline struct rq *__task_rq_lock(struct task_struct *p) | 936 | static inline struct rq *__task_rq_lock(struct task_struct *p) |
930 | __acquires(rq->lock) | 937 | __acquires(rq->lock) |
931 | { | 938 | { |
932 | struct rq *rq; | 939 | struct rq *rq; |
933 | 940 | ||
941 | lockdep_assert_held(&p->pi_lock); | ||
942 | |||
934 | for (;;) { | 943 | for (;;) { |
935 | rq = task_rq(p); | 944 | rq = task_rq(p); |
936 | raw_spin_lock(&rq->lock); | 945 | raw_spin_lock(&rq->lock); |
@@ -941,22 +950,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) | |||
941 | } | 950 | } |
942 | 951 | ||
943 | /* | 952 | /* |
944 | * task_rq_lock - lock the runqueue a given task resides on and disable | 953 | * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. |
945 | * interrupts. Note the ordering: we can safely lookup the task_rq without | ||
946 | * explicitly disabling preemption. | ||
947 | */ | 954 | */ |
948 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | 955 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) |
956 | __acquires(p->pi_lock) | ||
949 | __acquires(rq->lock) | 957 | __acquires(rq->lock) |
950 | { | 958 | { |
951 | struct rq *rq; | 959 | struct rq *rq; |
952 | 960 | ||
953 | for (;;) { | 961 | for (;;) { |
954 | local_irq_save(*flags); | 962 | raw_spin_lock_irqsave(&p->pi_lock, *flags); |
955 | rq = task_rq(p); | 963 | rq = task_rq(p); |
956 | raw_spin_lock(&rq->lock); | 964 | raw_spin_lock(&rq->lock); |
957 | if (likely(rq == task_rq(p))) | 965 | if (likely(rq == task_rq(p))) |
958 | return rq; | 966 | return rq; |
959 | raw_spin_unlock_irqrestore(&rq->lock, *flags); | 967 | raw_spin_unlock(&rq->lock); |
968 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | ||
960 | } | 969 | } |
961 | } | 970 | } |
962 | 971 | ||
@@ -966,10 +975,13 @@ static void __task_rq_unlock(struct rq *rq) | |||
966 | raw_spin_unlock(&rq->lock); | 975 | raw_spin_unlock(&rq->lock); |
967 | } | 976 | } |
968 | 977 | ||
969 | static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) | 978 | static inline void |
979 | task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) | ||
970 | __releases(rq->lock) | 980 | __releases(rq->lock) |
981 | __releases(p->pi_lock) | ||
971 | { | 982 | { |
972 | raw_spin_unlock_irqrestore(&rq->lock, *flags); | 983 | raw_spin_unlock(&rq->lock); |
984 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | ||
973 | } | 985 | } |
974 | 986 | ||
975 | /* | 987 | /* |
@@ -1198,11 +1210,17 @@ int get_nohz_timer_target(void) | |||
1198 | int i; | 1210 | int i; |
1199 | struct sched_domain *sd; | 1211 | struct sched_domain *sd; |
1200 | 1212 | ||
1213 | rcu_read_lock(); | ||
1201 | for_each_domain(cpu, sd) { | 1214 | for_each_domain(cpu, sd) { |
1202 | for_each_cpu(i, sched_domain_span(sd)) | 1215 | for_each_cpu(i, sched_domain_span(sd)) { |
1203 | if (!idle_cpu(i)) | 1216 | if (!idle_cpu(i)) { |
1204 | return i; | 1217 | cpu = i; |
1218 | goto unlock; | ||
1219 | } | ||
1220 | } | ||
1205 | } | 1221 | } |
1222 | unlock: | ||
1223 | rcu_read_unlock(); | ||
1206 | return cpu; | 1224 | return cpu; |
1207 | } | 1225 | } |
1208 | /* | 1226 | /* |
@@ -1312,15 +1330,27 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, | |||
1312 | { | 1330 | { |
1313 | u64 tmp; | 1331 | u64 tmp; |
1314 | 1332 | ||
1333 | /* | ||
1334 | * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched | ||
1335 | * entities since MIN_SHARES = 2. Treat weight as 1 if less than | ||
1336 | * 2^SCHED_LOAD_RESOLUTION. | ||
1337 | */ | ||
1338 | if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) | ||
1339 | tmp = (u64)delta_exec * scale_load_down(weight); | ||
1340 | else | ||
1341 | tmp = (u64)delta_exec; | ||
1342 | |||
1315 | if (!lw->inv_weight) { | 1343 | if (!lw->inv_weight) { |
1316 | if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) | 1344 | unsigned long w = scale_load_down(lw->weight); |
1345 | |||
1346 | if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) | ||
1317 | lw->inv_weight = 1; | 1347 | lw->inv_weight = 1; |
1348 | else if (unlikely(!w)) | ||
1349 | lw->inv_weight = WMULT_CONST; | ||
1318 | else | 1350 | else |
1319 | lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) | 1351 | lw->inv_weight = WMULT_CONST / w; |
1320 | / (lw->weight+1); | ||
1321 | } | 1352 | } |
1322 | 1353 | ||
1323 | tmp = (u64)delta_exec * weight; | ||
1324 | /* | 1354 | /* |
1325 | * Check whether we'd overflow the 64-bit multiplication: | 1355 | * Check whether we'd overflow the 64-bit multiplication: |
1326 | */ | 1356 | */ |
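
calc_delta_mine() still computes delta_exec * weight / lw->weight; the change is that both weights are first brought back to user resolution with scale_load_down() so the intermediate products stay within 64 bits. A worked sketch of the same arithmetic outside the kernel, assuming the 64-bit WMULT_CONST of 1 << 32 and omitting the split-shift (SRR()) overflow handling of the real code:

#include <stdint.h>

#define WMULT_CONST	(1ULL << 32)	/* assumed 64-bit value */
#define WMULT_SHIFT	32

/* delta_exec * weight / lw_weight, as a 32.32 fixed-point multiply */
static uint64_t calc_delta_sketch(uint64_t delta_exec,
				  unsigned long weight,	   /* scale_load_down()'d */
				  unsigned long lw_weight) /* scale_load_down()'d */
{
	uint64_t inv = lw_weight ? WMULT_CONST / lw_weight : WMULT_CONST;
	/* the real code splits the shift (SRR()) instead of using 128 bits */
	unsigned __int128 tmp = (unsigned __int128)delta_exec * weight * inv;

	return (uint64_t)(tmp >> WMULT_SHIFT);
}

For example, with lw_weight = 1024 + 335 (a nice-0 and a nice-5 entity on the queue) and weight = 1024, a 1 ms delta_exec yields roughly 753 us, i.e. the nice-0 entity's proportional share.
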
@@ -1686,6 +1716,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | |||
1686 | __release(rq2->lock); | 1716 | __release(rq2->lock); |
1687 | } | 1717 | } |
1688 | 1718 | ||
1719 | #else /* CONFIG_SMP */ | ||
1720 | |||
1721 | /* | ||
1722 | * double_rq_lock - safely lock two runqueues | ||
1723 | * | ||
1724 | * Note this does not disable interrupts like task_rq_lock, | ||
1725 | * you need to do so manually before calling. | ||
1726 | */ | ||
1727 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
1728 | __acquires(rq1->lock) | ||
1729 | __acquires(rq2->lock) | ||
1730 | { | ||
1731 | BUG_ON(!irqs_disabled()); | ||
1732 | BUG_ON(rq1 != rq2); | ||
1733 | raw_spin_lock(&rq1->lock); | ||
1734 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
1735 | } | ||
1736 | |||
1737 | /* | ||
1738 | * double_rq_unlock - safely unlock two runqueues | ||
1739 | * | ||
1740 | * Note this does not restore interrupts like task_rq_unlock, | ||
1741 | * you need to do so manually after calling. | ||
1742 | */ | ||
1743 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
1744 | __releases(rq1->lock) | ||
1745 | __releases(rq2->lock) | ||
1746 | { | ||
1747 | BUG_ON(rq1 != rq2); | ||
1748 | raw_spin_unlock(&rq1->lock); | ||
1749 | __release(rq2->lock); | ||
1750 | } | ||
1751 | |||
1689 | #endif | 1752 | #endif |
1690 | 1753 | ||
1691 | static void calc_load_account_idle(struct rq *this_rq); | 1754 | static void calc_load_account_idle(struct rq *this_rq); |
@@ -1727,17 +1790,20 @@ static void dec_nr_running(struct rq *rq) | |||
1727 | 1790 | ||
1728 | static void set_load_weight(struct task_struct *p) | 1791 | static void set_load_weight(struct task_struct *p) |
1729 | { | 1792 | { |
1793 | int prio = p->static_prio - MAX_RT_PRIO; | ||
1794 | struct load_weight *load = &p->se.load; | ||
1795 | |||
1730 | /* | 1796 | /* |
1731 | * SCHED_IDLE tasks get minimal weight: | 1797 | * SCHED_IDLE tasks get minimal weight: |
1732 | */ | 1798 | */ |
1733 | if (p->policy == SCHED_IDLE) { | 1799 | if (p->policy == SCHED_IDLE) { |
1734 | p->se.load.weight = WEIGHT_IDLEPRIO; | 1800 | load->weight = scale_load(WEIGHT_IDLEPRIO); |
1735 | p->se.load.inv_weight = WMULT_IDLEPRIO; | 1801 | load->inv_weight = WMULT_IDLEPRIO; |
1736 | return; | 1802 | return; |
1737 | } | 1803 | } |
1738 | 1804 | ||
1739 | p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; | 1805 | load->weight = scale_load(prio_to_weight[prio]); |
1740 | p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; | 1806 | load->inv_weight = prio_to_wmult[prio]; |
1741 | } | 1807 | } |
1742 | 1808 | ||
1743 | static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | 1809 | static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) |
@@ -1745,7 +1811,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | |||
1745 | update_rq_clock(rq); | 1811 | update_rq_clock(rq); |
1746 | sched_info_queued(p); | 1812 | sched_info_queued(p); |
1747 | p->sched_class->enqueue_task(rq, p, flags); | 1813 | p->sched_class->enqueue_task(rq, p, flags); |
1748 | p->se.on_rq = 1; | ||
1749 | } | 1814 | } |
1750 | 1815 | ||
1751 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | 1816 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) |
@@ -1753,7 +1818,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | |||
1753 | update_rq_clock(rq); | 1818 | update_rq_clock(rq); |
1754 | sched_info_dequeued(p); | 1819 | sched_info_dequeued(p); |
1755 | p->sched_class->dequeue_task(rq, p, flags); | 1820 | p->sched_class->dequeue_task(rq, p, flags); |
1756 | p->se.on_rq = 0; | ||
1757 | } | 1821 | } |
1758 | 1822 | ||
1759 | /* | 1823 | /* |
@@ -1880,7 +1944,7 @@ void account_system_vtime(struct task_struct *curr) | |||
1880 | */ | 1944 | */ |
1881 | if (hardirq_count()) | 1945 | if (hardirq_count()) |
1882 | __this_cpu_add(cpu_hardirq_time, delta); | 1946 | __this_cpu_add(cpu_hardirq_time, delta); |
1883 | else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) | 1947 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) |
1884 | __this_cpu_add(cpu_softirq_time, delta); | 1948 | __this_cpu_add(cpu_softirq_time, delta); |
1885 | 1949 | ||
1886 | irq_time_write_end(); | 1950 | irq_time_write_end(); |
@@ -1920,8 +1984,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) | |||
1920 | sched_rt_avg_update(rq, irq_delta); | 1984 | sched_rt_avg_update(rq, irq_delta); |
1921 | } | 1985 | } |
1922 | 1986 | ||
1987 | static int irqtime_account_hi_update(void) | ||
1988 | { | ||
1989 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
1990 | unsigned long flags; | ||
1991 | u64 latest_ns; | ||
1992 | int ret = 0; | ||
1993 | |||
1994 | local_irq_save(flags); | ||
1995 | latest_ns = this_cpu_read(cpu_hardirq_time); | ||
1996 | if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) | ||
1997 | ret = 1; | ||
1998 | local_irq_restore(flags); | ||
1999 | return ret; | ||
2000 | } | ||
2001 | |||
2002 | static int irqtime_account_si_update(void) | ||
2003 | { | ||
2004 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
2005 | unsigned long flags; | ||
2006 | u64 latest_ns; | ||
2007 | int ret = 0; | ||
2008 | |||
2009 | local_irq_save(flags); | ||
2010 | latest_ns = this_cpu_read(cpu_softirq_time); | ||
2011 | if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) | ||
2012 | ret = 1; | ||
2013 | local_irq_restore(flags); | ||
2014 | return ret; | ||
2015 | } | ||
2016 | |||
1923 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | 2017 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
1924 | 2018 | ||
2019 | #define sched_clock_irqtime (0) | ||
2020 | |||
1925 | static void update_rq_clock_task(struct rq *rq, s64 delta) | 2021 | static void update_rq_clock_task(struct rq *rq, s64 delta) |
1926 | { | 2022 | { |
1927 | rq->clock_task += delta; | 2023 | rq->clock_task += delta; |
@@ -2025,14 +2121,14 @@ inline int task_curr(const struct task_struct *p) | |||
2025 | 2121 | ||
2026 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, | 2122 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, |
2027 | const struct sched_class *prev_class, | 2123 | const struct sched_class *prev_class, |
2028 | int oldprio, int running) | 2124 | int oldprio) |
2029 | { | 2125 | { |
2030 | if (prev_class != p->sched_class) { | 2126 | if (prev_class != p->sched_class) { |
2031 | if (prev_class->switched_from) | 2127 | if (prev_class->switched_from) |
2032 | prev_class->switched_from(rq, p, running); | 2128 | prev_class->switched_from(rq, p); |
2033 | p->sched_class->switched_to(rq, p, running); | 2129 | p->sched_class->switched_to(rq, p); |
2034 | } else | 2130 | } else if (oldprio != p->prio) |
2035 | p->sched_class->prio_changed(rq, p, oldprio, running); | 2131 | p->sched_class->prio_changed(rq, p, oldprio); |
2036 | } | 2132 | } |
2037 | 2133 | ||
2038 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | 2134 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) |
@@ -2056,7 +2152,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
2056 | * A queue event has occurred, and we're going to schedule. In | 2152 | * A queue event has occurred, and we're going to schedule. In |
2057 | * this case, we can save a useless back to back clock update. | 2153 | * this case, we can save a useless back to back clock update. |
2058 | */ | 2154 | */ |
2059 | if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) | 2155 | if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) |
2060 | rq->skip_clock_update = 1; | 2156 | rq->skip_clock_update = 1; |
2061 | } | 2157 | } |
2062 | 2158 | ||
@@ -2102,6 +2198,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
2102 | */ | 2198 | */ |
2103 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && | 2199 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && |
2104 | !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); | 2200 | !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); |
2201 | |||
2202 | #ifdef CONFIG_LOCKDEP | ||
2203 | WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || | ||
2204 | lockdep_is_held(&task_rq(p)->lock))); | ||
2205 | #endif | ||
2105 | #endif | 2206 | #endif |
2106 | 2207 | ||
2107 | trace_sched_migrate_task(p, new_cpu); | 2208 | trace_sched_migrate_task(p, new_cpu); |
@@ -2122,19 +2223,6 @@ struct migration_arg { | |||
2122 | static int migration_cpu_stop(void *data); | 2223 | static int migration_cpu_stop(void *data); |
2123 | 2224 | ||
2124 | /* | 2225 | /* |
2125 | * The task's runqueue lock must be held. | ||
2126 | * Returns true if you have to wait for migration thread. | ||
2127 | */ | ||
2128 | static bool migrate_task(struct task_struct *p, struct rq *rq) | ||
2129 | { | ||
2130 | /* | ||
2131 | * If the task is not on a runqueue (and not running), then | ||
2132 | * the next wake-up will properly place the task. | ||
2133 | */ | ||
2134 | return p->se.on_rq || task_running(rq, p); | ||
2135 | } | ||
2136 | |||
2137 | /* | ||
2138 | * wait_task_inactive - wait for a thread to unschedule. | 2226 | * wait_task_inactive - wait for a thread to unschedule. |
2139 | * | 2227 | * |
2140 | * If @match_state is nonzero, it's the @p->state value just checked and | 2228 | * If @match_state is nonzero, it's the @p->state value just checked and |
@@ -2191,11 +2279,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
2191 | rq = task_rq_lock(p, &flags); | 2279 | rq = task_rq_lock(p, &flags); |
2192 | trace_sched_wait_task(p); | 2280 | trace_sched_wait_task(p); |
2193 | running = task_running(rq, p); | 2281 | running = task_running(rq, p); |
2194 | on_rq = p->se.on_rq; | 2282 | on_rq = p->on_rq; |
2195 | ncsw = 0; | 2283 | ncsw = 0; |
2196 | if (!match_state || p->state == match_state) | 2284 | if (!match_state || p->state == match_state) |
2197 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ | 2285 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ |
2198 | task_rq_unlock(rq, &flags); | 2286 | task_rq_unlock(rq, p, &flags); |
2199 | 2287 | ||
2200 | /* | 2288 | /* |
2201 | * If it changed from the expected state, bail out now. | 2289 | * If it changed from the expected state, bail out now. |
@@ -2224,7 +2312,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
2224 | * yield - it could be a while. | 2312 | * yield - it could be a while. |
2225 | */ | 2313 | */ |
2226 | if (unlikely(on_rq)) { | 2314 | if (unlikely(on_rq)) { |
2227 | schedule_timeout_uninterruptible(1); | 2315 | ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); |
2316 | |||
2317 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
2318 | schedule_hrtimeout(&to, HRTIMER_MODE_REL); | ||
2228 | continue; | 2319 | continue; |
2229 | } | 2320 | } |
2230 | 2321 | ||
@@ -2246,7 +2337,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
2246 | * Cause a process which is running on another CPU to enter | 2337 | * Cause a process which is running on another CPU to enter |
2247 | * kernel-mode, without any delay. (to get signals handled.) | 2338 | * kernel-mode, without any delay. (to get signals handled.) |
2248 | * | 2339 | * |
2249 | * NOTE: this function doesnt have to take the runqueue lock, | 2340 | * NOTE: this function doesn't have to take the runqueue lock, |
2250 | * because all it wants to ensure is that the remote task enters | 2341 | * because all it wants to ensure is that the remote task enters |
2251 | * the kernel. If the IPI races and the task has been migrated | 2342 | * the kernel. If the IPI races and the task has been migrated |
2252 | * to another CPU then no harm is done and the purpose has been | 2343 | * to another CPU then no harm is done and the purpose has been |
@@ -2265,30 +2356,9 @@ void kick_process(struct task_struct *p) | |||
2265 | EXPORT_SYMBOL_GPL(kick_process); | 2356 | EXPORT_SYMBOL_GPL(kick_process); |
2266 | #endif /* CONFIG_SMP */ | 2357 | #endif /* CONFIG_SMP */ |
2267 | 2358 | ||
2268 | /** | ||
2269 | * task_oncpu_function_call - call a function on the cpu on which a task runs | ||
2270 | * @p: the task to evaluate | ||
2271 | * @func: the function to be called | ||
2272 | * @info: the function call argument | ||
2273 | * | ||
2274 | * Calls the function @func when the task is currently running. This might | ||
2275 | * be on the current CPU, which just calls the function directly | ||
2276 | */ | ||
2277 | void task_oncpu_function_call(struct task_struct *p, | ||
2278 | void (*func) (void *info), void *info) | ||
2279 | { | ||
2280 | int cpu; | ||
2281 | |||
2282 | preempt_disable(); | ||
2283 | cpu = task_cpu(p); | ||
2284 | if (task_curr(p)) | ||
2285 | smp_call_function_single(cpu, func, info, 1); | ||
2286 | preempt_enable(); | ||
2287 | } | ||
2288 | |||
2289 | #ifdef CONFIG_SMP | 2359 | #ifdef CONFIG_SMP |
2290 | /* | 2360 | /* |
2291 | * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. | 2361 | * ->cpus_allowed is protected by both rq->lock and p->pi_lock |
2292 | */ | 2362 | */ |
2293 | static int select_fallback_rq(int cpu, struct task_struct *p) | 2363 | static int select_fallback_rq(int cpu, struct task_struct *p) |
2294 | { | 2364 | { |
@@ -2321,12 +2391,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
2321 | } | 2391 | } |
2322 | 2392 | ||
2323 | /* | 2393 | /* |
2324 | * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. | 2394 | * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. |
2325 | */ | 2395 | */ |
2326 | static inline | 2396 | static inline |
2327 | int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) | 2397 | int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) |
2328 | { | 2398 | { |
2329 | int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); | 2399 | int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); |
2330 | 2400 | ||
2331 | /* | 2401 | /* |
2332 | * In order not to call set_task_cpu() on a blocking task we need | 2402 | * In order not to call set_task_cpu() on a blocking task we need |
@@ -2352,27 +2422,62 @@ static void update_avg(u64 *avg, u64 sample) | |||
2352 | } | 2422 | } |
2353 | #endif | 2423 | #endif |
2354 | 2424 | ||
2355 | static inline void ttwu_activate(struct task_struct *p, struct rq *rq, | 2425 | static void |
2356 | bool is_sync, bool is_migrate, bool is_local, | 2426 | ttwu_stat(struct task_struct *p, int cpu, int wake_flags) |
2357 | unsigned long en_flags) | ||
2358 | { | 2427 | { |
2428 | #ifdef CONFIG_SCHEDSTATS | ||
2429 | struct rq *rq = this_rq(); | ||
2430 | |||
2431 | #ifdef CONFIG_SMP | ||
2432 | int this_cpu = smp_processor_id(); | ||
2433 | |||
2434 | if (cpu == this_cpu) { | ||
2435 | schedstat_inc(rq, ttwu_local); | ||
2436 | schedstat_inc(p, se.statistics.nr_wakeups_local); | ||
2437 | } else { | ||
2438 | struct sched_domain *sd; | ||
2439 | |||
2440 | schedstat_inc(p, se.statistics.nr_wakeups_remote); | ||
2441 | rcu_read_lock(); | ||
2442 | for_each_domain(this_cpu, sd) { | ||
2443 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
2444 | schedstat_inc(sd, ttwu_wake_remote); | ||
2445 | break; | ||
2446 | } | ||
2447 | } | ||
2448 | rcu_read_unlock(); | ||
2449 | } | ||
2450 | #endif /* CONFIG_SMP */ | ||
2451 | |||
2452 | schedstat_inc(rq, ttwu_count); | ||
2359 | schedstat_inc(p, se.statistics.nr_wakeups); | 2453 | schedstat_inc(p, se.statistics.nr_wakeups); |
2360 | if (is_sync) | 2454 | |
2455 | if (wake_flags & WF_SYNC) | ||
2361 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | 2456 | schedstat_inc(p, se.statistics.nr_wakeups_sync); |
2362 | if (is_migrate) | 2457 | |
2458 | if (cpu != task_cpu(p)) | ||
2363 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | 2459 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); |
2364 | if (is_local) | ||
2365 | schedstat_inc(p, se.statistics.nr_wakeups_local); | ||
2366 | else | ||
2367 | schedstat_inc(p, se.statistics.nr_wakeups_remote); | ||
2368 | 2460 | ||
2461 | #endif /* CONFIG_SCHEDSTATS */ | ||
2462 | } | ||
2463 | |||
2464 | static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) | ||
2465 | { | ||
2369 | activate_task(rq, p, en_flags); | 2466 | activate_task(rq, p, en_flags); |
2467 | p->on_rq = 1; | ||
2468 | |||
2469 | /* if a worker is waking up, notify workqueue */ | ||
2470 | if (p->flags & PF_WQ_WORKER) | ||
2471 | wq_worker_waking_up(p, cpu_of(rq)); | ||
2370 | } | 2472 | } |
2371 | 2473 | ||
2372 | static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | 2474 | /* |
2373 | int wake_flags, bool success) | 2475 | * Mark the task runnable and perform wakeup-preemption. |
2476 | */ | ||
2477 | static void | ||
2478 | ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) | ||
2374 | { | 2479 | { |
2375 | trace_sched_wakeup(p, success); | 2480 | trace_sched_wakeup(p, true); |
2376 | check_preempt_curr(rq, p, wake_flags); | 2481 | check_preempt_curr(rq, p, wake_flags); |
2377 | 2482 | ||
2378 | p->state = TASK_RUNNING; | 2483 | p->state = TASK_RUNNING; |
@@ -2391,9 +2496,99 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | |||
2391 | rq->idle_stamp = 0; | 2496 | rq->idle_stamp = 0; |
2392 | } | 2497 | } |
2393 | #endif | 2498 | #endif |
2394 | /* if a worker is waking up, notify workqueue */ | 2499 | } |
2395 | if ((p->flags & PF_WQ_WORKER) && success) | 2500 | |
2396 | wq_worker_waking_up(p, cpu_of(rq)); | 2501 | static void |
2502 | ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) | ||
2503 | { | ||
2504 | #ifdef CONFIG_SMP | ||
2505 | if (p->sched_contributes_to_load) | ||
2506 | rq->nr_uninterruptible--; | ||
2507 | #endif | ||
2508 | |||
2509 | ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); | ||
2510 | ttwu_do_wakeup(rq, p, wake_flags); | ||
2511 | } | ||
2512 | |||
2513 | /* | ||
2514 | * Called in case the task @p isn't fully descheduled from its runqueue, | ||
2515 | * in this case we must do a remote wakeup. Its a 'light' wakeup though, | ||
2516 | * since all we need to do is flip p->state to TASK_RUNNING, since | ||
2517 | * the task is still ->on_rq. | ||
2518 | */ | ||
2519 | static int ttwu_remote(struct task_struct *p, int wake_flags) | ||
2520 | { | ||
2521 | struct rq *rq; | ||
2522 | int ret = 0; | ||
2523 | |||
2524 | rq = __task_rq_lock(p); | ||
2525 | if (p->on_rq) { | ||
2526 | ttwu_do_wakeup(rq, p, wake_flags); | ||
2527 | ret = 1; | ||
2528 | } | ||
2529 | __task_rq_unlock(rq); | ||
2530 | |||
2531 | return ret; | ||
2532 | } | ||
2533 | |||
2534 | #ifdef CONFIG_SMP | ||
2535 | static void sched_ttwu_pending(void) | ||
2536 | { | ||
2537 | struct rq *rq = this_rq(); | ||
2538 | struct task_struct *list = xchg(&rq->wake_list, NULL); | ||
2539 | |||
2540 | if (!list) | ||
2541 | return; | ||
2542 | |||
2543 | raw_spin_lock(&rq->lock); | ||
2544 | |||
2545 | while (list) { | ||
2546 | struct task_struct *p = list; | ||
2547 | list = list->wake_entry; | ||
2548 | ttwu_do_activate(rq, p, 0); | ||
2549 | } | ||
2550 | |||
2551 | raw_spin_unlock(&rq->lock); | ||
2552 | } | ||
2553 | |||
2554 | void scheduler_ipi(void) | ||
2555 | { | ||
2556 | sched_ttwu_pending(); | ||
2557 | } | ||
2558 | |||
2559 | static void ttwu_queue_remote(struct task_struct *p, int cpu) | ||
2560 | { | ||
2561 | struct rq *rq = cpu_rq(cpu); | ||
2562 | struct task_struct *next = rq->wake_list; | ||
2563 | |||
2564 | for (;;) { | ||
2565 | struct task_struct *old = next; | ||
2566 | |||
2567 | p->wake_entry = next; | ||
2568 | next = cmpxchg(&rq->wake_list, old, p); | ||
2569 | if (next == old) | ||
2570 | break; | ||
2571 | } | ||
2572 | |||
2573 | if (!next) | ||
2574 | smp_send_reschedule(cpu); | ||
2575 | } | ||
2576 | #endif | ||
2577 | |||
2578 | static void ttwu_queue(struct task_struct *p, int cpu) | ||
2579 | { | ||
2580 | struct rq *rq = cpu_rq(cpu); | ||
2581 | |||
2582 | #if defined(CONFIG_SMP) | ||
2583 | if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { | ||
2584 | ttwu_queue_remote(p, cpu); | ||
2585 | return; | ||
2586 | } | ||
2587 | #endif | ||
2588 | |||
2589 | raw_spin_lock(&rq->lock); | ||
2590 | ttwu_do_activate(rq, p, 0); | ||
2591 | raw_spin_unlock(&rq->lock); | ||
2397 | } | 2592 | } |
2398 | 2593 | ||
2399 | /** | 2594 | /** |
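
ttwu_queue_remote() and sched_ttwu_pending() implement a tiny lock-free queue: wakers push the task onto rq->wake_list with cmpxchg() and only send the reschedule IPI if the list was previously empty, while the target CPU detaches the whole list at once with xchg() and activates each entry under its own rq->lock. A reduced userspace sketch of the same push/drain idiom, using GCC's __atomic builtins purely for illustration (none of these names are kernel API):

struct node {				/* stands in for task_struct::wake_entry */
	struct node *next;
};

static struct node *wake_list;		/* stands in for rq->wake_list */

/* Waker side: push one node; returns 1 if the remote CPU needs a kick. */
static int push(struct node *n)
{
	struct node *old = __atomic_load_n(&wake_list, __ATOMIC_RELAXED);

	do {
		n->next = old;
	} while (!__atomic_compare_exchange_n(&wake_list, &old, n, 0,
					      __ATOMIC_RELEASE, __ATOMIC_RELAXED));

	return old == NULL;
}

/* Target CPU (IPI handler): detach everything at once, then walk it. */
static void drain(void (*activate)(struct node *))
{
	struct node *n = __atomic_exchange_n(&wake_list, NULL, __ATOMIC_ACQUIRE);

	while (n) {
		struct node *next = n->next;

		activate(n);		/* the kernel does this under rq->lock */
		n = next;
	}
}

Sending the IPI only when the list transitions from empty to non-empty keeps the remote-wakeup cost to at most one interrupt per batch of wakeups.
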
@@ -2411,92 +2606,64 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | |||
2411 | * Returns %true if @p was woken up, %false if it was already running | 2606 | * Returns %true if @p was woken up, %false if it was already running |
2412 | * or @state didn't match @p's state. | 2607 | * or @state didn't match @p's state. |
2413 | */ | 2608 | */ |
2414 | static int try_to_wake_up(struct task_struct *p, unsigned int state, | 2609 | static int |
2415 | int wake_flags) | 2610 | try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) |
2416 | { | 2611 | { |
2417 | int cpu, orig_cpu, this_cpu, success = 0; | ||
2418 | unsigned long flags; | 2612 | unsigned long flags; |
2419 | unsigned long en_flags = ENQUEUE_WAKEUP; | 2613 | int cpu, success = 0; |
2420 | struct rq *rq; | ||
2421 | |||
2422 | this_cpu = get_cpu(); | ||
2423 | 2614 | ||
2424 | smp_wmb(); | 2615 | smp_wmb(); |
2425 | rq = task_rq_lock(p, &flags); | 2616 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2426 | if (!(p->state & state)) | 2617 | if (!(p->state & state)) |
2427 | goto out; | 2618 | goto out; |
2428 | 2619 | ||
2429 | if (p->se.on_rq) | 2620 | success = 1; /* we're going to change ->state */ |
2430 | goto out_running; | ||
2431 | |||
2432 | cpu = task_cpu(p); | 2621 | cpu = task_cpu(p); |
2433 | orig_cpu = cpu; | ||
2434 | 2622 | ||
2435 | #ifdef CONFIG_SMP | 2623 | if (p->on_rq && ttwu_remote(p, wake_flags)) |
2436 | if (unlikely(task_running(rq, p))) | 2624 | goto stat; |
2437 | goto out_activate; | ||
2438 | 2625 | ||
2626 | #ifdef CONFIG_SMP | ||
2439 | /* | 2627 | /* |
2440 | * In order to handle concurrent wakeups and release the rq->lock | 2628 | * If the owning (remote) cpu is still in the middle of schedule() with |
2441 | * we put the task in TASK_WAKING state. | 2629 | * this task as prev, wait until its done referencing the task. |
2442 | * | ||
2443 | * First fix up the nr_uninterruptible count: | ||
2444 | */ | 2630 | */ |
2445 | if (task_contributes_to_load(p)) { | 2631 | while (p->on_cpu) { |
2446 | if (likely(cpu_online(orig_cpu))) | 2632 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
2447 | rq->nr_uninterruptible--; | 2633 | /* |
2448 | else | 2634 | * If called from interrupt context we could have landed in the |
2449 | this_rq()->nr_uninterruptible--; | 2635 | * middle of schedule(), in this case we should take care not |
2450 | } | 2636 | * to spin on ->on_cpu if p is current, since that would |
2451 | p->state = TASK_WAKING; | 2637 | * deadlock. |
2452 | 2638 | */ | |
2453 | if (p->sched_class->task_waking) { | 2639 | if (p == current) { |
2454 | p->sched_class->task_waking(rq, p); | 2640 | ttwu_queue(p, cpu); |
2455 | en_flags |= ENQUEUE_WAKING; | 2641 | goto stat; |
2642 | } | ||
2643 | #endif | ||
2644 | cpu_relax(); | ||
2456 | } | 2645 | } |
2457 | |||
2458 | cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); | ||
2459 | if (cpu != orig_cpu) | ||
2460 | set_task_cpu(p, cpu); | ||
2461 | __task_rq_unlock(rq); | ||
2462 | |||
2463 | rq = cpu_rq(cpu); | ||
2464 | raw_spin_lock(&rq->lock); | ||
2465 | |||
2466 | /* | 2646 | /* |
2467 | * We migrated the task without holding either rq->lock, however | 2647 | * Pairs with the smp_wmb() in finish_lock_switch(). |
2468 | * since the task is not on the task list itself, nobody else | ||
2469 | * will try and migrate the task, hence the rq should match the | ||
2470 | * cpu we just moved it to. | ||
2471 | */ | 2648 | */ |
2472 | WARN_ON(task_cpu(p) != cpu); | 2649 | smp_rmb(); |
2473 | WARN_ON(p->state != TASK_WAKING); | ||
2474 | 2650 | ||
2475 | #ifdef CONFIG_SCHEDSTATS | 2651 | p->sched_contributes_to_load = !!task_contributes_to_load(p); |
2476 | schedstat_inc(rq, ttwu_count); | 2652 | p->state = TASK_WAKING; |
2477 | if (cpu == this_cpu) | ||
2478 | schedstat_inc(rq, ttwu_local); | ||
2479 | else { | ||
2480 | struct sched_domain *sd; | ||
2481 | for_each_domain(this_cpu, sd) { | ||
2482 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
2483 | schedstat_inc(sd, ttwu_wake_remote); | ||
2484 | break; | ||
2485 | } | ||
2486 | } | ||
2487 | } | ||
2488 | #endif /* CONFIG_SCHEDSTATS */ | ||
2489 | 2653 | ||
2490 | out_activate: | 2654 | if (p->sched_class->task_waking) |
2655 | p->sched_class->task_waking(p); | ||
2656 | |||
2657 | cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); | ||
2658 | if (task_cpu(p) != cpu) | ||
2659 | set_task_cpu(p, cpu); | ||
2491 | #endif /* CONFIG_SMP */ | 2660 | #endif /* CONFIG_SMP */ |
2492 | ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, | 2661 | |
2493 | cpu == this_cpu, en_flags); | 2662 | ttwu_queue(p, cpu); |
2494 | success = 1; | 2663 | stat: |
2495 | out_running: | 2664 | ttwu_stat(p, cpu, wake_flags); |
2496 | ttwu_post_activation(p, rq, wake_flags, success); | ||
2497 | out: | 2665 | out: |
2498 | task_rq_unlock(rq, &flags); | 2666 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
2499 | put_cpu(); | ||
2500 | 2667 | ||
2501 | return success; | 2668 | return success; |
2502 | } | 2669 | } |
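
Instead of holding the old runqueue's lock across the wakeup, the rewritten try_to_wake_up() relies on the ->on_cpu handshake: finish_lock_switch() clears prev->on_cpu behind an smp_wmb(), and the waker spins until ->on_cpu drops, then issues smp_rmb() before reading anything schedule() wrote. A hedged model of that handshake using C11 atomics, for illustration only (the structure and field names are invented):

#include <stdatomic.h>
#include <stdbool.h>

struct task_model {
	atomic_bool on_cpu;
	int state;			/* stands in for what schedule() writes */
};

/* CPU switching away from @prev, i.e. finish_lock_switch(): */
static void publish_descheduled(struct task_model *prev)
{
	prev->state = 0;
	/* release ~ smp_wmb(): order the write above before clearing on_cpu */
	atomic_store_explicit(&prev->on_cpu, false, memory_order_release);
}

/* Waking CPU, i.e. the spin loop in try_to_wake_up(): */
static int wait_for_deschedule(struct task_model *p)
{
	while (atomic_load_explicit(&p->on_cpu, memory_order_relaxed))
		;			/* cpu_relax() in the kernel */
	/* acquire fence ~ smp_rmb(): schedule()'s writes are now visible */
	atomic_thread_fence(memory_order_acquire);
	return p->state;
}

The __ARCH_WANT_INTERRUPTS_ON_CTXSW special case in the hunk above exists because an interrupt taken in the middle of schedule() may try to wake the very task being switched out on that CPU; spinning on its ->on_cpu from that interrupt would never terminate, so the wakeup is queued to the local list instead.
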
@@ -2505,31 +2672,34 @@ out: | |||
2505 | * try_to_wake_up_local - try to wake up a local task with rq lock held | 2672 | * try_to_wake_up_local - try to wake up a local task with rq lock held |
2506 | * @p: the thread to be awakened | 2673 | * @p: the thread to be awakened |
2507 | * | 2674 | * |
2508 | * Put @p on the run-queue if it's not already there. The caller must | 2675 | * Put @p on the run-queue if it's not already there. The caller must |
2509 | * ensure that this_rq() is locked, @p is bound to this_rq() and not | 2676 | * ensure that this_rq() is locked, @p is bound to this_rq() and not |
2510 | * the current task. this_rq() stays locked over invocation. | 2677 | * the current task. |
2511 | */ | 2678 | */ |
2512 | static void try_to_wake_up_local(struct task_struct *p) | 2679 | static void try_to_wake_up_local(struct task_struct *p) |
2513 | { | 2680 | { |
2514 | struct rq *rq = task_rq(p); | 2681 | struct rq *rq = task_rq(p); |
2515 | bool success = false; | ||
2516 | 2682 | ||
2517 | BUG_ON(rq != this_rq()); | 2683 | BUG_ON(rq != this_rq()); |
2518 | BUG_ON(p == current); | 2684 | BUG_ON(p == current); |
2519 | lockdep_assert_held(&rq->lock); | 2685 | lockdep_assert_held(&rq->lock); |
2520 | 2686 | ||
2687 | if (!raw_spin_trylock(&p->pi_lock)) { | ||
2688 | raw_spin_unlock(&rq->lock); | ||
2689 | raw_spin_lock(&p->pi_lock); | ||
2690 | raw_spin_lock(&rq->lock); | ||
2691 | } | ||
2692 | |||
2521 | if (!(p->state & TASK_NORMAL)) | 2693 | if (!(p->state & TASK_NORMAL)) |
2522 | return; | 2694 | goto out; |
2523 | 2695 | ||
2524 | if (!p->se.on_rq) { | 2696 | if (!p->on_rq) |
2525 | if (likely(!task_running(rq, p))) { | 2697 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); |
2526 | schedstat_inc(rq, ttwu_count); | 2698 | |
2527 | schedstat_inc(rq, ttwu_local); | 2699 | ttwu_do_wakeup(rq, p, 0); |
2528 | } | 2700 | ttwu_stat(p, smp_processor_id(), 0); |
2529 | ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); | 2701 | out: |
2530 | success = true; | 2702 | raw_spin_unlock(&p->pi_lock); |
2531 | } | ||
2532 | ttwu_post_activation(p, rq, 0, success); | ||
2533 | } | 2703 | } |
2534 | 2704 | ||
2535 | /** | 2705 | /** |
@@ -2562,18 +2732,21 @@ int wake_up_state(struct task_struct *p, unsigned int state) | |||
2562 | */ | 2732 | */ |
2563 | static void __sched_fork(struct task_struct *p) | 2733 | static void __sched_fork(struct task_struct *p) |
2564 | { | 2734 | { |
2735 | p->on_rq = 0; | ||
2736 | |||
2737 | p->se.on_rq = 0; | ||
2565 | p->se.exec_start = 0; | 2738 | p->se.exec_start = 0; |
2566 | p->se.sum_exec_runtime = 0; | 2739 | p->se.sum_exec_runtime = 0; |
2567 | p->se.prev_sum_exec_runtime = 0; | 2740 | p->se.prev_sum_exec_runtime = 0; |
2568 | p->se.nr_migrations = 0; | 2741 | p->se.nr_migrations = 0; |
2742 | p->se.vruntime = 0; | ||
2743 | INIT_LIST_HEAD(&p->se.group_node); | ||
2569 | 2744 | ||
2570 | #ifdef CONFIG_SCHEDSTATS | 2745 | #ifdef CONFIG_SCHEDSTATS |
2571 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 2746 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
2572 | #endif | 2747 | #endif |
2573 | 2748 | ||
2574 | INIT_LIST_HEAD(&p->rt.run_list); | 2749 | INIT_LIST_HEAD(&p->rt.run_list); |
2575 | p->se.on_rq = 0; | ||
2576 | INIT_LIST_HEAD(&p->se.group_node); | ||
2577 | 2750 | ||
2578 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2751 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
2579 | INIT_HLIST_HEAD(&p->preempt_notifiers); | 2752 | INIT_HLIST_HEAD(&p->preempt_notifiers); |
@@ -2583,8 +2756,9 @@ static void __sched_fork(struct task_struct *p) | |||
2583 | /* | 2756 | /* |
2584 | * fork()/clone()-time setup: | 2757 | * fork()/clone()-time setup: |
2585 | */ | 2758 | */ |
2586 | void sched_fork(struct task_struct *p, int clone_flags) | 2759 | void sched_fork(struct task_struct *p) |
2587 | { | 2760 | { |
2761 | unsigned long flags; | ||
2588 | int cpu = get_cpu(); | 2762 | int cpu = get_cpu(); |
2589 | 2763 | ||
2590 | __sched_fork(p); | 2764 | __sched_fork(p); |
@@ -2635,16 +2809,16 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2635 | * | 2809 | * |
2636 | * Silence PROVE_RCU. | 2810 | * Silence PROVE_RCU. |
2637 | */ | 2811 | */ |
2638 | rcu_read_lock(); | 2812 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2639 | set_task_cpu(p, cpu); | 2813 | set_task_cpu(p, cpu); |
2640 | rcu_read_unlock(); | 2814 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
2641 | 2815 | ||
2642 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 2816 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
2643 | if (likely(sched_info_on())) | 2817 | if (likely(sched_info_on())) |
2644 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 2818 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
2645 | #endif | 2819 | #endif |
2646 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 2820 | #if defined(CONFIG_SMP) |
2647 | p->oncpu = 0; | 2821 | p->on_cpu = 0; |
2648 | #endif | 2822 | #endif |
2649 | #ifdef CONFIG_PREEMPT | 2823 | #ifdef CONFIG_PREEMPT |
2650 | /* Want to start with kernel preemption disabled. */ | 2824 | /* Want to start with kernel preemption disabled. */ |
@@ -2664,41 +2838,31 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2664 | * that must be done for every newly created context, then puts the task | 2838 | * that must be done for every newly created context, then puts the task |
2665 | * on the runqueue and wakes it. | 2839 | * on the runqueue and wakes it. |
2666 | */ | 2840 | */ |
2667 | void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | 2841 | void wake_up_new_task(struct task_struct *p) |
2668 | { | 2842 | { |
2669 | unsigned long flags; | 2843 | unsigned long flags; |
2670 | struct rq *rq; | 2844 | struct rq *rq; |
2671 | int cpu __maybe_unused = get_cpu(); | ||
2672 | 2845 | ||
2846 | raw_spin_lock_irqsave(&p->pi_lock, flags); | ||
2673 | #ifdef CONFIG_SMP | 2847 | #ifdef CONFIG_SMP |
2674 | rq = task_rq_lock(p, &flags); | ||
2675 | p->state = TASK_WAKING; | ||
2676 | |||
2677 | /* | 2848 | /* |
2678 | * Fork balancing, do it here and not earlier because: | 2849 | * Fork balancing, do it here and not earlier because: |
2679 | * - cpus_allowed can change in the fork path | 2850 | * - cpus_allowed can change in the fork path |
2680 | * - any previously selected cpu might disappear through hotplug | 2851 | * - any previously selected cpu might disappear through hotplug |
2681 | * | ||
2682 | * We set TASK_WAKING so that select_task_rq() can drop rq->lock | ||
2683 | * without people poking at ->cpus_allowed. | ||
2684 | */ | 2852 | */ |
2685 | cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); | 2853 | set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); |
2686 | set_task_cpu(p, cpu); | ||
2687 | |||
2688 | p->state = TASK_RUNNING; | ||
2689 | task_rq_unlock(rq, &flags); | ||
2690 | #endif | 2854 | #endif |
2691 | 2855 | ||
2692 | rq = task_rq_lock(p, &flags); | 2856 | rq = __task_rq_lock(p); |
2693 | activate_task(rq, p, 0); | 2857 | activate_task(rq, p, 0); |
2694 | trace_sched_wakeup_new(p, 1); | 2858 | p->on_rq = 1; |
2859 | trace_sched_wakeup_new(p, true); | ||
2695 | check_preempt_curr(rq, p, WF_FORK); | 2860 | check_preempt_curr(rq, p, WF_FORK); |
2696 | #ifdef CONFIG_SMP | 2861 | #ifdef CONFIG_SMP |
2697 | if (p->sched_class->task_woken) | 2862 | if (p->sched_class->task_woken) |
2698 | p->sched_class->task_woken(rq, p); | 2863 | p->sched_class->task_woken(rq, p); |
2699 | #endif | 2864 | #endif |
2700 | task_rq_unlock(rq, &flags); | 2865 | task_rq_unlock(rq, p, &flags); |
2701 | put_cpu(); | ||
2702 | } | 2866 | } |
2703 | 2867 | ||
2704 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2868 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
@@ -2776,9 +2940,12 @@ static inline void | |||
2776 | prepare_task_switch(struct rq *rq, struct task_struct *prev, | 2940 | prepare_task_switch(struct rq *rq, struct task_struct *prev, |
2777 | struct task_struct *next) | 2941 | struct task_struct *next) |
2778 | { | 2942 | { |
2943 | sched_info_switch(prev, next); | ||
2944 | perf_event_task_sched_out(prev, next); | ||
2779 | fire_sched_out_preempt_notifiers(prev, next); | 2945 | fire_sched_out_preempt_notifiers(prev, next); |
2780 | prepare_lock_switch(rq, next); | 2946 | prepare_lock_switch(rq, next); |
2781 | prepare_arch_switch(next); | 2947 | prepare_arch_switch(next); |
2948 | trace_sched_switch(prev, next); | ||
2782 | } | 2949 | } |
2783 | 2950 | ||
2784 | /** | 2951 | /** |
@@ -2911,7 +3078,7 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2911 | struct mm_struct *mm, *oldmm; | 3078 | struct mm_struct *mm, *oldmm; |
2912 | 3079 | ||
2913 | prepare_task_switch(rq, prev, next); | 3080 | prepare_task_switch(rq, prev, next); |
2914 | trace_sched_switch(prev, next); | 3081 | |
2915 | mm = next->mm; | 3082 | mm = next->mm; |
2916 | oldmm = prev->active_mm; | 3083 | oldmm = prev->active_mm; |
2917 | /* | 3084 | /* |
@@ -3404,27 +3571,22 @@ void sched_exec(void) | |||
3404 | { | 3571 | { |
3405 | struct task_struct *p = current; | 3572 | struct task_struct *p = current; |
3406 | unsigned long flags; | 3573 | unsigned long flags; |
3407 | struct rq *rq; | ||
3408 | int dest_cpu; | 3574 | int dest_cpu; |
3409 | 3575 | ||
3410 | rq = task_rq_lock(p, &flags); | 3576 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
3411 | dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); | 3577 | dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); |
3412 | if (dest_cpu == smp_processor_id()) | 3578 | if (dest_cpu == smp_processor_id()) |
3413 | goto unlock; | 3579 | goto unlock; |
3414 | 3580 | ||
3415 | /* | 3581 | if (likely(cpu_active(dest_cpu))) { |
3416 | * select_task_rq() can race against ->cpus_allowed | ||
3417 | */ | ||
3418 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && | ||
3419 | likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) { | ||
3420 | struct migration_arg arg = { p, dest_cpu }; | 3582 | struct migration_arg arg = { p, dest_cpu }; |
3421 | 3583 | ||
3422 | task_rq_unlock(rq, &flags); | 3584 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
3423 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); | 3585 | stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); |
3424 | return; | 3586 | return; |
3425 | } | 3587 | } |
3426 | unlock: | 3588 | unlock: |
3427 | task_rq_unlock(rq, &flags); | 3589 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
3428 | } | 3590 | } |
3429 | 3591 | ||
3430 | #endif | 3592 | #endif |
@@ -3461,7 +3623,7 @@ unsigned long long task_delta_exec(struct task_struct *p) | |||
3461 | 3623 | ||
3462 | rq = task_rq_lock(p, &flags); | 3624 | rq = task_rq_lock(p, &flags); |
3463 | ns = do_task_delta_exec(p, rq); | 3625 | ns = do_task_delta_exec(p, rq); |
3464 | task_rq_unlock(rq, &flags); | 3626 | task_rq_unlock(rq, p, &flags); |
3465 | 3627 | ||
3466 | return ns; | 3628 | return ns; |
3467 | } | 3629 | } |
@@ -3479,7 +3641,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
3479 | 3641 | ||
3480 | rq = task_rq_lock(p, &flags); | 3642 | rq = task_rq_lock(p, &flags); |
3481 | ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); | 3643 | ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); |
3482 | task_rq_unlock(rq, &flags); | 3644 | task_rq_unlock(rq, p, &flags); |
3483 | 3645 | ||
3484 | return ns; | 3646 | return ns; |
3485 | } | 3647 | } |
@@ -3503,7 +3665,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p) | |||
3503 | rq = task_rq_lock(p, &flags); | 3665 | rq = task_rq_lock(p, &flags); |
3504 | thread_group_cputime(p, &totals); | 3666 | thread_group_cputime(p, &totals); |
3505 | ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); | 3667 | ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); |
3506 | task_rq_unlock(rq, &flags); | 3668 | task_rq_unlock(rq, p, &flags); |
3507 | 3669 | ||
3508 | return ns; | 3670 | return ns; |
3509 | } | 3671 | } |
@@ -3568,6 +3730,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, | |||
3568 | } | 3730 | } |
3569 | 3731 | ||
3570 | /* | 3732 | /* |
3733 | * Account system cpu time to a process and desired cpustat field | ||
3734 | * @p: the process that the cpu time gets accounted to | ||
3735 | * @cputime: the cpu time spent in kernel space since the last update | ||
3736 | * @cputime_scaled: cputime scaled by cpu frequency | ||
3737 | * @target_cputime64: pointer to cpustat field that has to be updated | ||
3738 | */ | ||
3739 | static inline | ||
3740 | void __account_system_time(struct task_struct *p, cputime_t cputime, | ||
3741 | cputime_t cputime_scaled, cputime64_t *target_cputime64) | ||
3742 | { | ||
3743 | cputime64_t tmp = cputime_to_cputime64(cputime); | ||
3744 | |||
3745 | /* Add system time to process. */ | ||
3746 | p->stime = cputime_add(p->stime, cputime); | ||
3747 | p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); | ||
3748 | account_group_system_time(p, cputime); | ||
3749 | |||
3750 | /* Add system time to cpustat. */ | ||
3751 | *target_cputime64 = cputime64_add(*target_cputime64, tmp); | ||
3752 | cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); | ||
3753 | |||
3754 | /* Account for system time used */ | ||
3755 | acct_update_integrals(p); | ||
3756 | } | ||
3757 | |||
3758 | /* | ||
3571 | * Account system cpu time to a process. | 3759 | * Account system cpu time to a process. |
3572 | * @p: the process that the cpu time gets accounted to | 3760 | * @p: the process that the cpu time gets accounted to |
3573 | * @hardirq_offset: the offset to subtract from hardirq_count() | 3761 | * @hardirq_offset: the offset to subtract from hardirq_count() |
@@ -3578,36 +3766,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
3578 | cputime_t cputime, cputime_t cputime_scaled) | 3766 | cputime_t cputime, cputime_t cputime_scaled) |
3579 | { | 3767 | { |
3580 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3768 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
3581 | cputime64_t tmp; | 3769 | cputime64_t *target_cputime64; |
3582 | 3770 | ||
3583 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { | 3771 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { |
3584 | account_guest_time(p, cputime, cputime_scaled); | 3772 | account_guest_time(p, cputime, cputime_scaled); |
3585 | return; | 3773 | return; |
3586 | } | 3774 | } |
3587 | 3775 | ||
3588 | /* Add system time to process. */ | ||
3589 | p->stime = cputime_add(p->stime, cputime); | ||
3590 | p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); | ||
3591 | account_group_system_time(p, cputime); | ||
3592 | |||
3593 | /* Add system time to cpustat. */ | ||
3594 | tmp = cputime_to_cputime64(cputime); | ||
3595 | if (hardirq_count() - hardirq_offset) | 3776 | if (hardirq_count() - hardirq_offset) |
3596 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | 3777 | target_cputime64 = &cpustat->irq; |
3597 | else if (in_serving_softirq()) | 3778 | else if (in_serving_softirq()) |
3598 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | 3779 | target_cputime64 = &cpustat->softirq; |
3599 | else | 3780 | else |
3600 | cpustat->system = cputime64_add(cpustat->system, tmp); | 3781 | target_cputime64 = &cpustat->system; |
3601 | 3782 | ||
3602 | cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); | 3783 | __account_system_time(p, cputime, cputime_scaled, target_cputime64); |
3603 | |||
3604 | /* Account for system time used */ | ||
3605 | acct_update_integrals(p); | ||
3606 | } | 3784 | } |
3607 | 3785 | ||
3608 | /* | 3786 | /* |
3609 | * Account for involuntary wait time. | 3787 | * Account for involuntary wait time. |
3610 | * @steal: the cpu time spent in involuntary wait | 3788 | * @cputime: the cpu time spent in involuntary wait |
3611 | */ | 3789 | */ |
3612 | void account_steal_time(cputime_t cputime) | 3790 | void account_steal_time(cputime_t cputime) |
3613 | { | 3791 | { |
@@ -3635,6 +3813,73 @@ void account_idle_time(cputime_t cputime) | |||
3635 | 3813 | ||
3636 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 3814 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
3637 | 3815 | ||
3816 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
3817 | /* | ||
3818 | * Account a tick to a process and cpustat | ||
3819 | * @p: the process that the cpu time gets accounted to | ||
3820 | * @user_tick: is the tick from userspace | ||
3821 | * @rq: the pointer to rq | ||
3822 | * | ||
3823 | * Tick demultiplexing follows the order | ||
3824 | * - pending hardirq update | ||
3825 | * - pending softirq update | ||
3826 | * - user_time | ||
3827 | * - idle_time | ||
3828 | * - system time | ||
3829 | * - check for guest_time | ||
3830 | * - else account as system_time | ||
3831 | * | ||
3832 | * Check for hardirq is done both for system and user time as there is | ||
3833 | * no timer going off while we are on hardirq and hence we may never get an | ||
3834 | * opportunity to update it solely in system time. | ||
3835 | * p->stime and friends are only updated on system time and not on irq | ||
3836 | * softirq as those do not count in task exec_runtime any more. | ||
3837 | */ | ||
3838 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
3839 | struct rq *rq) | ||
3840 | { | ||
3841 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | ||
3842 | cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); | ||
3843 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
3844 | |||
3845 | if (irqtime_account_hi_update()) { | ||
3846 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | ||
3847 | } else if (irqtime_account_si_update()) { | ||
3848 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | ||
3849 | } else if (this_cpu_ksoftirqd() == p) { | ||
3850 | /* | ||
3851 | * ksoftirqd time do not get accounted in cpu_softirq_time. | ||
3852 | * So, we have to handle it separately here. | ||
3853 | * Also, p->stime needs to be updated for ksoftirqd. | ||
3854 | */ | ||
3855 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
3856 | &cpustat->softirq); | ||
3857 | } else if (user_tick) { | ||
3858 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
3859 | } else if (p == rq->idle) { | ||
3860 | account_idle_time(cputime_one_jiffy); | ||
3861 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ | ||
3862 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
3863 | } else { | ||
3864 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
3865 | &cpustat->system); | ||
3866 | } | ||
3867 | } | ||
3868 | |||
3869 | static void irqtime_account_idle_ticks(int ticks) | ||
3870 | { | ||
3871 | int i; | ||
3872 | struct rq *rq = this_rq(); | ||
3873 | |||
3874 | for (i = 0; i < ticks; i++) | ||
3875 | irqtime_account_process_tick(current, 0, rq); | ||
3876 | } | ||
3877 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
3878 | static void irqtime_account_idle_ticks(int ticks) {} | ||
3879 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
3880 | struct rq *rq) {} | ||
3881 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
3882 | |||
3638 | /* | 3883 | /* |
3639 | * Account a single tick of cpu time. | 3884 | * Account a single tick of cpu time. |
3640 | * @p: the process that the cpu time gets accounted to | 3885 | * @p: the process that the cpu time gets accounted to |
@@ -3645,6 +3890,11 @@ void account_process_tick(struct task_struct *p, int user_tick) | |||
3645 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | 3890 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); |
3646 | struct rq *rq = this_rq(); | 3891 | struct rq *rq = this_rq(); |
3647 | 3892 | ||
3893 | if (sched_clock_irqtime) { | ||
3894 | irqtime_account_process_tick(p, user_tick, rq); | ||
3895 | return; | ||
3896 | } | ||
3897 | |||
3648 | if (user_tick) | 3898 | if (user_tick) |
3649 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | 3899 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); |
3650 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | 3900 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) |
@@ -3670,6 +3920,12 @@ void account_steal_ticks(unsigned long ticks) | |||
3670 | */ | 3920 | */ |
3671 | void account_idle_ticks(unsigned long ticks) | 3921 | void account_idle_ticks(unsigned long ticks) |
3672 | { | 3922 | { |
3923 | |||
3924 | if (sched_clock_irqtime) { | ||
3925 | irqtime_account_idle_ticks(ticks); | ||
3926 | return; | ||
3927 | } | ||
3928 | |||
3673 | account_idle_time(jiffies_to_cputime(ticks)); | 3929 | account_idle_time(jiffies_to_cputime(ticks)); |
3674 | } | 3930 | } |
3675 | 3931 | ||
@@ -3763,9 +4019,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
3763 | /* | 4019 | /* |
3764 | * This function gets called by the timer code, with HZ frequency. | 4020 | * This function gets called by the timer code, with HZ frequency. |
3765 | * We call it with interrupts disabled. | 4021 | * We call it with interrupts disabled. |
3766 | * | ||
3767 | * It also gets called by the fork code, when changing the parent's | ||
3768 | * timeslices. | ||
3769 | */ | 4022 | */ |
3770 | void scheduler_tick(void) | 4023 | void scheduler_tick(void) |
3771 | { | 4024 | { |
@@ -3885,17 +4138,11 @@ static inline void schedule_debug(struct task_struct *prev) | |||
3885 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 4138 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
3886 | 4139 | ||
3887 | schedstat_inc(this_rq(), sched_count); | 4140 | schedstat_inc(this_rq(), sched_count); |
3888 | #ifdef CONFIG_SCHEDSTATS | ||
3889 | if (unlikely(prev->lock_depth >= 0)) { | ||
3890 | schedstat_inc(this_rq(), rq_sched_info.bkl_count); | ||
3891 | schedstat_inc(prev, sched_info.bkl_count); | ||
3892 | } | ||
3893 | #endif | ||
3894 | } | 4141 | } |
3895 | 4142 | ||
3896 | static void put_prev_task(struct rq *rq, struct task_struct *prev) | 4143 | static void put_prev_task(struct rq *rq, struct task_struct *prev) |
3897 | { | 4144 | { |
3898 | if (prev->se.on_rq) | 4145 | if (prev->on_rq || rq->skip_clock_update < 0) |
3899 | update_rq_clock(rq); | 4146 | update_rq_clock(rq); |
3900 | prev->sched_class->put_prev_task(rq, prev); | 4147 | prev->sched_class->put_prev_task(rq, prev); |
3901 | } | 4148 | } |
@@ -3945,9 +4192,6 @@ need_resched: | |||
3945 | rcu_note_context_switch(cpu); | 4192 | rcu_note_context_switch(cpu); |
3946 | prev = rq->curr; | 4193 | prev = rq->curr; |
3947 | 4194 | ||
3948 | release_kernel_lock(prev); | ||
3949 | need_resched_nonpreemptible: | ||
3950 | |||
3951 | schedule_debug(prev); | 4195 | schedule_debug(prev); |
3952 | 4196 | ||
3953 | if (sched_feat(HRTICK)) | 4197 | if (sched_feat(HRTICK)) |
@@ -3960,11 +4204,13 @@ need_resched_nonpreemptible: | |||
3960 | if (unlikely(signal_pending_state(prev->state, prev))) { | 4204 | if (unlikely(signal_pending_state(prev->state, prev))) { |
3961 | prev->state = TASK_RUNNING; | 4205 | prev->state = TASK_RUNNING; |
3962 | } else { | 4206 | } else { |
4207 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | ||
4208 | prev->on_rq = 0; | ||
4209 | |||
3963 | /* | 4210 | /* |
3964 | * If a worker is going to sleep, notify and | 4211 | * If a worker went to sleep, notify and ask workqueue |
3965 | * ask workqueue whether it wants to wake up a | 4212 | * whether it wants to wake up a task to maintain |
3966 | * task to maintain concurrency. If so, wake | 4213 | * concurrency. |
3967 | * up the task. | ||
3968 | */ | 4214 | */ |
3969 | if (prev->flags & PF_WQ_WORKER) { | 4215 | if (prev->flags & PF_WQ_WORKER) { |
3970 | struct task_struct *to_wakeup; | 4216 | struct task_struct *to_wakeup; |
@@ -3973,7 +4219,16 @@ need_resched_nonpreemptible: | |||
3973 | if (to_wakeup) | 4219 | if (to_wakeup) |
3974 | try_to_wake_up_local(to_wakeup); | 4220 | try_to_wake_up_local(to_wakeup); |
3975 | } | 4221 | } |
3976 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | 4222 | |
4223 | /* | ||
4224 | * If we are going to sleep and we have plugged IO | ||
4225 | * queued, make sure to submit it to avoid deadlocks. | ||
4226 | */ | ||
4227 | if (blk_needs_flush_plug(prev)) { | ||
4228 | raw_spin_unlock(&rq->lock); | ||
4229 | blk_schedule_flush_plug(prev); | ||
4230 | raw_spin_lock(&rq->lock); | ||
4231 | } | ||
3977 | } | 4232 | } |
3978 | switch_count = &prev->nvcsw; | 4233 | switch_count = &prev->nvcsw; |
3979 | } | 4234 | } |
@@ -3989,9 +4244,6 @@ need_resched_nonpreemptible: | |||
3989 | rq->skip_clock_update = 0; | 4244 | rq->skip_clock_update = 0; |
3990 | 4245 | ||
3991 | if (likely(prev != next)) { | 4246 | if (likely(prev != next)) { |
3992 | sched_info_switch(prev, next); | ||
3993 | perf_event_task_sched_out(prev, next); | ||
3994 | |||
3995 | rq->nr_switches++; | 4247 | rq->nr_switches++; |
3996 | rq->curr = next; | 4248 | rq->curr = next; |
3997 | ++*switch_count; | 4249 | ++*switch_count; |
@@ -4010,9 +4262,6 @@ need_resched_nonpreemptible: | |||
4010 | 4262 | ||
4011 | post_schedule(rq); | 4263 | post_schedule(rq); |
4012 | 4264 | ||
4013 | if (unlikely(reacquire_kernel_lock(prev))) | ||
4014 | goto need_resched_nonpreemptible; | ||
4015 | |||
4016 | preempt_enable_no_resched(); | 4265 | preempt_enable_no_resched(); |
4017 | if (need_resched()) | 4266 | if (need_resched()) |
4018 | goto need_resched; | 4267 | goto need_resched; |
@@ -4020,70 +4269,53 @@ need_resched_nonpreemptible: | |||
4020 | EXPORT_SYMBOL(schedule); | 4269 | EXPORT_SYMBOL(schedule); |
4021 | 4270 | ||
4022 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 4271 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
4023 | /* | ||
4024 | * Look out! "owner" is an entirely speculative pointer | ||
4025 | * access and not reliable. | ||
4026 | */ | ||
4027 | int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | ||
4028 | { | ||
4029 | unsigned int cpu; | ||
4030 | struct rq *rq; | ||
4031 | 4272 | ||
4032 | if (!sched_feat(OWNER_SPIN)) | 4273 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) |
4033 | return 0; | 4274 | { |
4275 | bool ret = false; | ||
4034 | 4276 | ||
4035 | #ifdef CONFIG_DEBUG_PAGEALLOC | 4277 | rcu_read_lock(); |
4036 | /* | 4278 | if (lock->owner != owner) |
4037 | * Need to access the cpu field knowing that | 4279 | goto fail; |
4038 | * DEBUG_PAGEALLOC could have unmapped it if | ||
4039 | * the mutex owner just released it and exited. | ||
4040 | */ | ||
4041 | if (probe_kernel_address(&owner->cpu, cpu)) | ||
4042 | return 0; | ||
4043 | #else | ||
4044 | cpu = owner->cpu; | ||
4045 | #endif | ||
4046 | 4280 | ||
4047 | /* | 4281 | /* |
4048 | * Even if the access succeeded (likely case), | 4282 | * Ensure we emit the owner->on_cpu, dereference _after_ checking |
4049 | * the cpu field may no longer be valid. | 4283 | * lock->owner still matches owner, if that fails, owner might |
4284 | * point to free()d memory, if it still matches, the rcu_read_lock() | ||
4285 | * ensures the memory stays valid. | ||
4050 | */ | 4286 | */ |
4051 | if (cpu >= nr_cpumask_bits) | 4287 | barrier(); |
4052 | return 0; | ||
4053 | 4288 | ||
4054 | /* | 4289 | ret = owner->on_cpu; |
4055 | * We need to validate that we can do a | 4290 | fail: |
4056 | * get_cpu() and that we have the percpu area. | 4291 | rcu_read_unlock(); |
4057 | */ | ||
4058 | if (!cpu_online(cpu)) | ||
4059 | return 0; | ||
4060 | 4292 | ||
4061 | rq = cpu_rq(cpu); | 4293 | return ret; |
4294 | } | ||
4062 | 4295 | ||
4063 | for (;;) { | 4296 | /* |
4064 | /* | 4297 | * Look out! "owner" is an entirely speculative pointer |
4065 | * Owner changed, break to re-assess state. | 4298 | * access and not reliable. |
4066 | */ | 4299 | */ |
4067 | if (lock->owner != owner) { | 4300 | int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) |
4068 | /* | 4301 | { |
4069 | * If the lock has switched to a different owner, | 4302 | if (!sched_feat(OWNER_SPIN)) |
4070 | * we likely have heavy contention. Return 0 to quit | 4303 | return 0; |
4071 | * optimistic spinning and not contend further: | ||
4072 | */ | ||
4073 | if (lock->owner) | ||
4074 | return 0; | ||
4075 | break; | ||
4076 | } | ||
4077 | 4304 | ||
4078 | /* | 4305 | while (owner_running(lock, owner)) { |
4079 | * Is that owner really running on that cpu? | 4306 | if (need_resched()) |
4080 | */ | ||
4081 | if (task_thread_info(rq->curr) != owner || need_resched()) | ||
4082 | return 0; | 4307 | return 0; |
4083 | 4308 | ||
4084 | arch_mutex_cpu_relax(); | 4309 | arch_mutex_cpu_relax(); |
4085 | } | 4310 | } |
4086 | 4311 | ||
4312 | /* | ||
4313 | * If the owner changed to another task there is likely | ||
4314 | * heavy contention, stop spinning. | ||
4315 | */ | ||
4316 | if (lock->owner) | ||
4317 | return 0; | ||
4318 | |||
4087 | return 1; | 4319 | return 1; |
4088 | } | 4320 | } |
4089 | #endif | 4321 | #endif |
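The rewritten spin path above splits the owner check into owner_running() and relies on RCU plus the barrier() to make the owner->on_cpu dereference safe. For context, a minimal sketch of how a mutex slow path is expected to drive mutex_spin_on_owner(): spin only while the current owner is on a CPU, otherwise fall back to blocking. mutex_try_lock_fast() and mutex_sleep_wait() are hypothetical stand-ins, not kernel APIs; only mutex_spin_on_owner() and lock->owner come from this patch.

/*
 * Illustrative only -- a simplified adaptive-spin loop in the style of
 * the real mutex slow path.
 */
static void mutex_lock_sketch(struct mutex *lock)
{
	for (;;) {
		struct task_struct *owner;

		if (mutex_try_lock_fast(lock))		/* hypothetical fast path */
			return;

		owner = ACCESS_ONCE(lock->owner);
		/*
		 * Spin only while the owner is actually running on a CPU;
		 * mutex_spin_on_owner() returns 0 once the owner stops
		 * running, the owner changes, or we need to reschedule.
		 */
		if (owner && !mutex_spin_on_owner(lock, owner))
			break;
	}

	mutex_sleep_wait(lock);				/* hypothetical: block */
}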
@@ -4213,6 +4445,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) | |||
4213 | { | 4445 | { |
4214 | __wake_up_common(q, mode, 1, 0, key); | 4446 | __wake_up_common(q, mode, 1, 0, key); |
4215 | } | 4447 | } |
4448 | EXPORT_SYMBOL_GPL(__wake_up_locked_key); | ||
4216 | 4449 | ||
4217 | /** | 4450 | /** |
4218 | * __wake_up_sync_key - wake up threads blocked on a waitqueue. | 4451 | * __wake_up_sync_key - wake up threads blocked on a waitqueue. |
@@ -4542,19 +4775,18 @@ EXPORT_SYMBOL(sleep_on_timeout); | |||
4542 | */ | 4775 | */ |
4543 | void rt_mutex_setprio(struct task_struct *p, int prio) | 4776 | void rt_mutex_setprio(struct task_struct *p, int prio) |
4544 | { | 4777 | { |
4545 | unsigned long flags; | ||
4546 | int oldprio, on_rq, running; | 4778 | int oldprio, on_rq, running; |
4547 | struct rq *rq; | 4779 | struct rq *rq; |
4548 | const struct sched_class *prev_class; | 4780 | const struct sched_class *prev_class; |
4549 | 4781 | ||
4550 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 4782 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
4551 | 4783 | ||
4552 | rq = task_rq_lock(p, &flags); | 4784 | rq = __task_rq_lock(p); |
4553 | 4785 | ||
4554 | trace_sched_pi_setprio(p, prio); | 4786 | trace_sched_pi_setprio(p, prio); |
4555 | oldprio = p->prio; | 4787 | oldprio = p->prio; |
4556 | prev_class = p->sched_class; | 4788 | prev_class = p->sched_class; |
4557 | on_rq = p->se.on_rq; | 4789 | on_rq = p->on_rq; |
4558 | running = task_current(rq, p); | 4790 | running = task_current(rq, p); |
4559 | if (on_rq) | 4791 | if (on_rq) |
4560 | dequeue_task(rq, p, 0); | 4792 | dequeue_task(rq, p, 0); |
@@ -4570,12 +4802,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4570 | 4802 | ||
4571 | if (running) | 4803 | if (running) |
4572 | p->sched_class->set_curr_task(rq); | 4804 | p->sched_class->set_curr_task(rq); |
4573 | if (on_rq) { | 4805 | if (on_rq) |
4574 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); | 4806 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); |
4575 | 4807 | ||
4576 | check_class_changed(rq, p, prev_class, oldprio, running); | 4808 | check_class_changed(rq, p, prev_class, oldprio); |
4577 | } | 4809 | __task_rq_unlock(rq); |
4578 | task_rq_unlock(rq, &flags); | ||
4579 | } | 4810 | } |
4580 | 4811 | ||
4581 | #endif | 4812 | #endif |
@@ -4603,7 +4834,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4603 | p->static_prio = NICE_TO_PRIO(nice); | 4834 | p->static_prio = NICE_TO_PRIO(nice); |
4604 | goto out_unlock; | 4835 | goto out_unlock; |
4605 | } | 4836 | } |
4606 | on_rq = p->se.on_rq; | 4837 | on_rq = p->on_rq; |
4607 | if (on_rq) | 4838 | if (on_rq) |
4608 | dequeue_task(rq, p, 0); | 4839 | dequeue_task(rq, p, 0); |
4609 | 4840 | ||
@@ -4623,7 +4854,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4623 | resched_task(rq->curr); | 4854 | resched_task(rq->curr); |
4624 | } | 4855 | } |
4625 | out_unlock: | 4856 | out_unlock: |
4626 | task_rq_unlock(rq, &flags); | 4857 | task_rq_unlock(rq, p, &flags); |
4627 | } | 4858 | } |
4628 | EXPORT_SYMBOL(set_user_nice); | 4859 | EXPORT_SYMBOL(set_user_nice); |
4629 | 4860 | ||
@@ -4737,8 +4968,6 @@ static struct task_struct *find_process_by_pid(pid_t pid) | |||
4737 | static void | 4968 | static void |
4738 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | 4969 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) |
4739 | { | 4970 | { |
4740 | BUG_ON(p->se.on_rq); | ||
4741 | |||
4742 | p->policy = policy; | 4971 | p->policy = policy; |
4743 | p->rt_priority = prio; | 4972 | p->rt_priority = prio; |
4744 | p->normal_prio = normal_prio(p); | 4973 | p->normal_prio = normal_prio(p); |
@@ -4761,8 +4990,11 @@ static bool check_same_owner(struct task_struct *p) | |||
4761 | 4990 | ||
4762 | rcu_read_lock(); | 4991 | rcu_read_lock(); |
4763 | pcred = __task_cred(p); | 4992 | pcred = __task_cred(p); |
4764 | match = (cred->euid == pcred->euid || | 4993 | if (cred->user->user_ns == pcred->user->user_ns) |
4765 | cred->euid == pcred->uid); | 4994 | match = (cred->euid == pcred->euid || |
4995 | cred->euid == pcred->uid); | ||
4996 | else | ||
4997 | match = false; | ||
4766 | rcu_read_unlock(); | 4998 | rcu_read_unlock(); |
4767 | return match; | 4999 | return match; |
4768 | } | 5000 | } |
@@ -4822,12 +5054,15 @@ recheck: | |||
4822 | param->sched_priority > rlim_rtprio) | 5054 | param->sched_priority > rlim_rtprio) |
4823 | return -EPERM; | 5055 | return -EPERM; |
4824 | } | 5056 | } |
5057 | |||
4825 | /* | 5058 | /* |
4826 | * Like positive nice levels, dont allow tasks to | 5059 | * Treat SCHED_IDLE as nice 20. Only allow a switch to |
4827 | * move out of SCHED_IDLE either: | 5060 | * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. |
4828 | */ | 5061 | */ |
4829 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) | 5062 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { |
4830 | return -EPERM; | 5063 | if (!can_nice(p, TASK_NICE(p))) |
5064 | return -EPERM; | ||
5065 | } | ||
4831 | 5066 | ||
4832 | /* can't change other user's priorities */ | 5067 | /* can't change other user's priorities */ |
4833 | if (!check_same_owner(p)) | 5068 | if (!check_same_owner(p)) |
@@ -4847,21 +5082,29 @@ recheck: | |||
4847 | /* | 5082 | /* |
4848 | * make sure no PI-waiters arrive (or leave) while we are | 5083 | * make sure no PI-waiters arrive (or leave) while we are |
4849 | * changing the priority of the task: | 5084 | * changing the priority of the task: |
4850 | */ | 5085 | * |
4851 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 5086 | * To be able to change p->policy safely, the appropriate |
4852 | /* | ||
4853 | * To be able to change p->policy safely, the apropriate | ||
4854 | * runqueue lock must be held. | 5087 | * runqueue lock must be held. |
4855 | */ | 5088 | */ |
4856 | rq = __task_rq_lock(p); | 5089 | rq = task_rq_lock(p, &flags); |
4857 | 5090 | ||
4858 | /* | 5091 | /* |
4859 | * Changing the policy of the stop threads its a very bad idea | 5092 | * Changing the policy of the stop threads its a very bad idea |
4860 | */ | 5093 | */ |
4861 | if (p == rq->stop) { | 5094 | if (p == rq->stop) { |
5095 | task_rq_unlock(rq, p, &flags); | ||
5096 | return -EINVAL; | ||
5097 | } | ||
5098 | |||
5099 | /* | ||
5100 | * If not changing anything there's no need to proceed further: | ||
5101 | */ | ||
5102 | if (unlikely(policy == p->policy && (!rt_policy(policy) || | ||
5103 | param->sched_priority == p->rt_priority))) { | ||
5104 | |||
4862 | __task_rq_unlock(rq); | 5105 | __task_rq_unlock(rq); |
4863 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 5106 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
4864 | return -EINVAL; | 5107 | return 0; |
4865 | } | 5108 | } |
4866 | 5109 | ||
4867 | #ifdef CONFIG_RT_GROUP_SCHED | 5110 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -4873,8 +5116,7 @@ recheck: | |||
4873 | if (rt_bandwidth_enabled() && rt_policy(policy) && | 5116 | if (rt_bandwidth_enabled() && rt_policy(policy) && |
4874 | task_group(p)->rt_bandwidth.rt_runtime == 0 && | 5117 | task_group(p)->rt_bandwidth.rt_runtime == 0 && |
4875 | !task_group_is_autogroup(task_group(p))) { | 5118 | !task_group_is_autogroup(task_group(p))) { |
4876 | __task_rq_unlock(rq); | 5119 | task_rq_unlock(rq, p, &flags); |
4877 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
4878 | return -EPERM; | 5120 | return -EPERM; |
4879 | } | 5121 | } |
4880 | } | 5122 | } |
@@ -4883,11 +5125,10 @@ recheck: | |||
4883 | /* recheck policy now with rq lock held */ | 5125 | /* recheck policy now with rq lock held */ |
4884 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 5126 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
4885 | policy = oldpolicy = -1; | 5127 | policy = oldpolicy = -1; |
4886 | __task_rq_unlock(rq); | 5128 | task_rq_unlock(rq, p, &flags); |
4887 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
4888 | goto recheck; | 5129 | goto recheck; |
4889 | } | 5130 | } |
4890 | on_rq = p->se.on_rq; | 5131 | on_rq = p->on_rq; |
4891 | running = task_current(rq, p); | 5132 | running = task_current(rq, p); |
4892 | if (on_rq) | 5133 | if (on_rq) |
4893 | deactivate_task(rq, p, 0); | 5134 | deactivate_task(rq, p, 0); |
@@ -4902,13 +5143,11 @@ recheck: | |||
4902 | 5143 | ||
4903 | if (running) | 5144 | if (running) |
4904 | p->sched_class->set_curr_task(rq); | 5145 | p->sched_class->set_curr_task(rq); |
4905 | if (on_rq) { | 5146 | if (on_rq) |
4906 | activate_task(rq, p, 0); | 5147 | activate_task(rq, p, 0); |
4907 | 5148 | ||
4908 | check_class_changed(rq, p, prev_class, oldprio, running); | 5149 | check_class_changed(rq, p, prev_class, oldprio); |
4909 | } | 5150 | task_rq_unlock(rq, p, &flags); |
4910 | __task_rq_unlock(rq); | ||
4911 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
4912 | 5151 | ||
4913 | rt_mutex_adjust_pi(p); | 5152 | rt_mutex_adjust_pi(p); |
4914 | 5153 | ||
@@ -5088,7 +5327,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
5088 | goto out_free_cpus_allowed; | 5327 | goto out_free_cpus_allowed; |
5089 | } | 5328 | } |
5090 | retval = -EPERM; | 5329 | retval = -EPERM; |
5091 | if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) | 5330 | if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) |
5092 | goto out_unlock; | 5331 | goto out_unlock; |
5093 | 5332 | ||
5094 | retval = security_task_setscheduler(p); | 5333 | retval = security_task_setscheduler(p); |
@@ -5159,7 +5398,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
5159 | { | 5398 | { |
5160 | struct task_struct *p; | 5399 | struct task_struct *p; |
5161 | unsigned long flags; | 5400 | unsigned long flags; |
5162 | struct rq *rq; | ||
5163 | int retval; | 5401 | int retval; |
5164 | 5402 | ||
5165 | get_online_cpus(); | 5403 | get_online_cpus(); |
@@ -5174,9 +5412,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
5174 | if (retval) | 5412 | if (retval) |
5175 | goto out_unlock; | 5413 | goto out_unlock; |
5176 | 5414 | ||
5177 | rq = task_rq_lock(p, &flags); | 5415 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
5178 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); | 5416 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); |
5179 | task_rq_unlock(rq, &flags); | 5417 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
5180 | 5418 | ||
5181 | out_unlock: | 5419 | out_unlock: |
5182 | rcu_read_unlock(); | 5420 | rcu_read_unlock(); |
@@ -5323,6 +5561,67 @@ void __sched yield(void) | |||
5323 | } | 5561 | } |
5324 | EXPORT_SYMBOL(yield); | 5562 | EXPORT_SYMBOL(yield); |
5325 | 5563 | ||
5564 | /** | ||
5565 | * yield_to - yield the current processor to another thread in | ||
5566 | * your thread group, or accelerate that thread toward the | ||
5567 | * processor it's on. | ||
5568 | * @p: target task | ||
5569 | * @preempt: whether task preemption is allowed or not | ||
5570 | * | ||
5571 | * It's the caller's job to ensure that the target task struct | ||
5572 | * can't go away on us before we can do any checks. | ||
5573 | * | ||
5574 | * Returns true if we indeed boosted the target task. | ||
5575 | */ | ||
5576 | bool __sched yield_to(struct task_struct *p, bool preempt) | ||
5577 | { | ||
5578 | struct task_struct *curr = current; | ||
5579 | struct rq *rq, *p_rq; | ||
5580 | unsigned long flags; | ||
5581 | bool yielded = 0; | ||
5582 | |||
5583 | local_irq_save(flags); | ||
5584 | rq = this_rq(); | ||
5585 | |||
5586 | again: | ||
5587 | p_rq = task_rq(p); | ||
5588 | double_rq_lock(rq, p_rq); | ||
5589 | while (task_rq(p) != p_rq) { | ||
5590 | double_rq_unlock(rq, p_rq); | ||
5591 | goto again; | ||
5592 | } | ||
5593 | |||
5594 | if (!curr->sched_class->yield_to_task) | ||
5595 | goto out; | ||
5596 | |||
5597 | if (curr->sched_class != p->sched_class) | ||
5598 | goto out; | ||
5599 | |||
5600 | if (task_running(p_rq, p) || p->state) | ||
5601 | goto out; | ||
5602 | |||
5603 | yielded = curr->sched_class->yield_to_task(rq, p, preempt); | ||
5604 | if (yielded) { | ||
5605 | schedstat_inc(rq, yld_count); | ||
5606 | /* | ||
5607 | * Make p's CPU reschedule; pick_next_entity takes care of | ||
5608 | * fairness. | ||
5609 | */ | ||
5610 | if (preempt && rq != p_rq) | ||
5611 | resched_task(p_rq->curr); | ||
5612 | } | ||
5613 | |||
5614 | out: | ||
5615 | double_rq_unlock(rq, p_rq); | ||
5616 | local_irq_restore(flags); | ||
5617 | |||
5618 | if (yielded) | ||
5619 | schedule(); | ||
5620 | |||
5621 | return yielded; | ||
5622 | } | ||
5623 | EXPORT_SYMBOL_GPL(yield_to); | ||
5624 | |||
5326 | /* | 5625 | /* |
5327 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so | 5626 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so |
5328 | * that process accounting knows that this is a task in IO wait state. | 5627 | * that process accounting knows that this is a task in IO wait state. |
@@ -5333,6 +5632,7 @@ void __sched io_schedule(void) | |||
5333 | 5632 | ||
5334 | delayacct_blkio_start(); | 5633 | delayacct_blkio_start(); |
5335 | atomic_inc(&rq->nr_iowait); | 5634 | atomic_inc(&rq->nr_iowait); |
5635 | blk_flush_plug(current); | ||
5336 | current->in_iowait = 1; | 5636 | current->in_iowait = 1; |
5337 | schedule(); | 5637 | schedule(); |
5338 | current->in_iowait = 0; | 5638 | current->in_iowait = 0; |
@@ -5348,6 +5648,7 @@ long __sched io_schedule_timeout(long timeout) | |||
5348 | 5648 | ||
5349 | delayacct_blkio_start(); | 5649 | delayacct_blkio_start(); |
5350 | atomic_inc(&rq->nr_iowait); | 5650 | atomic_inc(&rq->nr_iowait); |
5651 | blk_flush_plug(current); | ||
5351 | current->in_iowait = 1; | 5652 | current->in_iowait = 1; |
5352 | ret = schedule_timeout(timeout); | 5653 | ret = schedule_timeout(timeout); |
5353 | current->in_iowait = 0; | 5654 | current->in_iowait = 0; |
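Both io_schedule() paths now call blk_flush_plug(current) before sleeping, and schedule() itself flushes via blk_schedule_flush_plug() when the outgoing task still has plugged I/O queued. The sketch below shows the submitter-side plugging pattern these hooks protect against; the submit step is schematic, only the plug calls matter here.

/*
 * Schematic submitter: requests issued between blk_start_plug() and
 * blk_finish_plug() sit on the task's plug list.  If the task blocks
 * before unplugging, the schedule()/io_schedule() hooks added in this
 * patch flush the list so nobody waits on I/O that was never issued.
 */
void read_many_blocks(void)
{
	struct blk_plug plug;

	blk_start_plug(&plug);

	/* ... build bios and submit them; they stay plugged on the task ... */

	blk_finish_plug(&plug);	/* explicit unplug; sleeping would also flush */
}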
@@ -5438,7 +5739,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, | |||
5438 | 5739 | ||
5439 | rq = task_rq_lock(p, &flags); | 5740 | rq = task_rq_lock(p, &flags); |
5440 | time_slice = p->sched_class->get_rr_interval(rq, p); | 5741 | time_slice = p->sched_class->get_rr_interval(rq, p); |
5441 | task_rq_unlock(rq, &flags); | 5742 | task_rq_unlock(rq, p, &flags); |
5442 | 5743 | ||
5443 | rcu_read_unlock(); | 5744 | rcu_read_unlock(); |
5444 | jiffies_to_timespec(time_slice, &t); | 5745 | jiffies_to_timespec(time_slice, &t); |
@@ -5496,7 +5797,7 @@ void show_state_filter(unsigned long state_filter) | |||
5496 | do_each_thread(g, p) { | 5797 | do_each_thread(g, p) { |
5497 | /* | 5798 | /* |
5498 | * reset the NMI-timeout, listing all files on a slow | 5799 | * reset the NMI-timeout, listing all files on a slow |
5499 | * console might take alot of time: | 5800 | * console might take a lot of time: |
5500 | */ | 5801 | */ |
5501 | touch_nmi_watchdog(); | 5802 | touch_nmi_watchdog(); |
5502 | if (!state_filter || (p->state & state_filter)) | 5803 | if (!state_filter || (p->state & state_filter)) |
@@ -5556,22 +5857,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
5556 | rcu_read_unlock(); | 5857 | rcu_read_unlock(); |
5557 | 5858 | ||
5558 | rq->curr = rq->idle = idle; | 5859 | rq->curr = rq->idle = idle; |
5559 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 5860 | #if defined(CONFIG_SMP) |
5560 | idle->oncpu = 1; | 5861 | idle->on_cpu = 1; |
5561 | #endif | 5862 | #endif |
5562 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 5863 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
5563 | 5864 | ||
5564 | /* Set the preempt count _outside_ the spinlocks! */ | 5865 | /* Set the preempt count _outside_ the spinlocks! */ |
5565 | #if defined(CONFIG_PREEMPT) | ||
5566 | task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); | ||
5567 | #else | ||
5568 | task_thread_info(idle)->preempt_count = 0; | 5866 | task_thread_info(idle)->preempt_count = 0; |
5569 | #endif | 5867 | |
5570 | /* | 5868 | /* |
5571 | * The idle tasks have their own, simple scheduling class: | 5869 | * The idle tasks have their own, simple scheduling class: |
5572 | */ | 5870 | */ |
5573 | idle->sched_class = &idle_sched_class; | 5871 | idle->sched_class = &idle_sched_class; |
5574 | ftrace_graph_init_task(idle); | 5872 | ftrace_graph_init_idle_task(idle, cpu); |
5575 | } | 5873 | } |
5576 | 5874 | ||
5577 | /* | 5875 | /* |
@@ -5661,26 +5959,17 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
5661 | unsigned int dest_cpu; | 5959 | unsigned int dest_cpu; |
5662 | int ret = 0; | 5960 | int ret = 0; |
5663 | 5961 | ||
5664 | /* | ||
5665 | * Serialize against TASK_WAKING so that ttwu() and wunt() can | ||
5666 | * drop the rq->lock and still rely on ->cpus_allowed. | ||
5667 | */ | ||
5668 | again: | ||
5669 | while (task_is_waking(p)) | ||
5670 | cpu_relax(); | ||
5671 | rq = task_rq_lock(p, &flags); | 5962 | rq = task_rq_lock(p, &flags); |
5672 | if (task_is_waking(p)) { | 5963 | |
5673 | task_rq_unlock(rq, &flags); | 5964 | if (cpumask_equal(&p->cpus_allowed, new_mask)) |
5674 | goto again; | 5965 | goto out; |
5675 | } | ||
5676 | 5966 | ||
5677 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { | 5967 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { |
5678 | ret = -EINVAL; | 5968 | ret = -EINVAL; |
5679 | goto out; | 5969 | goto out; |
5680 | } | 5970 | } |
5681 | 5971 | ||
5682 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && | 5972 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { |
5683 | !cpumask_equal(&p->cpus_allowed, new_mask))) { | ||
5684 | ret = -EINVAL; | 5973 | ret = -EINVAL; |
5685 | goto out; | 5974 | goto out; |
5686 | } | 5975 | } |
@@ -5697,16 +5986,16 @@ again: | |||
5697 | goto out; | 5986 | goto out; |
5698 | 5987 | ||
5699 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); | 5988 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); |
5700 | if (migrate_task(p, rq)) { | 5989 | if (p->on_rq) { |
5701 | struct migration_arg arg = { p, dest_cpu }; | 5990 | struct migration_arg arg = { p, dest_cpu }; |
5702 | /* Need help from migration thread: drop lock and wait. */ | 5991 | /* Need help from migration thread: drop lock and wait. */ |
5703 | task_rq_unlock(rq, &flags); | 5992 | task_rq_unlock(rq, p, &flags); |
5704 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); | 5993 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); |
5705 | tlb_migrate_finish(p->mm); | 5994 | tlb_migrate_finish(p->mm); |
5706 | return 0; | 5995 | return 0; |
5707 | } | 5996 | } |
5708 | out: | 5997 | out: |
5709 | task_rq_unlock(rq, &flags); | 5998 | task_rq_unlock(rq, p, &flags); |
5710 | 5999 | ||
5711 | return ret; | 6000 | return ret; |
5712 | } | 6001 | } |
@@ -5734,6 +6023,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5734 | rq_src = cpu_rq(src_cpu); | 6023 | rq_src = cpu_rq(src_cpu); |
5735 | rq_dest = cpu_rq(dest_cpu); | 6024 | rq_dest = cpu_rq(dest_cpu); |
5736 | 6025 | ||
6026 | raw_spin_lock(&p->pi_lock); | ||
5737 | double_rq_lock(rq_src, rq_dest); | 6027 | double_rq_lock(rq_src, rq_dest); |
5738 | /* Already moved. */ | 6028 | /* Already moved. */ |
5739 | if (task_cpu(p) != src_cpu) | 6029 | if (task_cpu(p) != src_cpu) |
@@ -5746,7 +6036,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5746 | * If we're not on a rq, the next wake-up will ensure we're | 6036 | * If we're not on a rq, the next wake-up will ensure we're |
5747 | * placed properly. | 6037 | * placed properly. |
5748 | */ | 6038 | */ |
5749 | if (p->se.on_rq) { | 6039 | if (p->on_rq) { |
5750 | deactivate_task(rq_src, p, 0); | 6040 | deactivate_task(rq_src, p, 0); |
5751 | set_task_cpu(p, dest_cpu); | 6041 | set_task_cpu(p, dest_cpu); |
5752 | activate_task(rq_dest, p, 0); | 6042 | activate_task(rq_dest, p, 0); |
@@ -5756,6 +6046,7 @@ done: | |||
5756 | ret = 1; | 6046 | ret = 1; |
5757 | fail: | 6047 | fail: |
5758 | double_rq_unlock(rq_src, rq_dest); | 6048 | double_rq_unlock(rq_src, rq_dest); |
6049 | raw_spin_unlock(&p->pi_lock); | ||
5759 | return ret; | 6050 | return ret; |
5760 | } | 6051 | } |
5761 | 6052 | ||
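The extra p->pi_lock taken around double_rq_lock() in __migrate_task() reflects the lock order this series appears to standardize on, and is presumably also why task_rq_lock()/task_rq_unlock() now take the task: they pair pi_lock with the rq lock. A sketch of that assumed ordering (my reading, not stated in the hunk):

/*
 * Assumed lock ordering, outermost first:
 *
 *	p->pi_lock
 *	  rq->lock		(or both rq locks via double_rq_lock())
 */
raw_spin_lock(&p->pi_lock);
double_rq_lock(rq_src, rq_dest);
/* ... move the task between runqueues ... */
double_rq_unlock(rq_src, rq_dest);
raw_spin_unlock(&p->pi_lock);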
@@ -6096,6 +6387,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6096 | 6387 | ||
6097 | #ifdef CONFIG_HOTPLUG_CPU | 6388 | #ifdef CONFIG_HOTPLUG_CPU |
6098 | case CPU_DYING: | 6389 | case CPU_DYING: |
6390 | sched_ttwu_pending(); | ||
6099 | /* Update our root-domain */ | 6391 | /* Update our root-domain */ |
6100 | raw_spin_lock_irqsave(&rq->lock, flags); | 6392 | raw_spin_lock_irqsave(&rq->lock, flags); |
6101 | if (rq->rd) { | 6393 | if (rq->rd) { |
@@ -6111,6 +6403,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6111 | break; | 6403 | break; |
6112 | #endif | 6404 | #endif |
6113 | } | 6405 | } |
6406 | |||
6407 | update_max_interval(); | ||
6408 | |||
6114 | return NOTIFY_OK; | 6409 | return NOTIFY_OK; |
6115 | } | 6410 | } |
6116 | 6411 | ||
@@ -6171,6 +6466,8 @@ early_initcall(migration_init); | |||
6171 | 6466 | ||
6172 | #ifdef CONFIG_SMP | 6467 | #ifdef CONFIG_SMP |
6173 | 6468 | ||
6469 | static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ | ||
6470 | |||
6174 | #ifdef CONFIG_SCHED_DEBUG | 6471 | #ifdef CONFIG_SCHED_DEBUG |
6175 | 6472 | ||
6176 | static __read_mostly int sched_domain_debug_enabled; | 6473 | static __read_mostly int sched_domain_debug_enabled; |
@@ -6245,7 +6542,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
6245 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); | 6542 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); |
6246 | 6543 | ||
6247 | printk(KERN_CONT " %s", str); | 6544 | printk(KERN_CONT " %s", str); |
6248 | if (group->cpu_power != SCHED_LOAD_SCALE) { | 6545 | if (group->cpu_power != SCHED_POWER_SCALE) { |
6249 | printk(KERN_CONT " (cpu_power = %d)", | 6546 | printk(KERN_CONT " (cpu_power = %d)", |
6250 | group->cpu_power); | 6547 | group->cpu_power); |
6251 | } | 6548 | } |
@@ -6266,7 +6563,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
6266 | 6563 | ||
6267 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | 6564 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
6268 | { | 6565 | { |
6269 | cpumask_var_t groupmask; | ||
6270 | int level = 0; | 6566 | int level = 0; |
6271 | 6567 | ||
6272 | if (!sched_domain_debug_enabled) | 6568 | if (!sched_domain_debug_enabled) |
@@ -6279,20 +6575,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
6279 | 6575 | ||
6280 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | 6576 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); |
6281 | 6577 | ||
6282 | if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { | ||
6283 | printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); | ||
6284 | return; | ||
6285 | } | ||
6286 | |||
6287 | for (;;) { | 6578 | for (;;) { |
6288 | if (sched_domain_debug_one(sd, cpu, level, groupmask)) | 6579 | if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) |
6289 | break; | 6580 | break; |
6290 | level++; | 6581 | level++; |
6291 | sd = sd->parent; | 6582 | sd = sd->parent; |
6292 | if (!sd) | 6583 | if (!sd) |
6293 | break; | 6584 | break; |
6294 | } | 6585 | } |
6295 | free_cpumask_var(groupmask); | ||
6296 | } | 6586 | } |
6297 | #else /* !CONFIG_SCHED_DEBUG */ | 6587 | #else /* !CONFIG_SCHED_DEBUG */ |
6298 | # define sched_domain_debug(sd, cpu) do { } while (0) | 6588 | # define sched_domain_debug(sd, cpu) do { } while (0) |
@@ -6349,12 +6639,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
6349 | return 1; | 6639 | return 1; |
6350 | } | 6640 | } |
6351 | 6641 | ||
6352 | static void free_rootdomain(struct root_domain *rd) | 6642 | static void free_rootdomain(struct rcu_head *rcu) |
6353 | { | 6643 | { |
6354 | synchronize_sched(); | 6644 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); |
6355 | 6645 | ||
6356 | cpupri_cleanup(&rd->cpupri); | 6646 | cpupri_cleanup(&rd->cpupri); |
6357 | |||
6358 | free_cpumask_var(rd->rto_mask); | 6647 | free_cpumask_var(rd->rto_mask); |
6359 | free_cpumask_var(rd->online); | 6648 | free_cpumask_var(rd->online); |
6360 | free_cpumask_var(rd->span); | 6649 | free_cpumask_var(rd->span); |
@@ -6395,7 +6684,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
6395 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 6684 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
6396 | 6685 | ||
6397 | if (old_rd) | 6686 | if (old_rd) |
6398 | free_rootdomain(old_rd); | 6687 | call_rcu_sched(&old_rd->rcu, free_rootdomain); |
6399 | } | 6688 | } |
6400 | 6689 | ||
6401 | static int init_rootdomain(struct root_domain *rd) | 6690 | static int init_rootdomain(struct root_domain *rd) |
@@ -6446,6 +6735,25 @@ static struct root_domain *alloc_rootdomain(void) | |||
6446 | return rd; | 6735 | return rd; |
6447 | } | 6736 | } |
6448 | 6737 | ||
6738 | static void free_sched_domain(struct rcu_head *rcu) | ||
6739 | { | ||
6740 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); | ||
6741 | if (atomic_dec_and_test(&sd->groups->ref)) | ||
6742 | kfree(sd->groups); | ||
6743 | kfree(sd); | ||
6744 | } | ||
6745 | |||
6746 | static void destroy_sched_domain(struct sched_domain *sd, int cpu) | ||
6747 | { | ||
6748 | call_rcu(&sd->rcu, free_sched_domain); | ||
6749 | } | ||
6750 | |||
6751 | static void destroy_sched_domains(struct sched_domain *sd, int cpu) | ||
6752 | { | ||
6753 | for (; sd; sd = sd->parent) | ||
6754 | destroy_sched_domain(sd, cpu); | ||
6755 | } | ||
6756 | |||
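Both root_domain (via call_rcu_sched()) and sched_domain (via call_rcu()) now free through an embedded rcu_head instead of blocking in synchronize_sched(). For reference, the generic deferred-free idiom being used -- a standard kernel pattern, not copied from the patch:

/* Embed an rcu_head in the object and defer the actual kfree(). */
struct widget {
	/* ... payload ... */
	struct rcu_head rcu;
};

static void widget_free_rcu(struct rcu_head *rcu)
{
	struct widget *w = container_of(rcu, struct widget, rcu);

	kfree(w);
}

static void widget_destroy(struct widget *w)
{
	/*
	 * Readers that found 'w' under rcu_read_lock() may still be using
	 * it; free it after a grace period instead of waiting for one.
	 */
	call_rcu(&w->rcu, widget_free_rcu);
}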
6449 | /* | 6757 | /* |
6450 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 6758 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
6451 | * hold the hotplug lock. | 6759 | * hold the hotplug lock. |
@@ -6456,9 +6764,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6456 | struct rq *rq = cpu_rq(cpu); | 6764 | struct rq *rq = cpu_rq(cpu); |
6457 | struct sched_domain *tmp; | 6765 | struct sched_domain *tmp; |
6458 | 6766 | ||
6459 | for (tmp = sd; tmp; tmp = tmp->parent) | ||
6460 | tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); | ||
6461 | |||
6462 | /* Remove the sched domains which do not contribute to scheduling. */ | 6767 | /* Remove the sched domains which do not contribute to scheduling. */ |
6463 | for (tmp = sd; tmp; ) { | 6768 | for (tmp = sd; tmp; ) { |
6464 | struct sched_domain *parent = tmp->parent; | 6769 | struct sched_domain *parent = tmp->parent; |
@@ -6469,12 +6774,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6469 | tmp->parent = parent->parent; | 6774 | tmp->parent = parent->parent; |
6470 | if (parent->parent) | 6775 | if (parent->parent) |
6471 | parent->parent->child = tmp; | 6776 | parent->parent->child = tmp; |
6777 | destroy_sched_domain(parent, cpu); | ||
6472 | } else | 6778 | } else |
6473 | tmp = tmp->parent; | 6779 | tmp = tmp->parent; |
6474 | } | 6780 | } |
6475 | 6781 | ||
6476 | if (sd && sd_degenerate(sd)) { | 6782 | if (sd && sd_degenerate(sd)) { |
6783 | tmp = sd; | ||
6477 | sd = sd->parent; | 6784 | sd = sd->parent; |
6785 | destroy_sched_domain(tmp, cpu); | ||
6478 | if (sd) | 6786 | if (sd) |
6479 | sd->child = NULL; | 6787 | sd->child = NULL; |
6480 | } | 6788 | } |
@@ -6482,7 +6790,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6482 | sched_domain_debug(sd, cpu); | 6790 | sched_domain_debug(sd, cpu); |
6483 | 6791 | ||
6484 | rq_attach_root(rq, rd); | 6792 | rq_attach_root(rq, rd); |
6793 | tmp = rq->sd; | ||
6485 | rcu_assign_pointer(rq->sd, sd); | 6794 | rcu_assign_pointer(rq->sd, sd); |
6795 | destroy_sched_domains(tmp, cpu); | ||
6486 | } | 6796 | } |
6487 | 6797 | ||
6488 | /* cpus with isolated domains */ | 6798 | /* cpus with isolated domains */ |
@@ -6498,56 +6808,6 @@ static int __init isolated_cpu_setup(char *str) | |||
6498 | 6808 | ||
6499 | __setup("isolcpus=", isolated_cpu_setup); | 6809 | __setup("isolcpus=", isolated_cpu_setup); |
6500 | 6810 | ||
6501 | /* | ||
6502 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer | ||
6503 | * to a function which identifies what group(along with sched group) a CPU | ||
6504 | * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids | ||
6505 | * (due to the fact that we keep track of groups covered with a struct cpumask). | ||
6506 | * | ||
6507 | * init_sched_build_groups will build a circular linked list of the groups | ||
6508 | * covered by the given span, and will set each group's ->cpumask correctly, | ||
6509 | * and ->cpu_power to 0. | ||
6510 | */ | ||
6511 | static void | ||
6512 | init_sched_build_groups(const struct cpumask *span, | ||
6513 | const struct cpumask *cpu_map, | ||
6514 | int (*group_fn)(int cpu, const struct cpumask *cpu_map, | ||
6515 | struct sched_group **sg, | ||
6516 | struct cpumask *tmpmask), | ||
6517 | struct cpumask *covered, struct cpumask *tmpmask) | ||
6518 | { | ||
6519 | struct sched_group *first = NULL, *last = NULL; | ||
6520 | int i; | ||
6521 | |||
6522 | cpumask_clear(covered); | ||
6523 | |||
6524 | for_each_cpu(i, span) { | ||
6525 | struct sched_group *sg; | ||
6526 | int group = group_fn(i, cpu_map, &sg, tmpmask); | ||
6527 | int j; | ||
6528 | |||
6529 | if (cpumask_test_cpu(i, covered)) | ||
6530 | continue; | ||
6531 | |||
6532 | cpumask_clear(sched_group_cpus(sg)); | ||
6533 | sg->cpu_power = 0; | ||
6534 | |||
6535 | for_each_cpu(j, span) { | ||
6536 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) | ||
6537 | continue; | ||
6538 | |||
6539 | cpumask_set_cpu(j, covered); | ||
6540 | cpumask_set_cpu(j, sched_group_cpus(sg)); | ||
6541 | } | ||
6542 | if (!first) | ||
6543 | first = sg; | ||
6544 | if (last) | ||
6545 | last->next = sg; | ||
6546 | last = sg; | ||
6547 | } | ||
6548 | last->next = first; | ||
6549 | } | ||
6550 | |||
6551 | #define SD_NODES_PER_DOMAIN 16 | 6811 | #define SD_NODES_PER_DOMAIN 16 |
6552 | 6812 | ||
6553 | #ifdef CONFIG_NUMA | 6813 | #ifdef CONFIG_NUMA |
@@ -6564,7 +6824,7 @@ init_sched_build_groups(const struct cpumask *span, | |||
6564 | */ | 6824 | */ |
6565 | static int find_next_best_node(int node, nodemask_t *used_nodes) | 6825 | static int find_next_best_node(int node, nodemask_t *used_nodes) |
6566 | { | 6826 | { |
6567 | int i, n, val, min_val, best_node = 0; | 6827 | int i, n, val, min_val, best_node = -1; |
6568 | 6828 | ||
6569 | min_val = INT_MAX; | 6829 | min_val = INT_MAX; |
6570 | 6830 | ||
@@ -6588,7 +6848,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) | |||
6588 | } | 6848 | } |
6589 | } | 6849 | } |
6590 | 6850 | ||
6591 | node_set(best_node, *used_nodes); | 6851 | if (best_node != -1) |
6852 | node_set(best_node, *used_nodes); | ||
6592 | return best_node; | 6853 | return best_node; |
6593 | } | 6854 | } |
6594 | 6855 | ||
@@ -6614,315 +6875,130 @@ static void sched_domain_node_span(int node, struct cpumask *span) | |||
6614 | 6875 | ||
6615 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | 6876 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { |
6616 | int next_node = find_next_best_node(node, &used_nodes); | 6877 | int next_node = find_next_best_node(node, &used_nodes); |
6617 | 6878 | if (next_node < 0) | |
6879 | break; | ||
6618 | cpumask_or(span, span, cpumask_of_node(next_node)); | 6880 | cpumask_or(span, span, cpumask_of_node(next_node)); |
6619 | } | 6881 | } |
6620 | } | 6882 | } |
6883 | |||
6884 | static const struct cpumask *cpu_node_mask(int cpu) | ||
6885 | { | ||
6886 | lockdep_assert_held(&sched_domains_mutex); | ||
6887 | |||
6888 | sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); | ||
6889 | |||
6890 | return sched_domains_tmpmask; | ||
6891 | } | ||
6892 | |||
6893 | static const struct cpumask *cpu_allnodes_mask(int cpu) | ||
6894 | { | ||
6895 | return cpu_possible_mask; | ||
6896 | } | ||
6621 | #endif /* CONFIG_NUMA */ | 6897 | #endif /* CONFIG_NUMA */ |
6622 | 6898 | ||
6623 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | 6899 | static const struct cpumask *cpu_cpu_mask(int cpu) |
6900 | { | ||
6901 | return cpumask_of_node(cpu_to_node(cpu)); | ||
6902 | } | ||
6624 | 6903 | ||
6625 | /* | 6904 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; |
6626 | * The cpus mask in sched_group and sched_domain hangs off the end. | ||
6627 | * | ||
6628 | * ( See the the comments in include/linux/sched.h:struct sched_group | ||
6629 | * and struct sched_domain. ) | ||
6630 | */ | ||
6631 | struct static_sched_group { | ||
6632 | struct sched_group sg; | ||
6633 | DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); | ||
6634 | }; | ||
6635 | 6905 | ||
6636 | struct static_sched_domain { | 6906 | struct sd_data { |
6637 | struct sched_domain sd; | 6907 | struct sched_domain **__percpu sd; |
6638 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); | 6908 | struct sched_group **__percpu sg; |
6639 | }; | 6909 | }; |
6640 | 6910 | ||
6641 | struct s_data { | 6911 | struct s_data { |
6642 | #ifdef CONFIG_NUMA | 6912 | struct sched_domain ** __percpu sd; |
6643 | int sd_allnodes; | ||
6644 | cpumask_var_t domainspan; | ||
6645 | cpumask_var_t covered; | ||
6646 | cpumask_var_t notcovered; | ||
6647 | #endif | ||
6648 | cpumask_var_t nodemask; | ||
6649 | cpumask_var_t this_sibling_map; | ||
6650 | cpumask_var_t this_core_map; | ||
6651 | cpumask_var_t this_book_map; | ||
6652 | cpumask_var_t send_covered; | ||
6653 | cpumask_var_t tmpmask; | ||
6654 | struct sched_group **sched_group_nodes; | ||
6655 | struct root_domain *rd; | 6913 | struct root_domain *rd; |
6656 | }; | 6914 | }; |
6657 | 6915 | ||
6658 | enum s_alloc { | 6916 | enum s_alloc { |
6659 | sa_sched_groups = 0, | ||
6660 | sa_rootdomain, | 6917 | sa_rootdomain, |
6661 | sa_tmpmask, | 6918 | sa_sd, |
6662 | sa_send_covered, | 6919 | sa_sd_storage, |
6663 | sa_this_book_map, | ||
6664 | sa_this_core_map, | ||
6665 | sa_this_sibling_map, | ||
6666 | sa_nodemask, | ||
6667 | sa_sched_group_nodes, | ||
6668 | #ifdef CONFIG_NUMA | ||
6669 | sa_notcovered, | ||
6670 | sa_covered, | ||
6671 | sa_domainspan, | ||
6672 | #endif | ||
6673 | sa_none, | 6920 | sa_none, |
6674 | }; | 6921 | }; |
6675 | 6922 | ||
6676 | /* | 6923 | struct sched_domain_topology_level; |
6677 | * SMT sched-domains: | ||
6678 | */ | ||
6679 | #ifdef CONFIG_SCHED_SMT | ||
6680 | static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); | ||
6681 | static DEFINE_PER_CPU(struct static_sched_group, sched_groups); | ||
6682 | 6924 | ||
6683 | static int | 6925 | typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); |
6684 | cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, | 6926 | typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); |
6685 | struct sched_group **sg, struct cpumask *unused) | ||
6686 | { | ||
6687 | if (sg) | ||
6688 | *sg = &per_cpu(sched_groups, cpu).sg; | ||
6689 | return cpu; | ||
6690 | } | ||
6691 | #endif /* CONFIG_SCHED_SMT */ | ||
6692 | 6927 | ||
6693 | /* | 6928 | struct sched_domain_topology_level { |
6694 | * multi-core sched-domains: | 6929 | sched_domain_init_f init; |
6695 | */ | 6930 | sched_domain_mask_f mask; |
6696 | #ifdef CONFIG_SCHED_MC | 6931 | struct sd_data data; |
6697 | static DEFINE_PER_CPU(struct static_sched_domain, core_domains); | 6932 | }; |
6698 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); | ||
6699 | |||
6700 | static int | ||
6701 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, | ||
6702 | struct sched_group **sg, struct cpumask *mask) | ||
6703 | { | ||
6704 | int group; | ||
6705 | #ifdef CONFIG_SCHED_SMT | ||
6706 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | ||
6707 | group = cpumask_first(mask); | ||
6708 | #else | ||
6709 | group = cpu; | ||
6710 | #endif | ||
6711 | if (sg) | ||
6712 | *sg = &per_cpu(sched_group_core, group).sg; | ||
6713 | return group; | ||
6714 | } | ||
6715 | #endif /* CONFIG_SCHED_MC */ | ||
6716 | 6933 | ||
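struct sched_domain_topology_level pairs a domain constructor with a mask function, replacing the per-level static_sched_domain/static_sched_group arrays removed above. The concrete table is presumably defined later in the patch; conceptually it would look like the sketch below, where the sd_init_*() names are assumptions and only cpu_coregroup_mask(), cpu_cpu_mask(), cpu_node_mask() and cpu_allnodes_mask() appear in this hunk.

/* Sketch of a topology table; constructor names are assumed. */
static struct sched_domain_topology_level sketch_topology[] = {
#ifdef CONFIG_SCHED_MC
	{ sd_init_MC,		cpu_coregroup_mask, },
#endif
	{ sd_init_CPU,		cpu_cpu_mask, },
#ifdef CONFIG_NUMA
	{ sd_init_NODE,		cpu_node_mask, },
	{ sd_init_ALLNODES,	cpu_allnodes_mask, },
#endif
	{ NULL, },
};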
6717 | /* | 6934 | /* |
6718 | * book sched-domains: | 6935 | * Assumes the sched_domain tree is fully constructed |
6719 | */ | 6936 | */ |
6720 | #ifdef CONFIG_SCHED_BOOK | 6937 | static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) |
6721 | static DEFINE_PER_CPU(struct static_sched_domain, book_domains); | ||
6722 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_book); | ||
6723 | |||
6724 | static int | ||
6725 | cpu_to_book_group(int cpu, const struct cpumask *cpu_map, | ||
6726 | struct sched_group **sg, struct cpumask *mask) | ||
6727 | { | 6938 | { |
6728 | int group = cpu; | 6939 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); |
6729 | #ifdef CONFIG_SCHED_MC | 6940 | struct sched_domain *child = sd->child; |
6730 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); | ||
6731 | group = cpumask_first(mask); | ||
6732 | #elif defined(CONFIG_SCHED_SMT) | ||
6733 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | ||
6734 | group = cpumask_first(mask); | ||
6735 | #endif | ||
6736 | if (sg) | ||
6737 | *sg = &per_cpu(sched_group_book, group).sg; | ||
6738 | return group; | ||
6739 | } | ||
6740 | #endif /* CONFIG_SCHED_BOOK */ | ||
6741 | 6941 | ||
6742 | static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); | 6942 | if (child) |
6743 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); | 6943 | cpu = cpumask_first(sched_domain_span(child)); |
6744 | 6944 | ||
6745 | static int | ||
6746 | cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, | ||
6747 | struct sched_group **sg, struct cpumask *mask) | ||
6748 | { | ||
6749 | int group; | ||
6750 | #ifdef CONFIG_SCHED_BOOK | ||
6751 | cpumask_and(mask, cpu_book_mask(cpu), cpu_map); | ||
6752 | group = cpumask_first(mask); | ||
6753 | #elif defined(CONFIG_SCHED_MC) | ||
6754 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); | ||
6755 | group = cpumask_first(mask); | ||
6756 | #elif defined(CONFIG_SCHED_SMT) | ||
6757 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | ||
6758 | group = cpumask_first(mask); | ||
6759 | #else | ||
6760 | group = cpu; | ||
6761 | #endif | ||
6762 | if (sg) | 6945 | if (sg) |
6763 | *sg = &per_cpu(sched_group_phys, group).sg; | 6946 | *sg = *per_cpu_ptr(sdd->sg, cpu); |
6764 | return group; | 6947 | |
6948 | return cpu; | ||
6765 | } | 6949 | } |
6766 | 6950 | ||
6767 | #ifdef CONFIG_NUMA | ||
6768 | /* | 6951 | /* |
6769 | * The init_sched_build_groups can't handle what we want to do with node | 6952 | * build_sched_groups takes the cpumask we wish to span, and a pointer |
6770 | * groups, so roll our own. Now each node has its own list of groups which | 6953 | * to a function which identifies what group(along with sched group) a CPU |
6771 | * gets dynamically allocated. | 6954 | * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids |
6955 | * (due to the fact that we keep track of groups covered with a struct cpumask). | ||
6956 | * | ||
6957 | * build_sched_groups will build a circular linked list of the groups | ||
6958 | * covered by the given span, and will set each group's ->cpumask correctly, | ||
6959 | * and ->cpu_power to 0. | ||
6772 | */ | 6960 | */ |
6773 | static DEFINE_PER_CPU(struct static_sched_domain, node_domains); | 6961 | static void |
6774 | static struct sched_group ***sched_group_nodes_bycpu; | 6962 | build_sched_groups(struct sched_domain *sd) |
6775 | |||
6776 | static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); | ||
6777 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); | ||
6778 | |||
6779 | static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, | ||
6780 | struct sched_group **sg, | ||
6781 | struct cpumask *nodemask) | ||
6782 | { | ||
6783 | int group; | ||
6784 | |||
6785 | cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); | ||
6786 | group = cpumask_first(nodemask); | ||
6787 | |||
6788 | if (sg) | ||
6789 | *sg = &per_cpu(sched_group_allnodes, group).sg; | ||
6790 | return group; | ||
6791 | } | ||
6792 | |||
6793 | static void init_numa_sched_groups_power(struct sched_group *group_head) | ||
6794 | { | ||
6795 | struct sched_group *sg = group_head; | ||
6796 | int j; | ||
6797 | |||
6798 | if (!sg) | ||
6799 | return; | ||
6800 | do { | ||
6801 | for_each_cpu(j, sched_group_cpus(sg)) { | ||
6802 | struct sched_domain *sd; | ||
6803 | |||
6804 | sd = &per_cpu(phys_domains, j).sd; | ||
6805 | if (j != group_first_cpu(sd->groups)) { | ||
6806 | /* | ||
6807 | * Only add "power" once for each | ||
6808 | * physical package. | ||
6809 | */ | ||
6810 | continue; | ||
6811 | } | ||
6812 | |||
6813 | sg->cpu_power += sd->groups->cpu_power; | ||
6814 | } | ||
6815 | sg = sg->next; | ||
6816 | } while (sg != group_head); | ||
6817 | } | ||
6818 | |||
6819 | static int build_numa_sched_groups(struct s_data *d, | ||
6820 | const struct cpumask *cpu_map, int num) | ||
6821 | { | 6963 | { |
6822 | struct sched_domain *sd; | 6964 | struct sched_group *first = NULL, *last = NULL; |
6823 | struct sched_group *sg, *prev; | 6965 | struct sd_data *sdd = sd->private; |
6824 | int n, j; | 6966 | const struct cpumask *span = sched_domain_span(sd); |
6825 | 6967 | struct cpumask *covered; | |
6826 | cpumask_clear(d->covered); | 6968 | int i; |
6827 | cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); | ||
6828 | if (cpumask_empty(d->nodemask)) { | ||
6829 | d->sched_group_nodes[num] = NULL; | ||
6830 | goto out; | ||
6831 | } | ||
6832 | |||
6833 | sched_domain_node_span(num, d->domainspan); | ||
6834 | cpumask_and(d->domainspan, d->domainspan, cpu_map); | ||
6835 | |||
6836 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
6837 | GFP_KERNEL, num); | ||
6838 | if (!sg) { | ||
6839 | printk(KERN_WARNING "Can not alloc domain group for node %d\n", | ||
6840 | num); | ||
6841 | return -ENOMEM; | ||
6842 | } | ||
6843 | d->sched_group_nodes[num] = sg; | ||
6844 | |||
6845 | for_each_cpu(j, d->nodemask) { | ||
6846 | sd = &per_cpu(node_domains, j).sd; | ||
6847 | sd->groups = sg; | ||
6848 | } | ||
6849 | |||
6850 | sg->cpu_power = 0; | ||
6851 | cpumask_copy(sched_group_cpus(sg), d->nodemask); | ||
6852 | sg->next = sg; | ||
6853 | cpumask_or(d->covered, d->covered, d->nodemask); | ||
6854 | 6969 | ||
6855 | prev = sg; | 6970 | lockdep_assert_held(&sched_domains_mutex); |
6856 | for (j = 0; j < nr_node_ids; j++) { | 6971 | covered = sched_domains_tmpmask; |
6857 | n = (num + j) % nr_node_ids; | ||
6858 | cpumask_complement(d->notcovered, d->covered); | ||
6859 | cpumask_and(d->tmpmask, d->notcovered, cpu_map); | ||
6860 | cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); | ||
6861 | if (cpumask_empty(d->tmpmask)) | ||
6862 | break; | ||
6863 | cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); | ||
6864 | if (cpumask_empty(d->tmpmask)) | ||
6865 | continue; | ||
6866 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
6867 | GFP_KERNEL, num); | ||
6868 | if (!sg) { | ||
6869 | printk(KERN_WARNING | ||
6870 | "Can not alloc domain group for node %d\n", j); | ||
6871 | return -ENOMEM; | ||
6872 | } | ||
6873 | sg->cpu_power = 0; | ||
6874 | cpumask_copy(sched_group_cpus(sg), d->tmpmask); | ||
6875 | sg->next = prev->next; | ||
6876 | cpumask_or(d->covered, d->covered, d->tmpmask); | ||
6877 | prev->next = sg; | ||
6878 | prev = sg; | ||
6879 | } | ||
6880 | out: | ||
6881 | return 0; | ||
6882 | } | ||
6883 | #endif /* CONFIG_NUMA */ | ||
6884 | 6972 | ||
6885 | #ifdef CONFIG_NUMA | 6973 | cpumask_clear(covered); |
6886 | /* Free memory allocated for various sched_group structures */ | ||
6887 | static void free_sched_groups(const struct cpumask *cpu_map, | ||
6888 | struct cpumask *nodemask) | ||
6889 | { | ||
6890 | int cpu, i; | ||
6891 | 6974 | ||
6892 | for_each_cpu(cpu, cpu_map) { | 6975 | for_each_cpu(i, span) { |
6893 | struct sched_group **sched_group_nodes | 6976 | struct sched_group *sg; |
6894 | = sched_group_nodes_bycpu[cpu]; | 6977 | int group = get_group(i, sdd, &sg); |
6978 | int j; | ||
6895 | 6979 | ||
6896 | if (!sched_group_nodes) | 6980 | if (cpumask_test_cpu(i, covered)) |
6897 | continue; | 6981 | continue; |
6898 | 6982 | ||
6899 | for (i = 0; i < nr_node_ids; i++) { | 6983 | cpumask_clear(sched_group_cpus(sg)); |
6900 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | 6984 | sg->cpu_power = 0; |
6901 | 6985 | ||
6902 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 6986 | for_each_cpu(j, span) { |
6903 | if (cpumask_empty(nodemask)) | 6987 | if (get_group(j, sdd, NULL) != group) |
6904 | continue; | 6988 | continue; |
6905 | 6989 | ||
6906 | if (sg == NULL) | 6990 | cpumask_set_cpu(j, covered); |
6907 | continue; | 6991 | cpumask_set_cpu(j, sched_group_cpus(sg)); |
6908 | sg = sg->next; | ||
6909 | next_sg: | ||
6910 | oldsg = sg; | ||
6911 | sg = sg->next; | ||
6912 | kfree(oldsg); | ||
6913 | if (oldsg != sched_group_nodes[i]) | ||
6914 | goto next_sg; | ||
6915 | } | 6992 | } |
6916 | kfree(sched_group_nodes); | 6993 | |
6917 | sched_group_nodes_bycpu[cpu] = NULL; | 6994 | if (!first) |
6995 | first = sg; | ||
6996 | if (last) | ||
6997 | last->next = sg; | ||
6998 | last = sg; | ||
6918 | } | 6999 | } |
7000 | last->next = first; | ||
6919 | } | 7001 | } |
6920 | #else /* !CONFIG_NUMA */ | ||
6921 | static void free_sched_groups(const struct cpumask *cpu_map, | ||
6922 | struct cpumask *nodemask) | ||
6923 | { | ||
6924 | } | ||
6925 | #endif /* CONFIG_NUMA */ | ||
6926 | 7002 | ||
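The new build_sched_groups() above is the core of the rewrite: walk the domain span, start a group at the first CPU not yet covered, pull every CPU that get_group() maps to the same group into that group's mask, and chain the groups into a ring closed by last->next = first. A minimal userspace sketch of that cover-and-chain loop follows; the get_group() stand-in (a made-up pairing rule) and the bitmask cpumasks are assumptions for illustration, not the kernel helpers.

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 8

struct group {
	unsigned long mask;	/* CPUs covered by this group */
	struct group *next;	/* circular list, like sched_group->next */
};

/* stand-in for get_group(): pretend CPUs 2n and 2n+1 share a group */
static int get_group(int cpu)
{
	return cpu / 2;
}

static struct group *build_groups(unsigned long span)
{
	struct group *first = NULL, *last = NULL;
	unsigned long covered = 0;
	int i, j;

	for (i = 0; i < NR_CPUS; i++) {
		struct group *sg;

		if (!(span & (1UL << i)) || (covered & (1UL << i)))
			continue;

		sg = calloc(1, sizeof(*sg));
		for (j = 0; j < NR_CPUS; j++) {
			if (!(span & (1UL << j)) ||
			    get_group(j) != get_group(i))
				continue;
			covered |= 1UL << j;
			sg->mask |= 1UL << j;
		}
		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
	}
	last->next = first;	/* close the ring */
	return first;
}

int main(void)
{
	struct group *first = build_groups(0xff), *sg = first;

	do {
		printf("group covers %#lx\n", sg->mask);
		sg = sg->next;
	} while (sg != first);
	return 0;
}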
6927 | /* | 7003 | /* |
6928 | * Initialize sched groups cpu_power. | 7004 | * Initialize sched groups cpu_power. |
@@ -6936,11 +7012,6 @@ static void free_sched_groups(const struct cpumask *cpu_map, | |||
6936 | */ | 7012 | */ |
6937 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) | 7013 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) |
6938 | { | 7014 | { |
6939 | struct sched_domain *child; | ||
6940 | struct sched_group *group; | ||
6941 | long power; | ||
6942 | int weight; | ||
6943 | |||
6944 | WARN_ON(!sd || !sd->groups); | 7015 | WARN_ON(!sd || !sd->groups); |
6945 | 7016 | ||
6946 | if (cpu != group_first_cpu(sd->groups)) | 7017 | if (cpu != group_first_cpu(sd->groups)) |
@@ -6948,36 +7019,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
6948 | 7019 | ||
6949 | sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); | 7020 | sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); |
6950 | 7021 | ||
6951 | child = sd->child; | 7022 | update_group_power(sd, cpu); |
6952 | |||
6953 | sd->groups->cpu_power = 0; | ||
6954 | |||
6955 | if (!child) { | ||
6956 | power = SCHED_LOAD_SCALE; | ||
6957 | weight = cpumask_weight(sched_domain_span(sd)); | ||
6958 | /* | ||
6959 | * SMT siblings share the power of a single core. | ||
6960 | * Usually multiple threads get a better yield out of | ||
6961 | * that one core than a single thread would have, | ||
6962 | * reflect that in sd->smt_gain. | ||
6963 | */ | ||
6964 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | ||
6965 | power *= sd->smt_gain; | ||
6966 | power /= weight; | ||
6967 | power >>= SCHED_LOAD_SHIFT; | ||
6968 | } | ||
6969 | sd->groups->cpu_power += power; | ||
6970 | return; | ||
6971 | } | ||
6972 | |||
6973 | /* | ||
6974 | * Add cpu_power of each child group to this groups cpu_power. | ||
6975 | */ | ||
6976 | group = child->groups; | ||
6977 | do { | ||
6978 | sd->groups->cpu_power += group->cpu_power; | ||
6979 | group = group->next; | ||
6980 | } while (group != child->groups); | ||
6981 | } | 7023 | } |
6982 | 7024 | ||
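The per-group power computation that used to live here (the SD_SHARE_CPUPOWER branch in the removed lines) is now handled by update_group_power() in the fair-class code. Purely as a worked example of the arithmetic the old branch performed, assuming SCHED_LOAD_SHIFT of 10 and an smt_gain of 1178, roughly 1.15 * SCHED_LOAD_SCALE (both values are assumptions here, not taken from this hunk):

#include <stdio.h>

#define SCHED_LOAD_SHIFT	10
#define SCHED_LOAD_SCALE	(1L << SCHED_LOAD_SHIFT)

int main(void)
{
	long smt_gain = 1178;	/* assumed: ~1.15 * SCHED_LOAD_SCALE */
	long weight = 2;	/* two hardware threads in the core */
	long power = SCHED_LOAD_SCALE;

	/* the computation the removed SD_SHARE_CPUPOWER branch did */
	power *= smt_gain;
	power /= weight;
	power >>= SCHED_LOAD_SHIFT;

	/* prints 589 per sibling, 1178 for the whole core */
	printf("per-sibling %ld, core total %ld\n", power, power * weight);
	return 0;
}

Two siblings then advertise 589 each, 1178 for the core, which captures the "SMT siblings share one core plus a small gain" idea the removed comment described.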
6983 | /* | 7025 | /* |
@@ -6991,15 +7033,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
6991 | # define SD_INIT_NAME(sd, type) do { } while (0) | 7033 | # define SD_INIT_NAME(sd, type) do { } while (0) |
6992 | #endif | 7034 | #endif |
6993 | 7035 | ||
6994 | #define SD_INIT(sd, type) sd_init_##type(sd) | 7036 | #define SD_INIT_FUNC(type) \ |
6995 | 7037 | static noinline struct sched_domain * \ | |
6996 | #define SD_INIT_FUNC(type) \ | 7038 | sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ |
6997 | static noinline void sd_init_##type(struct sched_domain *sd) \ | 7039 | { \ |
6998 | { \ | 7040 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ |
6999 | memset(sd, 0, sizeof(*sd)); \ | 7041 | *sd = SD_##type##_INIT; \ |
7000 | *sd = SD_##type##_INIT; \ | 7042 | SD_INIT_NAME(sd, type); \ |
7001 | sd->level = SD_LV_##type; \ | 7043 | sd->private = &tl->data; \ |
7002 | SD_INIT_NAME(sd, type); \ | 7044 | return sd; \ |
7003 | } | 7045 | } |
7004 | 7046 | ||
7005 | SD_INIT_FUNC(CPU) | 7047 | SD_INIT_FUNC(CPU) |
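SD_INIT_FUNC() now generates constructors that hand back the preallocated per-cpu sched_domain owned by the topology level (tl->data) and record that owner in sd->private, instead of memset()ing a caller-supplied domain. The following self-contained sketch mirrors the token-pasting pattern with toy types; none of the names below are the kernel's.

#include <stdio.h>

/* toy stand-ins, only to show the shape of the generated functions */
struct domain { const char *name; int flags; void *private; };
struct level_data { struct domain storage[4]; };	/* "per-cpu" slots */
struct topo_level { struct level_data data; };

#define DOM_CPU_INIT	{ .flags = 0x01 }
#define DOM_MC_INIT	{ .flags = 0x02 }

#define DOM_INIT_FUNC(type)						\
static struct domain *							\
dom_init_##type(struct topo_level *tl, int cpu)				\
{									\
	struct domain *d = &tl->data.storage[cpu];			\
	*d = (struct domain)DOM_##type##_INIT;				\
	d->name = #type;						\
	d->private = &tl->data;						\
	return d;							\
}

DOM_INIT_FUNC(CPU)
DOM_INIT_FUNC(MC)

int main(void)
{
	struct topo_level tl = { 0 };
	struct domain *d = dom_init_CPU(&tl, 1);

	printf("%s flags=%#x\n", d->name, d->flags);
	d = dom_init_MC(&tl, 1);
	printf("%s flags=%#x\n", d->name, d->flags);
	return 0;
}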
@@ -7018,13 +7060,14 @@ SD_INIT_FUNC(CPU) | |||
7018 | #endif | 7060 | #endif |
7019 | 7061 | ||
7020 | static int default_relax_domain_level = -1; | 7062 | static int default_relax_domain_level = -1; |
7063 | int sched_domain_level_max; | ||
7021 | 7064 | ||
7022 | static int __init setup_relax_domain_level(char *str) | 7065 | static int __init setup_relax_domain_level(char *str) |
7023 | { | 7066 | { |
7024 | unsigned long val; | 7067 | unsigned long val; |
7025 | 7068 | ||
7026 | val = simple_strtoul(str, NULL, 0); | 7069 | val = simple_strtoul(str, NULL, 0); |
7027 | if (val < SD_LV_MAX) | 7070 | if (val < sched_domain_level_max) |
7028 | default_relax_domain_level = val; | 7071 | default_relax_domain_level = val; |
7029 | 7072 | ||
7030 | return 1; | 7073 | return 1; |
@@ -7052,37 +7095,20 @@ static void set_domain_attribute(struct sched_domain *sd, | |||
7052 | } | 7095 | } |
7053 | } | 7096 | } |
7054 | 7097 | ||
7098 | static void __sdt_free(const struct cpumask *cpu_map); | ||
7099 | static int __sdt_alloc(const struct cpumask *cpu_map); | ||
7100 | |||
7055 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | 7101 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, |
7056 | const struct cpumask *cpu_map) | 7102 | const struct cpumask *cpu_map) |
7057 | { | 7103 | { |
7058 | switch (what) { | 7104 | switch (what) { |
7059 | case sa_sched_groups: | ||
7060 | free_sched_groups(cpu_map, d->tmpmask); /* fall through */ | ||
7061 | d->sched_group_nodes = NULL; | ||
7062 | case sa_rootdomain: | 7105 | case sa_rootdomain: |
7063 | free_rootdomain(d->rd); /* fall through */ | 7106 | if (!atomic_read(&d->rd->refcount)) |
7064 | case sa_tmpmask: | 7107 | free_rootdomain(&d->rd->rcu); /* fall through */ |
7065 | free_cpumask_var(d->tmpmask); /* fall through */ | 7108 | case sa_sd: |
7066 | case sa_send_covered: | 7109 | free_percpu(d->sd); /* fall through */ |
7067 | free_cpumask_var(d->send_covered); /* fall through */ | 7110 | case sa_sd_storage: |
7068 | case sa_this_book_map: | 7111 | __sdt_free(cpu_map); /* fall through */ |
7069 | free_cpumask_var(d->this_book_map); /* fall through */ | ||
7070 | case sa_this_core_map: | ||
7071 | free_cpumask_var(d->this_core_map); /* fall through */ | ||
7072 | case sa_this_sibling_map: | ||
7073 | free_cpumask_var(d->this_sibling_map); /* fall through */ | ||
7074 | case sa_nodemask: | ||
7075 | free_cpumask_var(d->nodemask); /* fall through */ | ||
7076 | case sa_sched_group_nodes: | ||
7077 | #ifdef CONFIG_NUMA | ||
7078 | kfree(d->sched_group_nodes); /* fall through */ | ||
7079 | case sa_notcovered: | ||
7080 | free_cpumask_var(d->notcovered); /* fall through */ | ||
7081 | case sa_covered: | ||
7082 | free_cpumask_var(d->covered); /* fall through */ | ||
7083 | case sa_domainspan: | ||
7084 | free_cpumask_var(d->domainspan); /* fall through */ | ||
7085 | #endif | ||
7086 | case sa_none: | 7112 | case sa_none: |
7087 | break; | 7113 | break; |
7088 | } | 7114 | } |
@@ -7091,308 +7117,212 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | |||
7091 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, | 7117 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, |
7092 | const struct cpumask *cpu_map) | 7118 | const struct cpumask *cpu_map) |
7093 | { | 7119 | { |
7094 | #ifdef CONFIG_NUMA | 7120 | memset(d, 0, sizeof(*d)); |
7095 | if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) | 7121 | |
7096 | return sa_none; | 7122 | if (__sdt_alloc(cpu_map)) |
7097 | if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) | 7123 | return sa_sd_storage; |
7098 | return sa_domainspan; | 7124 | d->sd = alloc_percpu(struct sched_domain *); |
7099 | if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) | 7125 | if (!d->sd) |
7100 | return sa_covered; | 7126 | return sa_sd_storage; |
7101 | /* Allocate the per-node list of sched groups */ | ||
7102 | d->sched_group_nodes = kcalloc(nr_node_ids, | ||
7103 | sizeof(struct sched_group *), GFP_KERNEL); | ||
7104 | if (!d->sched_group_nodes) { | ||
7105 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | ||
7106 | return sa_notcovered; | ||
7107 | } | ||
7108 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; | ||
7109 | #endif | ||
7110 | if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) | ||
7111 | return sa_sched_group_nodes; | ||
7112 | if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) | ||
7113 | return sa_nodemask; | ||
7114 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) | ||
7115 | return sa_this_sibling_map; | ||
7116 | if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL)) | ||
7117 | return sa_this_core_map; | ||
7118 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | ||
7119 | return sa_this_book_map; | ||
7120 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) | ||
7121 | return sa_send_covered; | ||
7122 | d->rd = alloc_rootdomain(); | 7127 | d->rd = alloc_rootdomain(); |
7123 | if (!d->rd) { | 7128 | if (!d->rd) |
7124 | printk(KERN_WARNING "Cannot alloc root domain\n"); | 7129 | return sa_sd; |
7125 | return sa_tmpmask; | ||
7126 | } | ||
7127 | return sa_rootdomain; | 7130 | return sa_rootdomain; |
7128 | } | 7131 | } |
7129 | 7132 | ||
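Allocation and teardown still follow the "how far did we get" pattern, just with three stages instead of a dozen: __visit_domain_allocation_hell() returns the stage reached, and __free_domain_allocs() switches on it and falls through so only what was actually allocated is undone. A minimal userspace sketch of that enum-plus-fall-through idiom, with invented stage names:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum s_alloc { sa_none, sa_storage, sa_all };

struct state { void *storage; void *table; };

static void free_allocs(struct state *s, enum s_alloc what)
{
	switch (what) {
	case sa_all:
		free(s->table);		/* fall through */
	case sa_storage:
		free(s->storage);	/* fall through */
	case sa_none:
		break;
	}
}

static enum s_alloc alloc_all(struct state *s)
{
	memset(s, 0, sizeof(*s));

	s->storage = malloc(64);
	if (!s->storage)
		return sa_none;		/* nothing to undo */
	s->table = malloc(64);
	if (!s->table)
		return sa_storage;	/* undo the storage only */
	return sa_all;
}

int main(void)
{
	struct state s;
	enum s_alloc got = alloc_all(&s);

	if (got != sa_all) {
		free_allocs(&s, got);
		return 1;
	}
	/* ... build something with s ... */
	free_allocs(&s, sa_all);
	return 0;
}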
7130 | static struct sched_domain *__build_numa_sched_domains(struct s_data *d, | 7133 | /* |
7131 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) | 7134 | * NULL the sd_data elements we've used to build the sched_domain and |
7135 | * sched_group structure so that the subsequent __free_domain_allocs() | ||
7136 | * will not free the data we're using. | ||
7137 | */ | ||
7138 | static void claim_allocations(int cpu, struct sched_domain *sd) | ||
7132 | { | 7139 | { |
7133 | struct sched_domain *sd = NULL; | 7140 | struct sd_data *sdd = sd->private; |
7134 | #ifdef CONFIG_NUMA | 7141 | struct sched_group *sg = sd->groups; |
7135 | struct sched_domain *parent; | ||
7136 | |||
7137 | d->sd_allnodes = 0; | ||
7138 | if (cpumask_weight(cpu_map) > | ||
7139 | SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { | ||
7140 | sd = &per_cpu(allnodes_domains, i).sd; | ||
7141 | SD_INIT(sd, ALLNODES); | ||
7142 | set_domain_attribute(sd, attr); | ||
7143 | cpumask_copy(sched_domain_span(sd), cpu_map); | ||
7144 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7145 | d->sd_allnodes = 1; | ||
7146 | } | ||
7147 | parent = sd; | ||
7148 | |||
7149 | sd = &per_cpu(node_domains, i).sd; | ||
7150 | SD_INIT(sd, NODE); | ||
7151 | set_domain_attribute(sd, attr); | ||
7152 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | ||
7153 | sd->parent = parent; | ||
7154 | if (parent) | ||
7155 | parent->child = sd; | ||
7156 | cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); | ||
7157 | #endif | ||
7158 | return sd; | ||
7159 | } | ||
7160 | 7142 | ||
7161 | static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, | 7143 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); |
7162 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7144 | *per_cpu_ptr(sdd->sd, cpu) = NULL; |
7163 | struct sched_domain *parent, int i) | ||
7164 | { | ||
7165 | struct sched_domain *sd; | ||
7166 | sd = &per_cpu(phys_domains, i).sd; | ||
7167 | SD_INIT(sd, CPU); | ||
7168 | set_domain_attribute(sd, attr); | ||
7169 | cpumask_copy(sched_domain_span(sd), d->nodemask); | ||
7170 | sd->parent = parent; | ||
7171 | if (parent) | ||
7172 | parent->child = sd; | ||
7173 | cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7174 | return sd; | ||
7175 | } | ||
7176 | 7145 | ||
7177 | static struct sched_domain *__build_book_sched_domain(struct s_data *d, | 7146 | if (cpu == cpumask_first(sched_group_cpus(sg))) { |
7178 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7147 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg); |
7179 | struct sched_domain *parent, int i) | 7148 | *per_cpu_ptr(sdd->sg, cpu) = NULL; |
7180 | { | 7149 | } |
7181 | struct sched_domain *sd = parent; | ||
7182 | #ifdef CONFIG_SCHED_BOOK | ||
7183 | sd = &per_cpu(book_domains, i).sd; | ||
7184 | SD_INIT(sd, BOOK); | ||
7185 | set_domain_attribute(sd, attr); | ||
7186 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); | ||
7187 | sd->parent = parent; | ||
7188 | parent->child = sd; | ||
7189 | cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7190 | #endif | ||
7191 | return sd; | ||
7192 | } | 7150 | } |
7193 | 7151 | ||
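claim_allocations() is what lets the generic teardown stay simple: once a per-cpu sched_domain or sched_group is wired into the live topology, its slot in sd_data is set to NULL, so a later __sdt_free() over the whole cpu_map ends up kfree()ing NULL for the in-use entries. The sketch below illustrates that "steal the pointer from the bulk-free list" idea with a plain array instead of per-cpu data; all names are invented.

#include <stdio.h>
#include <stdlib.h>

#define NR_SLOTS 4

/* bulk teardown: freeing NULL is a no-op, so claimed slots survive */
static void free_unclaimed(void *slots[], int n)
{
	int i;

	for (i = 0; i < n; i++) {
		free(slots[i]);
		slots[i] = NULL;
	}
}

/* take ownership of one slot so free_unclaimed() won't touch it */
static void *claim(void *slots[], int i)
{
	void *p = slots[i];

	slots[i] = NULL;
	return p;
}

int main(void)
{
	void *slots[NR_SLOTS];
	void *in_use;
	int i;

	for (i = 0; i < NR_SLOTS; i++)
		slots[i] = malloc(32);

	in_use = claim(slots, 2);	/* like claim_allocations() */
	free_unclaimed(slots, NR_SLOTS);

	/* the claimed slot is still valid here */
	free(in_use);
	return 0;
}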
7194 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, | 7152 | #ifdef CONFIG_SCHED_SMT |
7195 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7153 | static const struct cpumask *cpu_smt_mask(int cpu) |
7196 | struct sched_domain *parent, int i) | ||
7197 | { | 7154 | { |
7198 | struct sched_domain *sd = parent; | 7155 | return topology_thread_cpumask(cpu); |
7199 | #ifdef CONFIG_SCHED_MC | ||
7200 | sd = &per_cpu(core_domains, i).sd; | ||
7201 | SD_INIT(sd, MC); | ||
7202 | set_domain_attribute(sd, attr); | ||
7203 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); | ||
7204 | sd->parent = parent; | ||
7205 | parent->child = sd; | ||
7206 | cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7207 | #endif | ||
7208 | return sd; | ||
7209 | } | 7156 | } |
7210 | |||
7211 | static struct sched_domain *__build_smt_sched_domain(struct s_data *d, | ||
7212 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
7213 | struct sched_domain *parent, int i) | ||
7214 | { | ||
7215 | struct sched_domain *sd = parent; | ||
7216 | #ifdef CONFIG_SCHED_SMT | ||
7217 | sd = &per_cpu(cpu_domains, i).sd; | ||
7218 | SD_INIT(sd, SIBLING); | ||
7219 | set_domain_attribute(sd, attr); | ||
7220 | cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); | ||
7221 | sd->parent = parent; | ||
7222 | parent->child = sd; | ||
7223 | cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7224 | #endif | 7157 | #endif |
7225 | return sd; | ||
7226 | } | ||
7227 | 7158 | ||
7228 | static void build_sched_groups(struct s_data *d, enum sched_domain_level l, | 7159 | /* |
7229 | const struct cpumask *cpu_map, int cpu) | 7160 | * Topology list, bottom-up. |
7230 | { | 7161 | */ |
7231 | switch (l) { | 7162 | static struct sched_domain_topology_level default_topology[] = { |
7232 | #ifdef CONFIG_SCHED_SMT | 7163 | #ifdef CONFIG_SCHED_SMT |
7233 | case SD_LV_SIBLING: /* set up CPU (sibling) groups */ | 7164 | { sd_init_SIBLING, cpu_smt_mask, }, |
7234 | cpumask_and(d->this_sibling_map, cpu_map, | ||
7235 | topology_thread_cpumask(cpu)); | ||
7236 | if (cpu == cpumask_first(d->this_sibling_map)) | ||
7237 | init_sched_build_groups(d->this_sibling_map, cpu_map, | ||
7238 | &cpu_to_cpu_group, | ||
7239 | d->send_covered, d->tmpmask); | ||
7240 | break; | ||
7241 | #endif | 7165 | #endif |
7242 | #ifdef CONFIG_SCHED_MC | 7166 | #ifdef CONFIG_SCHED_MC |
7243 | case SD_LV_MC: /* set up multi-core groups */ | 7167 | { sd_init_MC, cpu_coregroup_mask, }, |
7244 | cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); | ||
7245 | if (cpu == cpumask_first(d->this_core_map)) | ||
7246 | init_sched_build_groups(d->this_core_map, cpu_map, | ||
7247 | &cpu_to_core_group, | ||
7248 | d->send_covered, d->tmpmask); | ||
7249 | break; | ||
7250 | #endif | 7168 | #endif |
7251 | #ifdef CONFIG_SCHED_BOOK | 7169 | #ifdef CONFIG_SCHED_BOOK |
7252 | case SD_LV_BOOK: /* set up book groups */ | 7170 | { sd_init_BOOK, cpu_book_mask, }, |
7253 | cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu)); | ||
7254 | if (cpu == cpumask_first(d->this_book_map)) | ||
7255 | init_sched_build_groups(d->this_book_map, cpu_map, | ||
7256 | &cpu_to_book_group, | ||
7257 | d->send_covered, d->tmpmask); | ||
7258 | break; | ||
7259 | #endif | 7171 | #endif |
7260 | case SD_LV_CPU: /* set up physical groups */ | 7172 | { sd_init_CPU, cpu_cpu_mask, }, |
7261 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); | ||
7262 | if (!cpumask_empty(d->nodemask)) | ||
7263 | init_sched_build_groups(d->nodemask, cpu_map, | ||
7264 | &cpu_to_phys_group, | ||
7265 | d->send_covered, d->tmpmask); | ||
7266 | break; | ||
7267 | #ifdef CONFIG_NUMA | 7173 | #ifdef CONFIG_NUMA |
7268 | case SD_LV_ALLNODES: | 7174 | { sd_init_NODE, cpu_node_mask, }, |
7269 | init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, | 7175 | { sd_init_ALLNODES, cpu_allnodes_mask, }, |
7270 | d->send_covered, d->tmpmask); | ||
7271 | break; | ||
7272 | #endif | 7176 | #endif |
7273 | default: | 7177 | { NULL, }, |
7274 | break; | 7178 | }; |
7179 | |||
7180 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; | ||
7181 | |||
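default_topology above replaces the per-level #ifdef chains with data: a NULL-terminated, bottom-up array of { init, mask } pairs that __sdt_alloc(), __sdt_free() and build_sched_domains() can walk without knowing how many levels exist, and that could be overridden by pointing sched_domain_topology at another table of the same shape. A toy consumer of such a table follows; the callbacks are stand-ins, not the real sd_init_*() or cpu_*_mask() helpers.

#include <stdio.h>

struct level {
	const char *(*init)(int cpu);	/* stand-in for tl->init */
	int (*mask)(int cpu);		/* stand-in for tl->mask */
};

static const char *init_smt(int cpu) { return "SMT"; }
static const char *init_mc(int cpu)  { return "MC"; }
static const char *init_cpu(int cpu) { return "CPU"; }

static int mask_smt(int cpu) { return cpu & ~1; }	/* sibling pair */
static int mask_mc(int cpu)  { return cpu & ~3; }	/* 4-thread core group */
static int mask_cpu(int cpu) { return 0; }		/* whole package */

/* bottom-up, NULL-terminated, like default_topology */
static struct level topology[] = {
	{ init_smt, mask_smt },
	{ init_mc,  mask_mc  },
	{ init_cpu, mask_cpu },
	{ NULL, },
};

int main(void)
{
	struct level *tl;
	int cpu = 5;

	for (tl = topology; tl->init; tl++)
		printf("cpu %d: level %s, span starts at cpu %d\n",
		       cpu, tl->init(cpu), tl->mask(cpu));
	return 0;
}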
7182 | static int __sdt_alloc(const struct cpumask *cpu_map) | ||
7183 | { | ||
7184 | struct sched_domain_topology_level *tl; | ||
7185 | int j; | ||
7186 | |||
7187 | for (tl = sched_domain_topology; tl->init; tl++) { | ||
7188 | struct sd_data *sdd = &tl->data; | ||
7189 | |||
7190 | sdd->sd = alloc_percpu(struct sched_domain *); | ||
7191 | if (!sdd->sd) | ||
7192 | return -ENOMEM; | ||
7193 | |||
7194 | sdd->sg = alloc_percpu(struct sched_group *); | ||
7195 | if (!sdd->sg) | ||
7196 | return -ENOMEM; | ||
7197 | |||
7198 | for_each_cpu(j, cpu_map) { | ||
7199 | struct sched_domain *sd; | ||
7200 | struct sched_group *sg; | ||
7201 | |||
7202 | sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), | ||
7203 | GFP_KERNEL, cpu_to_node(j)); | ||
7204 | if (!sd) | ||
7205 | return -ENOMEM; | ||
7206 | |||
7207 | *per_cpu_ptr(sdd->sd, j) = sd; | ||
7208 | |||
7209 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
7210 | GFP_KERNEL, cpu_to_node(j)); | ||
7211 | if (!sg) | ||
7212 | return -ENOMEM; | ||
7213 | |||
7214 | *per_cpu_ptr(sdd->sg, j) = sg; | ||
7215 | } | ||
7216 | } | ||
7217 | |||
7218 | return 0; | ||
7219 | } | ||
7220 | |||
7221 | static void __sdt_free(const struct cpumask *cpu_map) | ||
7222 | { | ||
7223 | struct sched_domain_topology_level *tl; | ||
7224 | int j; | ||
7225 | |||
7226 | for (tl = sched_domain_topology; tl->init; tl++) { | ||
7227 | struct sd_data *sdd = &tl->data; | ||
7228 | |||
7229 | for_each_cpu(j, cpu_map) { | ||
7230 | kfree(*per_cpu_ptr(sdd->sd, j)); | ||
7231 | kfree(*per_cpu_ptr(sdd->sg, j)); | ||
7232 | } | ||
7233 | free_percpu(sdd->sd); | ||
7234 | free_percpu(sdd->sg); | ||
7275 | } | 7235 | } |
7276 | } | 7236 | } |
7277 | 7237 | ||
7238 | struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | ||
7239 | struct s_data *d, const struct cpumask *cpu_map, | ||
7240 | struct sched_domain_attr *attr, struct sched_domain *child, | ||
7241 | int cpu) | ||
7242 | { | ||
7243 | struct sched_domain *sd = tl->init(tl, cpu); | ||
7244 | if (!sd) | ||
7245 | return child; | ||
7246 | |||
7247 | set_domain_attribute(sd, attr); | ||
7248 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | ||
7249 | if (child) { | ||
7250 | sd->level = child->level + 1; | ||
7251 | sched_domain_level_max = max(sched_domain_level_max, sd->level); | ||
7252 | child->parent = sd; | ||
7253 | } | ||
7254 | sd->child = child; | ||
7255 | |||
7256 | return sd; | ||
7257 | } | ||
7258 | |||
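build_sched_domain() stacks one level on top of whatever has been built so far: it spans the new domain, links child->parent and sd->child, and bumps sched_domain_level_max as the hierarchy grows. A compact sketch of that bottom-up chaining, with a trivial node type in place of struct sched_domain:

#include <stdio.h>
#include <stdlib.h>

struct dom {
	const char *name;
	int level;
	struct dom *parent, *child;
};

static int level_max;

/* stack a new level on top of 'child', as build_sched_domain() does */
static struct dom *stack_level(const char *name, struct dom *child)
{
	struct dom *d = calloc(1, sizeof(*d));

	d->name = name;
	if (child) {
		d->level = child->level + 1;
		if (d->level > level_max)
			level_max = d->level;
		child->parent = d;
	}
	d->child = child;
	return d;
}

int main(void)
{
	const char *levels[] = { "SIBLING", "MC", "CPU", "NODE" };
	struct dom *sd = NULL, *d;
	unsigned int i;

	for (i = 0; i < sizeof(levels) / sizeof(levels[0]); i++)
		sd = stack_level(levels[i], sd);

	/* walk back down to the bottom, as build_sched_domains() does */
	for (d = sd; d->child; d = d->child)
		;
	printf("bottom level: %s, max level: %d\n", d->name, level_max);
	return 0;
}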
7278 | /* | 7259 | /* |
7279 | * Build sched domains for a given set of cpus and attach the sched domains | 7260 | * Build sched domains for a given set of cpus and attach the sched domains |
7280 | * to the individual cpus | 7261 | * to the individual cpus |
7281 | */ | 7262 | */ |
7282 | static int __build_sched_domains(const struct cpumask *cpu_map, | 7263 | static int build_sched_domains(const struct cpumask *cpu_map, |
7283 | struct sched_domain_attr *attr) | 7264 | struct sched_domain_attr *attr) |
7284 | { | 7265 | { |
7285 | enum s_alloc alloc_state = sa_none; | 7266 | enum s_alloc alloc_state = sa_none; |
7286 | struct s_data d; | ||
7287 | struct sched_domain *sd; | 7267 | struct sched_domain *sd; |
7288 | int i; | 7268 | struct s_data d; |
7289 | #ifdef CONFIG_NUMA | 7269 | int i, ret = -ENOMEM; |
7290 | d.sd_allnodes = 0; | ||
7291 | #endif | ||
7292 | 7270 | ||
7293 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); | 7271 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); |
7294 | if (alloc_state != sa_rootdomain) | 7272 | if (alloc_state != sa_rootdomain) |
7295 | goto error; | 7273 | goto error; |
7296 | alloc_state = sa_sched_groups; | ||
7297 | 7274 | ||
7298 | /* | 7275 | /* Set up domains for cpus specified by the cpu_map. */ |
7299 | * Set up domains for cpus specified by the cpu_map. | ||
7300 | */ | ||
7301 | for_each_cpu(i, cpu_map) { | 7276 | for_each_cpu(i, cpu_map) { |
7302 | cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), | 7277 | struct sched_domain_topology_level *tl; |
7303 | cpu_map); | ||
7304 | |||
7305 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); | ||
7306 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); | ||
7307 | sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); | ||
7308 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); | ||
7309 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); | ||
7310 | } | ||
7311 | |||
7312 | for_each_cpu(i, cpu_map) { | ||
7313 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); | ||
7314 | build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); | ||
7315 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); | ||
7316 | } | ||
7317 | 7278 | ||
7318 | /* Set up physical groups */ | 7279 | sd = NULL; |
7319 | for (i = 0; i < nr_node_ids; i++) | 7280 | for (tl = sched_domain_topology; tl->init; tl++) |
7320 | build_sched_groups(&d, SD_LV_CPU, cpu_map, i); | 7281 | sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); |
7321 | 7282 | ||
7322 | #ifdef CONFIG_NUMA | 7283 | while (sd->child) |
7323 | /* Set up node groups */ | 7284 | sd = sd->child; |
7324 | if (d.sd_allnodes) | ||
7325 | build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); | ||
7326 | |||
7327 | for (i = 0; i < nr_node_ids; i++) | ||
7328 | if (build_numa_sched_groups(&d, cpu_map, i)) | ||
7329 | goto error; | ||
7330 | #endif | ||
7331 | 7285 | ||
7332 | /* Calculate CPU power for physical packages and nodes */ | 7286 | *per_cpu_ptr(d.sd, i) = sd; |
7333 | #ifdef CONFIG_SCHED_SMT | ||
7334 | for_each_cpu(i, cpu_map) { | ||
7335 | sd = &per_cpu(cpu_domains, i).sd; | ||
7336 | init_sched_groups_power(i, sd); | ||
7337 | } | ||
7338 | #endif | ||
7339 | #ifdef CONFIG_SCHED_MC | ||
7340 | for_each_cpu(i, cpu_map) { | ||
7341 | sd = &per_cpu(core_domains, i).sd; | ||
7342 | init_sched_groups_power(i, sd); | ||
7343 | } | ||
7344 | #endif | ||
7345 | #ifdef CONFIG_SCHED_BOOK | ||
7346 | for_each_cpu(i, cpu_map) { | ||
7347 | sd = &per_cpu(book_domains, i).sd; | ||
7348 | init_sched_groups_power(i, sd); | ||
7349 | } | 7287 | } |
7350 | #endif | ||
7351 | 7288 | ||
7289 | /* Build the groups for the domains */ | ||
7352 | for_each_cpu(i, cpu_map) { | 7290 | for_each_cpu(i, cpu_map) { |
7353 | sd = &per_cpu(phys_domains, i).sd; | 7291 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { |
7354 | init_sched_groups_power(i, sd); | 7292 | sd->span_weight = cpumask_weight(sched_domain_span(sd)); |
7355 | } | 7293 | get_group(i, sd->private, &sd->groups); |
7294 | atomic_inc(&sd->groups->ref); | ||
7356 | 7295 | ||
7357 | #ifdef CONFIG_NUMA | 7296 | if (i != cpumask_first(sched_domain_span(sd))) |
7358 | for (i = 0; i < nr_node_ids; i++) | 7297 | continue; |
7359 | init_numa_sched_groups_power(d.sched_group_nodes[i]); | ||
7360 | 7298 | ||
7361 | if (d.sd_allnodes) { | 7299 | build_sched_groups(sd); |
7362 | struct sched_group *sg; | 7300 | } |
7301 | } | ||
7363 | 7302 | ||
7364 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, | 7303 | /* Calculate CPU power for physical packages and nodes */ |
7365 | d.tmpmask); | 7304 | for (i = nr_cpumask_bits-1; i >= 0; i--) { |
7366 | init_numa_sched_groups_power(sg); | 7305 | if (!cpumask_test_cpu(i, cpu_map)) |
7306 | continue; | ||
7307 | |||
7308 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { | ||
7309 | claim_allocations(i, sd); | ||
7310 | init_sched_groups_power(i, sd); | ||
7311 | } | ||
7367 | } | 7312 | } |
7368 | #endif | ||
7369 | 7313 | ||
7370 | /* Attach the domains */ | 7314 | /* Attach the domains */ |
7315 | rcu_read_lock(); | ||
7371 | for_each_cpu(i, cpu_map) { | 7316 | for_each_cpu(i, cpu_map) { |
7372 | #ifdef CONFIG_SCHED_SMT | 7317 | sd = *per_cpu_ptr(d.sd, i); |
7373 | sd = &per_cpu(cpu_domains, i).sd; | ||
7374 | #elif defined(CONFIG_SCHED_MC) | ||
7375 | sd = &per_cpu(core_domains, i).sd; | ||
7376 | #elif defined(CONFIG_SCHED_BOOK) | ||
7377 | sd = &per_cpu(book_domains, i).sd; | ||
7378 | #else | ||
7379 | sd = &per_cpu(phys_domains, i).sd; | ||
7380 | #endif | ||
7381 | cpu_attach_domain(sd, d.rd, i); | 7318 | cpu_attach_domain(sd, d.rd, i); |
7382 | } | 7319 | } |
7320 | rcu_read_unlock(); | ||
7383 | 7321 | ||
7384 | d.sched_group_nodes = NULL; /* don't free this we still need it */ | 7322 | ret = 0; |
7385 | __free_domain_allocs(&d, sa_tmpmask, cpu_map); | ||
7386 | return 0; | ||
7387 | |||
7388 | error: | 7323 | error: |
7389 | __free_domain_allocs(&d, alloc_state, cpu_map); | 7324 | __free_domain_allocs(&d, alloc_state, cpu_map); |
7390 | return -ENOMEM; | 7325 | return ret; |
7391 | } | ||
7392 | |||
7393 | static int build_sched_domains(const struct cpumask *cpu_map) | ||
7394 | { | ||
7395 | return __build_sched_domains(cpu_map, NULL); | ||
7396 | } | 7326 | } |
7397 | 7327 | ||
7398 | static cpumask_var_t *doms_cur; /* current sched domains */ | 7328 | static cpumask_var_t *doms_cur; /* current sched domains */ |
@@ -7447,7 +7377,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) | |||
7447 | * For now this just excludes isolated cpus, but could be used to | 7377 | * For now this just excludes isolated cpus, but could be used to |
7448 | * exclude other special cases in the future. | 7378 | * exclude other special cases in the future. |
7449 | */ | 7379 | */ |
7450 | static int arch_init_sched_domains(const struct cpumask *cpu_map) | 7380 | static int init_sched_domains(const struct cpumask *cpu_map) |
7451 | { | 7381 | { |
7452 | int err; | 7382 | int err; |
7453 | 7383 | ||
@@ -7458,32 +7388,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map) | |||
7458 | doms_cur = &fallback_doms; | 7388 | doms_cur = &fallback_doms; |
7459 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | 7389 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); |
7460 | dattr_cur = NULL; | 7390 | dattr_cur = NULL; |
7461 | err = build_sched_domains(doms_cur[0]); | 7391 | err = build_sched_domains(doms_cur[0], NULL); |
7462 | register_sched_domain_sysctl(); | 7392 | register_sched_domain_sysctl(); |
7463 | 7393 | ||
7464 | return err; | 7394 | return err; |
7465 | } | 7395 | } |
7466 | 7396 | ||
7467 | static void arch_destroy_sched_domains(const struct cpumask *cpu_map, | ||
7468 | struct cpumask *tmpmask) | ||
7469 | { | ||
7470 | free_sched_groups(cpu_map, tmpmask); | ||
7471 | } | ||
7472 | |||
7473 | /* | 7397 | /* |
7474 | * Detach sched domains from a group of cpus specified in cpu_map | 7398 | * Detach sched domains from a group of cpus specified in cpu_map |
7475 | * These cpus will now be attached to the NULL domain | 7399 | * These cpus will now be attached to the NULL domain |
7476 | */ | 7400 | */ |
7477 | static void detach_destroy_domains(const struct cpumask *cpu_map) | 7401 | static void detach_destroy_domains(const struct cpumask *cpu_map) |
7478 | { | 7402 | { |
7479 | /* Save because hotplug lock held. */ | ||
7480 | static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); | ||
7481 | int i; | 7403 | int i; |
7482 | 7404 | ||
7405 | rcu_read_lock(); | ||
7483 | for_each_cpu(i, cpu_map) | 7406 | for_each_cpu(i, cpu_map) |
7484 | cpu_attach_domain(NULL, &def_root_domain, i); | 7407 | cpu_attach_domain(NULL, &def_root_domain, i); |
7485 | synchronize_sched(); | 7408 | rcu_read_unlock(); |
7486 | arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); | ||
7487 | } | 7409 | } |
7488 | 7410 | ||
7489 | /* handle null as "default" */ | 7411 | /* handle null as "default" */ |
@@ -7572,8 +7494,7 @@ match1: | |||
7572 | goto match2; | 7494 | goto match2; |
7573 | } | 7495 | } |
7574 | /* no match - add a new doms_new */ | 7496 | /* no match - add a new doms_new */ |
7575 | __build_sched_domains(doms_new[i], | 7497 | build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); |
7576 | dattr_new ? dattr_new + i : NULL); | ||
7577 | match2: | 7498 | match2: |
7578 | ; | 7499 | ; |
7579 | } | 7500 | } |
@@ -7592,7 +7513,7 @@ match2: | |||
7592 | } | 7513 | } |
7593 | 7514 | ||
7594 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 7515 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
7595 | static void arch_reinit_sched_domains(void) | 7516 | static void reinit_sched_domains(void) |
7596 | { | 7517 | { |
7597 | get_online_cpus(); | 7518 | get_online_cpus(); |
7598 | 7519 | ||
@@ -7625,7 +7546,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | |||
7625 | else | 7546 | else |
7626 | sched_mc_power_savings = level; | 7547 | sched_mc_power_savings = level; |
7627 | 7548 | ||
7628 | arch_reinit_sched_domains(); | 7549 | reinit_sched_domains(); |
7629 | 7550 | ||
7630 | return count; | 7551 | return count; |
7631 | } | 7552 | } |
@@ -7744,14 +7665,9 @@ void __init sched_init_smp(void) | |||
7744 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); | 7665 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); |
7745 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); | 7666 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); |
7746 | 7667 | ||
7747 | #if defined(CONFIG_NUMA) | ||
7748 | sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), | ||
7749 | GFP_KERNEL); | ||
7750 | BUG_ON(sched_group_nodes_bycpu == NULL); | ||
7751 | #endif | ||
7752 | get_online_cpus(); | 7668 | get_online_cpus(); |
7753 | mutex_lock(&sched_domains_mutex); | 7669 | mutex_lock(&sched_domains_mutex); |
7754 | arch_init_sched_domains(cpu_active_mask); | 7670 | init_sched_domains(cpu_active_mask); |
7755 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); | 7671 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); |
7756 | if (cpumask_empty(non_isolated_cpus)) | 7672 | if (cpumask_empty(non_isolated_cpus)) |
7757 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); | 7673 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); |
@@ -7796,6 +7712,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | |||
7796 | INIT_LIST_HEAD(&cfs_rq->tasks); | 7712 | INIT_LIST_HEAD(&cfs_rq->tasks); |
7797 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7713 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7798 | cfs_rq->rq = rq; | 7714 | cfs_rq->rq = rq; |
7715 | /* allow initial update_cfs_load() to truncate */ | ||
7716 | #ifdef CONFIG_SMP | ||
7717 | cfs_rq->load_stamp = 1; | ||
7718 | #endif | ||
7799 | #endif | 7719 | #endif |
7800 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | 7720 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); |
7801 | } | 7721 | } |
@@ -7997,7 +7917,7 @@ void __init sched_init(void) | |||
7997 | #ifdef CONFIG_SMP | 7917 | #ifdef CONFIG_SMP |
7998 | rq->sd = NULL; | 7918 | rq->sd = NULL; |
7999 | rq->rd = NULL; | 7919 | rq->rd = NULL; |
8000 | rq->cpu_power = SCHED_LOAD_SCALE; | 7920 | rq->cpu_power = SCHED_POWER_SCALE; |
8001 | rq->post_schedule = 0; | 7921 | rq->post_schedule = 0; |
8002 | rq->active_balance = 0; | 7922 | rq->active_balance = 0; |
8003 | rq->next_balance = jiffies; | 7923 | rq->next_balance = jiffies; |
@@ -8054,6 +7974,7 @@ void __init sched_init(void) | |||
8054 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ | 7974 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ |
8055 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); | 7975 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); |
8056 | #ifdef CONFIG_SMP | 7976 | #ifdef CONFIG_SMP |
7977 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); | ||
8057 | #ifdef CONFIG_NO_HZ | 7978 | #ifdef CONFIG_NO_HZ |
8058 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); | 7979 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); |
8059 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); | 7980 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); |
@@ -8074,7 +7995,7 @@ static inline int preempt_count_equals(int preempt_offset) | |||
8074 | { | 7995 | { |
8075 | int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); | 7996 | int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); |
8076 | 7997 | ||
8077 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); | 7998 | return (nested == preempt_offset); |
8078 | } | 7999 | } |
8079 | 8000 | ||
8080 | void __might_sleep(const char *file, int line, int preempt_offset) | 8001 | void __might_sleep(const char *file, int line, int preempt_offset) |
@@ -8109,9 +8030,11 @@ EXPORT_SYMBOL(__might_sleep); | |||
8109 | #ifdef CONFIG_MAGIC_SYSRQ | 8030 | #ifdef CONFIG_MAGIC_SYSRQ |
8110 | static void normalize_task(struct rq *rq, struct task_struct *p) | 8031 | static void normalize_task(struct rq *rq, struct task_struct *p) |
8111 | { | 8032 | { |
8033 | const struct sched_class *prev_class = p->sched_class; | ||
8034 | int old_prio = p->prio; | ||
8112 | int on_rq; | 8035 | int on_rq; |
8113 | 8036 | ||
8114 | on_rq = p->se.on_rq; | 8037 | on_rq = p->on_rq; |
8115 | if (on_rq) | 8038 | if (on_rq) |
8116 | deactivate_task(rq, p, 0); | 8039 | deactivate_task(rq, p, 0); |
8117 | __setscheduler(rq, p, SCHED_NORMAL, 0); | 8040 | __setscheduler(rq, p, SCHED_NORMAL, 0); |
@@ -8119,6 +8042,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p) | |||
8119 | activate_task(rq, p, 0); | 8042 | activate_task(rq, p, 0); |
8120 | resched_task(rq->curr); | 8043 | resched_task(rq->curr); |
8121 | } | 8044 | } |
8045 | |||
8046 | check_class_changed(rq, p, prev_class, old_prio); | ||
8122 | } | 8047 | } |
8123 | 8048 | ||
8124 | void normalize_rt_tasks(void) | 8049 | void normalize_rt_tasks(void) |
@@ -8234,7 +8159,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8234 | { | 8159 | { |
8235 | struct cfs_rq *cfs_rq; | 8160 | struct cfs_rq *cfs_rq; |
8236 | struct sched_entity *se; | 8161 | struct sched_entity *se; |
8237 | struct rq *rq; | ||
8238 | int i; | 8162 | int i; |
8239 | 8163 | ||
8240 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); | 8164 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); |
@@ -8247,8 +8171,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8247 | tg->shares = NICE_0_LOAD; | 8171 | tg->shares = NICE_0_LOAD; |
8248 | 8172 | ||
8249 | for_each_possible_cpu(i) { | 8173 | for_each_possible_cpu(i) { |
8250 | rq = cpu_rq(i); | ||
8251 | |||
8252 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | 8174 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), |
8253 | GFP_KERNEL, cpu_to_node(i)); | 8175 | GFP_KERNEL, cpu_to_node(i)); |
8254 | if (!cfs_rq) | 8176 | if (!cfs_rq) |
@@ -8325,7 +8247,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8325 | { | 8247 | { |
8326 | struct rt_rq *rt_rq; | 8248 | struct rt_rq *rt_rq; |
8327 | struct sched_rt_entity *rt_se; | 8249 | struct sched_rt_entity *rt_se; |
8328 | struct rq *rq; | ||
8329 | int i; | 8250 | int i; |
8330 | 8251 | ||
8331 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); | 8252 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); |
@@ -8339,8 +8260,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8339 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); | 8260 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); |
8340 | 8261 | ||
8341 | for_each_possible_cpu(i) { | 8262 | for_each_possible_cpu(i) { |
8342 | rq = cpu_rq(i); | ||
8343 | |||
8344 | rt_rq = kzalloc_node(sizeof(struct rt_rq), | 8263 | rt_rq = kzalloc_node(sizeof(struct rt_rq), |
8345 | GFP_KERNEL, cpu_to_node(i)); | 8264 | GFP_KERNEL, cpu_to_node(i)); |
8346 | if (!rt_rq) | 8265 | if (!rt_rq) |
@@ -8455,7 +8374,7 @@ void sched_move_task(struct task_struct *tsk) | |||
8455 | rq = task_rq_lock(tsk, &flags); | 8374 | rq = task_rq_lock(tsk, &flags); |
8456 | 8375 | ||
8457 | running = task_current(rq, tsk); | 8376 | running = task_current(rq, tsk); |
8458 | on_rq = tsk->se.on_rq; | 8377 | on_rq = tsk->on_rq; |
8459 | 8378 | ||
8460 | if (on_rq) | 8379 | if (on_rq) |
8461 | dequeue_task(rq, tsk, 0); | 8380 | dequeue_task(rq, tsk, 0); |
@@ -8474,7 +8393,7 @@ void sched_move_task(struct task_struct *tsk) | |||
8474 | if (on_rq) | 8393 | if (on_rq) |
8475 | enqueue_task(rq, tsk, 0); | 8394 | enqueue_task(rq, tsk, 0); |
8476 | 8395 | ||
8477 | task_rq_unlock(rq, &flags); | 8396 | task_rq_unlock(rq, tsk, &flags); |
8478 | } | 8397 | } |
8479 | #endif /* CONFIG_CGROUP_SCHED */ | 8398 | #endif /* CONFIG_CGROUP_SCHED */ |
8480 | 8399 | ||
@@ -8510,7 +8429,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
8510 | /* Propagate contribution to hierarchy */ | 8429 | /* Propagate contribution to hierarchy */ |
8511 | raw_spin_lock_irqsave(&rq->lock, flags); | 8430 | raw_spin_lock_irqsave(&rq->lock, flags); |
8512 | for_each_sched_entity(se) | 8431 | for_each_sched_entity(se) |
8513 | update_cfs_shares(group_cfs_rq(se), 0); | 8432 | update_cfs_shares(group_cfs_rq(se)); |
8514 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 8433 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
8515 | } | 8434 | } |
8516 | 8435 | ||
@@ -8845,46 +8764,15 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
8845 | return 0; | 8764 | return 0; |
8846 | } | 8765 | } |
8847 | 8766 | ||
8848 | static int | ||
8849 | cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | ||
8850 | struct task_struct *tsk, bool threadgroup) | ||
8851 | { | ||
8852 | int retval = cpu_cgroup_can_attach_task(cgrp, tsk); | ||
8853 | if (retval) | ||
8854 | return retval; | ||
8855 | if (threadgroup) { | ||
8856 | struct task_struct *c; | ||
8857 | rcu_read_lock(); | ||
8858 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | ||
8859 | retval = cpu_cgroup_can_attach_task(cgrp, c); | ||
8860 | if (retval) { | ||
8861 | rcu_read_unlock(); | ||
8862 | return retval; | ||
8863 | } | ||
8864 | } | ||
8865 | rcu_read_unlock(); | ||
8866 | } | ||
8867 | return 0; | ||
8868 | } | ||
8869 | |||
8870 | static void | 8767 | static void |
8871 | cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 8768 | cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
8872 | struct cgroup *old_cont, struct task_struct *tsk, | ||
8873 | bool threadgroup) | ||
8874 | { | 8769 | { |
8875 | sched_move_task(tsk); | 8770 | sched_move_task(tsk); |
8876 | if (threadgroup) { | ||
8877 | struct task_struct *c; | ||
8878 | rcu_read_lock(); | ||
8879 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | ||
8880 | sched_move_task(c); | ||
8881 | } | ||
8882 | rcu_read_unlock(); | ||
8883 | } | ||
8884 | } | 8771 | } |
8885 | 8772 | ||
8886 | static void | 8773 | static void |
8887 | cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) | 8774 | cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, |
8775 | struct cgroup *old_cgrp, struct task_struct *task) | ||
8888 | { | 8776 | { |
8889 | /* | 8777 | /* |
8890 | * cgroup_exit() is called in the copy_process() failure path. | 8778 | * cgroup_exit() is called in the copy_process() failure path. |
@@ -8901,14 +8789,14 @@ cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) | |||
8901 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, | 8789 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, |
8902 | u64 shareval) | 8790 | u64 shareval) |
8903 | { | 8791 | { |
8904 | return sched_group_set_shares(cgroup_tg(cgrp), shareval); | 8792 | return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); |
8905 | } | 8793 | } |
8906 | 8794 | ||
8907 | static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) | 8795 | static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) |
8908 | { | 8796 | { |
8909 | struct task_group *tg = cgroup_tg(cgrp); | 8797 | struct task_group *tg = cgroup_tg(cgrp); |
8910 | 8798 | ||
8911 | return (u64) tg->shares; | 8799 | return (u64) scale_load_down(tg->shares); |
8912 | } | 8800 | } |
8913 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 8801 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
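The shares file now converts between the user-visible value and the higher-resolution internal one via scale_load()/scale_load_down(), i.e. a shift by SCHED_LOAD_RESOLUTION bits. The exact value of that constant depends on the configuration, so the 10 used below is only an assumption to show the round trip:

#include <stdio.h>

#define SCHED_LOAD_RESOLUTION	10	/* assumed value, config dependent */
#define scale_load(w)		((unsigned long)(w) << SCHED_LOAD_RESOLUTION)
#define scale_load_down(w)	((unsigned long)(w) >> SCHED_LOAD_RESOLUTION)

int main(void)
{
	unsigned long shareval = 1024;			/* what userspace writes */
	unsigned long internal = scale_load(shareval);	/* stored in tg->shares */

	/* cpu_shares_read_u64() undoes the scaling on the way out */
	printf("user %lu -> internal %lu -> user %lu\n",
	       shareval, internal, scale_load_down(internal));
	return 0;
}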
8914 | 8802 | ||
@@ -8967,8 +8855,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
8967 | .name = "cpu", | 8855 | .name = "cpu", |
8968 | .create = cpu_cgroup_create, | 8856 | .create = cpu_cgroup_create, |
8969 | .destroy = cpu_cgroup_destroy, | 8857 | .destroy = cpu_cgroup_destroy, |
8970 | .can_attach = cpu_cgroup_can_attach, | 8858 | .can_attach_task = cpu_cgroup_can_attach_task, |
8971 | .attach = cpu_cgroup_attach, | 8859 | .attach_task = cpu_cgroup_attach_task, |
8972 | .exit = cpu_cgroup_exit, | 8860 | .exit = cpu_cgroup_exit, |
8973 | .populate = cpu_cgroup_populate, | 8861 | .populate = cpu_cgroup_populate, |
8974 | .subsys_id = cpu_cgroup_subsys_id, | 8862 | .subsys_id = cpu_cgroup_subsys_id, |