diff options
Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 1684 |
1 files changed, 1094 insertions, 590 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index 0cdb50260dbf..d897a524e7d8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -70,10 +70,13 @@ | |||
70 | #include <linux/bootmem.h> | 70 | #include <linux/bootmem.h> |
71 | #include <linux/debugfs.h> | 71 | #include <linux/debugfs.h> |
72 | #include <linux/ctype.h> | 72 | #include <linux/ctype.h> |
73 | #include <linux/ftrace.h> | ||
73 | 74 | ||
74 | #include <asm/tlb.h> | 75 | #include <asm/tlb.h> |
75 | #include <asm/irq_regs.h> | 76 | #include <asm/irq_regs.h> |
76 | 77 | ||
78 | #include "sched_cpupri.h" | ||
79 | |||
77 | /* | 80 | /* |
78 | * Convert user-nice values [ -20 ... 0 ... 19 ] | 81 | * Convert user-nice values [ -20 ... 0 ... 19 ] |
79 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | 82 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], |
@@ -198,14 +201,19 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) | |||
198 | hrtimer_init(&rt_b->rt_period_timer, | 201 | hrtimer_init(&rt_b->rt_period_timer, |
199 | CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 202 | CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
200 | rt_b->rt_period_timer.function = sched_rt_period_timer; | 203 | rt_b->rt_period_timer.function = sched_rt_period_timer; |
201 | rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; | 204 | rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; |
205 | } | ||
206 | |||
207 | static inline int rt_bandwidth_enabled(void) | ||
208 | { | ||
209 | return sysctl_sched_rt_runtime >= 0; | ||
202 | } | 210 | } |
203 | 211 | ||
204 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | 212 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) |
205 | { | 213 | { |
206 | ktime_t now; | 214 | ktime_t now; |
207 | 215 | ||
208 | if (rt_b->rt_runtime == RUNTIME_INF) | 216 | if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) |
209 | return; | 217 | return; |
210 | 218 | ||
211 | if (hrtimer_active(&rt_b->rt_period_timer)) | 219 | if (hrtimer_active(&rt_b->rt_period_timer)) |
@@ -289,15 +297,15 @@ struct task_group root_task_group; | |||
289 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | 297 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); |
290 | /* Default task group's cfs_rq on each cpu */ | 298 | /* Default task group's cfs_rq on each cpu */ |
291 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 299 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; |
292 | #endif | 300 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
293 | 301 | ||
294 | #ifdef CONFIG_RT_GROUP_SCHED | 302 | #ifdef CONFIG_RT_GROUP_SCHED |
295 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | 303 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); |
296 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | 304 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; |
297 | #endif | 305 | #endif /* CONFIG_RT_GROUP_SCHED */ |
298 | #else | 306 | #else /* !CONFIG_USER_SCHED */ |
299 | #define root_task_group init_task_group | 307 | #define root_task_group init_task_group |
300 | #endif | 308 | #endif /* CONFIG_USER_SCHED */ |
301 | 309 | ||
302 | /* task_group_lock serializes add/remove of task groups and also changes to | 310 | /* task_group_lock serializes add/remove of task groups and also changes to |
303 | * a task group's cpu shares. | 311 | * a task group's cpu shares. |
@@ -307,9 +315,9 @@ static DEFINE_SPINLOCK(task_group_lock); | |||
307 | #ifdef CONFIG_FAIR_GROUP_SCHED | 315 | #ifdef CONFIG_FAIR_GROUP_SCHED |
308 | #ifdef CONFIG_USER_SCHED | 316 | #ifdef CONFIG_USER_SCHED |
309 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) | 317 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) |
310 | #else | 318 | #else /* !CONFIG_USER_SCHED */ |
311 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD | 319 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD |
312 | #endif | 320 | #endif /* CONFIG_USER_SCHED */ |
313 | 321 | ||
314 | /* | 322 | /* |
315 | * A weight of 0 or 1 can cause arithmetics problems. | 323 | * A weight of 0 or 1 can cause arithmetics problems. |
@@ -363,6 +371,10 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) | |||
363 | #else | 371 | #else |
364 | 372 | ||
365 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } | 373 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } |
374 | static inline struct task_group *task_group(struct task_struct *p) | ||
375 | { | ||
376 | return NULL; | ||
377 | } | ||
366 | 378 | ||
367 | #endif /* CONFIG_GROUP_SCHED */ | 379 | #endif /* CONFIG_GROUP_SCHED */ |
368 | 380 | ||
@@ -373,6 +385,7 @@ struct cfs_rq { | |||
373 | 385 | ||
374 | u64 exec_clock; | 386 | u64 exec_clock; |
375 | u64 min_vruntime; | 387 | u64 min_vruntime; |
388 | u64 pair_start; | ||
376 | 389 | ||
377 | struct rb_root tasks_timeline; | 390 | struct rb_root tasks_timeline; |
378 | struct rb_node *rb_leftmost; | 391 | struct rb_node *rb_leftmost; |
@@ -401,6 +414,31 @@ struct cfs_rq { | |||
401 | */ | 414 | */ |
402 | struct list_head leaf_cfs_rq_list; | 415 | struct list_head leaf_cfs_rq_list; |
403 | struct task_group *tg; /* group that "owns" this runqueue */ | 416 | struct task_group *tg; /* group that "owns" this runqueue */ |
417 | |||
418 | #ifdef CONFIG_SMP | ||
419 | /* | ||
420 | * the part of load.weight contributed by tasks | ||
421 | */ | ||
422 | unsigned long task_weight; | ||
423 | |||
424 | /* | ||
425 | * h_load = weight * f(tg) | ||
426 | * | ||
427 | * Where f(tg) is the recursive weight fraction assigned to | ||
428 | * this group. | ||
429 | */ | ||
430 | unsigned long h_load; | ||
431 | |||
432 | /* | ||
433 | * this cpu's part of tg->shares | ||
434 | */ | ||
435 | unsigned long shares; | ||
436 | |||
437 | /* | ||
438 | * load.weight at the time we set shares | ||
439 | */ | ||
440 | unsigned long rq_weight; | ||
441 | #endif | ||
404 | #endif | 442 | #endif |
405 | }; | 443 | }; |
406 | 444 | ||
@@ -452,6 +490,9 @@ struct root_domain { | |||
452 | */ | 490 | */ |
453 | cpumask_t rto_mask; | 491 | cpumask_t rto_mask; |
454 | atomic_t rto_count; | 492 | atomic_t rto_count; |
493 | #ifdef CONFIG_SMP | ||
494 | struct cpupri cpupri; | ||
495 | #endif | ||
455 | }; | 496 | }; |
456 | 497 | ||
457 | /* | 498 | /* |
@@ -526,14 +567,19 @@ struct rq { | |||
526 | int push_cpu; | 567 | int push_cpu; |
527 | /* cpu of this runqueue: */ | 568 | /* cpu of this runqueue: */ |
528 | int cpu; | 569 | int cpu; |
570 | int online; | ||
571 | |||
572 | unsigned long avg_load_per_task; | ||
529 | 573 | ||
530 | struct task_struct *migration_thread; | 574 | struct task_struct *migration_thread; |
531 | struct list_head migration_queue; | 575 | struct list_head migration_queue; |
532 | #endif | 576 | #endif |
533 | 577 | ||
534 | #ifdef CONFIG_SCHED_HRTICK | 578 | #ifdef CONFIG_SCHED_HRTICK |
535 | unsigned long hrtick_flags; | 579 | #ifdef CONFIG_SMP |
536 | ktime_t hrtick_expire; | 580 | int hrtick_csd_pending; |
581 | struct call_single_data hrtick_csd; | ||
582 | #endif | ||
537 | struct hrtimer hrtick_timer; | 583 | struct hrtimer hrtick_timer; |
538 | #endif | 584 | #endif |
539 | 585 | ||
@@ -559,14 +605,13 @@ struct rq { | |||
559 | /* BKL stats */ | 605 | /* BKL stats */ |
560 | unsigned int bkl_count; | 606 | unsigned int bkl_count; |
561 | #endif | 607 | #endif |
562 | struct lock_class_key rq_lock_key; | ||
563 | }; | 608 | }; |
564 | 609 | ||
565 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 610 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
566 | 611 | ||
567 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) | 612 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) |
568 | { | 613 | { |
569 | rq->curr->sched_class->check_preempt_curr(rq, p); | 614 | rq->curr->sched_class->check_preempt_curr(rq, p, sync); |
570 | } | 615 | } |
571 | 616 | ||
572 | static inline int cpu_of(struct rq *rq) | 617 | static inline int cpu_of(struct rq *rq) |
@@ -607,6 +652,24 @@ static inline void update_rq_clock(struct rq *rq) | |||
607 | # define const_debug static const | 652 | # define const_debug static const |
608 | #endif | 653 | #endif |
609 | 654 | ||
655 | /** | ||
656 | * runqueue_is_locked | ||
657 | * | ||
658 | * Returns true if the current cpu runqueue is locked. | ||
659 | * This interface allows printk to be called with the runqueue lock | ||
660 | * held and know whether or not it is OK to wake up the klogd. | ||
661 | */ | ||
662 | int runqueue_is_locked(void) | ||
663 | { | ||
664 | int cpu = get_cpu(); | ||
665 | struct rq *rq = cpu_rq(cpu); | ||
666 | int ret; | ||
667 | |||
668 | ret = spin_is_locked(&rq->lock); | ||
669 | put_cpu(); | ||
670 | return ret; | ||
671 | } | ||
672 | |||
610 | /* | 673 | /* |
611 | * Debugging: various feature bits | 674 | * Debugging: various feature bits |
612 | */ | 675 | */ |
@@ -749,6 +812,12 @@ late_initcall(sched_init_debug); | |||
749 | const_debug unsigned int sysctl_sched_nr_migrate = 32; | 812 | const_debug unsigned int sysctl_sched_nr_migrate = 32; |
750 | 813 | ||
751 | /* | 814 | /* |
815 | * ratelimit for updating the group shares. | ||
816 | * default: 0.25ms | ||
817 | */ | ||
818 | unsigned int sysctl_sched_shares_ratelimit = 250000; | ||
819 | |||
820 | /* | ||
752 | * period over which we measure -rt task cpu usage in us. | 821 | * period over which we measure -rt task cpu usage in us. |
753 | * default: 1s | 822 | * default: 1s |
754 | */ | 823 | */ |
@@ -769,88 +838,12 @@ static inline u64 global_rt_period(void) | |||
769 | 838 | ||
770 | static inline u64 global_rt_runtime(void) | 839 | static inline u64 global_rt_runtime(void) |
771 | { | 840 | { |
772 | if (sysctl_sched_rt_period < 0) | 841 | if (sysctl_sched_rt_runtime < 0) |
773 | return RUNTIME_INF; | 842 | return RUNTIME_INF; |
774 | 843 | ||
775 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | 844 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; |
776 | } | 845 | } |
777 | 846 | ||
778 | unsigned long long time_sync_thresh = 100000; | ||
779 | |||
780 | static DEFINE_PER_CPU(unsigned long long, time_offset); | ||
781 | static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); | ||
782 | |||
783 | /* | ||
784 | * Global lock which we take every now and then to synchronize | ||
785 | * the CPUs time. This method is not warp-safe, but it's good | ||
786 | * enough to synchronize slowly diverging time sources and thus | ||
787 | * it's good enough for tracing: | ||
788 | */ | ||
789 | static DEFINE_SPINLOCK(time_sync_lock); | ||
790 | static unsigned long long prev_global_time; | ||
791 | |||
792 | static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu) | ||
793 | { | ||
794 | /* | ||
795 | * We want this inlined, to not get tracer function calls | ||
796 | * in this critical section: | ||
797 | */ | ||
798 | spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_); | ||
799 | __raw_spin_lock(&time_sync_lock.raw_lock); | ||
800 | |||
801 | if (time < prev_global_time) { | ||
802 | per_cpu(time_offset, cpu) += prev_global_time - time; | ||
803 | time = prev_global_time; | ||
804 | } else { | ||
805 | prev_global_time = time; | ||
806 | } | ||
807 | |||
808 | __raw_spin_unlock(&time_sync_lock.raw_lock); | ||
809 | spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_); | ||
810 | |||
811 | return time; | ||
812 | } | ||
813 | |||
814 | static unsigned long long __cpu_clock(int cpu) | ||
815 | { | ||
816 | unsigned long long now; | ||
817 | |||
818 | /* | ||
819 | * Only call sched_clock() if the scheduler has already been | ||
820 | * initialized (some code might call cpu_clock() very early): | ||
821 | */ | ||
822 | if (unlikely(!scheduler_running)) | ||
823 | return 0; | ||
824 | |||
825 | now = sched_clock_cpu(cpu); | ||
826 | |||
827 | return now; | ||
828 | } | ||
829 | |||
830 | /* | ||
831 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | ||
832 | * clock constructed from sched_clock(): | ||
833 | */ | ||
834 | unsigned long long cpu_clock(int cpu) | ||
835 | { | ||
836 | unsigned long long prev_cpu_time, time, delta_time; | ||
837 | unsigned long flags; | ||
838 | |||
839 | local_irq_save(flags); | ||
840 | prev_cpu_time = per_cpu(prev_cpu_time, cpu); | ||
841 | time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); | ||
842 | delta_time = time-prev_cpu_time; | ||
843 | |||
844 | if (unlikely(delta_time > time_sync_thresh)) { | ||
845 | time = __sync_cpu_clock(time, cpu); | ||
846 | per_cpu(prev_cpu_time, cpu) = time; | ||
847 | } | ||
848 | local_irq_restore(flags); | ||
849 | |||
850 | return time; | ||
851 | } | ||
852 | EXPORT_SYMBOL_GPL(cpu_clock); | ||
853 | |||
854 | #ifndef prepare_arch_switch | 847 | #ifndef prepare_arch_switch |
855 | # define prepare_arch_switch(next) do { } while (0) | 848 | # define prepare_arch_switch(next) do { } while (0) |
856 | #endif | 849 | #endif |
@@ -996,13 +989,6 @@ static struct rq *this_rq_lock(void) | |||
996 | return rq; | 989 | return rq; |
997 | } | 990 | } |
998 | 991 | ||
999 | static void __resched_task(struct task_struct *p, int tif_bit); | ||
1000 | |||
1001 | static inline void resched_task(struct task_struct *p) | ||
1002 | { | ||
1003 | __resched_task(p, TIF_NEED_RESCHED); | ||
1004 | } | ||
1005 | |||
1006 | #ifdef CONFIG_SCHED_HRTICK | 992 | #ifdef CONFIG_SCHED_HRTICK |
1007 | /* | 993 | /* |
1008 | * Use HR-timers to deliver accurate preemption points. | 994 | * Use HR-timers to deliver accurate preemption points. |
@@ -1014,25 +1000,6 @@ static inline void resched_task(struct task_struct *p) | |||
1014 | * When we get rescheduled we reprogram the hrtick_timer outside of the | 1000 | * When we get rescheduled we reprogram the hrtick_timer outside of the |
1015 | * rq->lock. | 1001 | * rq->lock. |
1016 | */ | 1002 | */ |
1017 | static inline void resched_hrt(struct task_struct *p) | ||
1018 | { | ||
1019 | __resched_task(p, TIF_HRTICK_RESCHED); | ||
1020 | } | ||
1021 | |||
1022 | static inline void resched_rq(struct rq *rq) | ||
1023 | { | ||
1024 | unsigned long flags; | ||
1025 | |||
1026 | spin_lock_irqsave(&rq->lock, flags); | ||
1027 | resched_task(rq->curr); | ||
1028 | spin_unlock_irqrestore(&rq->lock, flags); | ||
1029 | } | ||
1030 | |||
1031 | enum { | ||
1032 | HRTICK_SET, /* re-programm hrtick_timer */ | ||
1033 | HRTICK_RESET, /* not a new slice */ | ||
1034 | HRTICK_BLOCK, /* stop hrtick operations */ | ||
1035 | }; | ||
1036 | 1003 | ||
1037 | /* | 1004 | /* |
1038 | * Use hrtick when: | 1005 | * Use hrtick when: |
@@ -1043,40 +1010,11 @@ static inline int hrtick_enabled(struct rq *rq) | |||
1043 | { | 1010 | { |
1044 | if (!sched_feat(HRTICK)) | 1011 | if (!sched_feat(HRTICK)) |
1045 | return 0; | 1012 | return 0; |
1046 | if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags))) | 1013 | if (!cpu_active(cpu_of(rq))) |
1047 | return 0; | 1014 | return 0; |
1048 | return hrtimer_is_hres_active(&rq->hrtick_timer); | 1015 | return hrtimer_is_hres_active(&rq->hrtick_timer); |
1049 | } | 1016 | } |
1050 | 1017 | ||
1051 | /* | ||
1052 | * Called to set the hrtick timer state. | ||
1053 | * | ||
1054 | * called with rq->lock held and irqs disabled | ||
1055 | */ | ||
1056 | static void hrtick_start(struct rq *rq, u64 delay, int reset) | ||
1057 | { | ||
1058 | assert_spin_locked(&rq->lock); | ||
1059 | |||
1060 | /* | ||
1061 | * preempt at: now + delay | ||
1062 | */ | ||
1063 | rq->hrtick_expire = | ||
1064 | ktime_add_ns(rq->hrtick_timer.base->get_time(), delay); | ||
1065 | /* | ||
1066 | * indicate we need to program the timer | ||
1067 | */ | ||
1068 | __set_bit(HRTICK_SET, &rq->hrtick_flags); | ||
1069 | if (reset) | ||
1070 | __set_bit(HRTICK_RESET, &rq->hrtick_flags); | ||
1071 | |||
1072 | /* | ||
1073 | * New slices are called from the schedule path and don't need a | ||
1074 | * forced reschedule. | ||
1075 | */ | ||
1076 | if (reset) | ||
1077 | resched_hrt(rq->curr); | ||
1078 | } | ||
1079 | |||
1080 | static void hrtick_clear(struct rq *rq) | 1018 | static void hrtick_clear(struct rq *rq) |
1081 | { | 1019 | { |
1082 | if (hrtimer_active(&rq->hrtick_timer)) | 1020 | if (hrtimer_active(&rq->hrtick_timer)) |
@@ -1084,32 +1022,6 @@ static void hrtick_clear(struct rq *rq) | |||
1084 | } | 1022 | } |
1085 | 1023 | ||
1086 | /* | 1024 | /* |
1087 | * Update the timer from the possible pending state. | ||
1088 | */ | ||
1089 | static void hrtick_set(struct rq *rq) | ||
1090 | { | ||
1091 | ktime_t time; | ||
1092 | int set, reset; | ||
1093 | unsigned long flags; | ||
1094 | |||
1095 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); | ||
1096 | |||
1097 | spin_lock_irqsave(&rq->lock, flags); | ||
1098 | set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags); | ||
1099 | reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags); | ||
1100 | time = rq->hrtick_expire; | ||
1101 | clear_thread_flag(TIF_HRTICK_RESCHED); | ||
1102 | spin_unlock_irqrestore(&rq->lock, flags); | ||
1103 | |||
1104 | if (set) { | ||
1105 | hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS); | ||
1106 | if (reset && !hrtimer_active(&rq->hrtick_timer)) | ||
1107 | resched_rq(rq); | ||
1108 | } else | ||
1109 | hrtick_clear(rq); | ||
1110 | } | ||
1111 | |||
1112 | /* | ||
1113 | * High-resolution timer tick. | 1025 | * High-resolution timer tick. |
1114 | * Runs from hardirq context with interrupts disabled. | 1026 | * Runs from hardirq context with interrupts disabled. |
1115 | */ | 1027 | */ |
@@ -1128,27 +1040,37 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) | |||
1128 | } | 1040 | } |
1129 | 1041 | ||
1130 | #ifdef CONFIG_SMP | 1042 | #ifdef CONFIG_SMP |
1131 | static void hotplug_hrtick_disable(int cpu) | 1043 | /* |
1044 | * called from hardirq (IPI) context | ||
1045 | */ | ||
1046 | static void __hrtick_start(void *arg) | ||
1132 | { | 1047 | { |
1133 | struct rq *rq = cpu_rq(cpu); | 1048 | struct rq *rq = arg; |
1134 | unsigned long flags; | ||
1135 | 1049 | ||
1136 | spin_lock_irqsave(&rq->lock, flags); | 1050 | spin_lock(&rq->lock); |
1137 | rq->hrtick_flags = 0; | 1051 | hrtimer_restart(&rq->hrtick_timer); |
1138 | __set_bit(HRTICK_BLOCK, &rq->hrtick_flags); | 1052 | rq->hrtick_csd_pending = 0; |
1139 | spin_unlock_irqrestore(&rq->lock, flags); | 1053 | spin_unlock(&rq->lock); |
1140 | |||
1141 | hrtick_clear(rq); | ||
1142 | } | 1054 | } |
1143 | 1055 | ||
1144 | static void hotplug_hrtick_enable(int cpu) | 1056 | /* |
1057 | * Called to set the hrtick timer state. | ||
1058 | * | ||
1059 | * called with rq->lock held and irqs disabled | ||
1060 | */ | ||
1061 | static void hrtick_start(struct rq *rq, u64 delay) | ||
1145 | { | 1062 | { |
1146 | struct rq *rq = cpu_rq(cpu); | 1063 | struct hrtimer *timer = &rq->hrtick_timer; |
1147 | unsigned long flags; | 1064 | ktime_t time = ktime_add_ns(timer->base->get_time(), delay); |
1148 | 1065 | ||
1149 | spin_lock_irqsave(&rq->lock, flags); | 1066 | timer->expires = time; |
1150 | __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags); | 1067 | |
1151 | spin_unlock_irqrestore(&rq->lock, flags); | 1068 | if (rq == this_rq()) { |
1069 | hrtimer_restart(timer); | ||
1070 | } else if (!rq->hrtick_csd_pending) { | ||
1071 | __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); | ||
1072 | rq->hrtick_csd_pending = 1; | ||
1073 | } | ||
1152 | } | 1074 | } |
1153 | 1075 | ||
1154 | static int | 1076 | static int |
@@ -1163,70 +1085,60 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
1163 | case CPU_DOWN_PREPARE_FROZEN: | 1085 | case CPU_DOWN_PREPARE_FROZEN: |
1164 | case CPU_DEAD: | 1086 | case CPU_DEAD: |
1165 | case CPU_DEAD_FROZEN: | 1087 | case CPU_DEAD_FROZEN: |
1166 | hotplug_hrtick_disable(cpu); | 1088 | hrtick_clear(cpu_rq(cpu)); |
1167 | return NOTIFY_OK; | ||
1168 | |||
1169 | case CPU_UP_PREPARE: | ||
1170 | case CPU_UP_PREPARE_FROZEN: | ||
1171 | case CPU_DOWN_FAILED: | ||
1172 | case CPU_DOWN_FAILED_FROZEN: | ||
1173 | case CPU_ONLINE: | ||
1174 | case CPU_ONLINE_FROZEN: | ||
1175 | hotplug_hrtick_enable(cpu); | ||
1176 | return NOTIFY_OK; | 1089 | return NOTIFY_OK; |
1177 | } | 1090 | } |
1178 | 1091 | ||
1179 | return NOTIFY_DONE; | 1092 | return NOTIFY_DONE; |
1180 | } | 1093 | } |
1181 | 1094 | ||
1182 | static void init_hrtick(void) | 1095 | static __init void init_hrtick(void) |
1183 | { | 1096 | { |
1184 | hotcpu_notifier(hotplug_hrtick, 0); | 1097 | hotcpu_notifier(hotplug_hrtick, 0); |
1185 | } | 1098 | } |
1186 | #endif /* CONFIG_SMP */ | 1099 | #else |
1100 | /* | ||
1101 | * Called to set the hrtick timer state. | ||
1102 | * | ||
1103 | * called with rq->lock held and irqs disabled | ||
1104 | */ | ||
1105 | static void hrtick_start(struct rq *rq, u64 delay) | ||
1106 | { | ||
1107 | hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); | ||
1108 | } | ||
1187 | 1109 | ||
1188 | static void init_rq_hrtick(struct rq *rq) | 1110 | static inline void init_hrtick(void) |
1189 | { | 1111 | { |
1190 | rq->hrtick_flags = 0; | ||
1191 | hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
1192 | rq->hrtick_timer.function = hrtick; | ||
1193 | rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; | ||
1194 | } | 1112 | } |
1113 | #endif /* CONFIG_SMP */ | ||
1195 | 1114 | ||
1196 | void hrtick_resched(void) | 1115 | static void init_rq_hrtick(struct rq *rq) |
1197 | { | 1116 | { |
1198 | struct rq *rq; | 1117 | #ifdef CONFIG_SMP |
1199 | unsigned long flags; | 1118 | rq->hrtick_csd_pending = 0; |
1200 | 1119 | ||
1201 | if (!test_thread_flag(TIF_HRTICK_RESCHED)) | 1120 | rq->hrtick_csd.flags = 0; |
1202 | return; | 1121 | rq->hrtick_csd.func = __hrtick_start; |
1122 | rq->hrtick_csd.info = rq; | ||
1123 | #endif | ||
1203 | 1124 | ||
1204 | local_irq_save(flags); | 1125 | hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
1205 | rq = cpu_rq(smp_processor_id()); | 1126 | rq->hrtick_timer.function = hrtick; |
1206 | hrtick_set(rq); | 1127 | rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; |
1207 | local_irq_restore(flags); | ||
1208 | } | 1128 | } |
1209 | #else | 1129 | #else /* CONFIG_SCHED_HRTICK */ |
1210 | static inline void hrtick_clear(struct rq *rq) | 1130 | static inline void hrtick_clear(struct rq *rq) |
1211 | { | 1131 | { |
1212 | } | 1132 | } |
1213 | 1133 | ||
1214 | static inline void hrtick_set(struct rq *rq) | ||
1215 | { | ||
1216 | } | ||
1217 | |||
1218 | static inline void init_rq_hrtick(struct rq *rq) | 1134 | static inline void init_rq_hrtick(struct rq *rq) |
1219 | { | 1135 | { |
1220 | } | 1136 | } |
1221 | 1137 | ||
1222 | void hrtick_resched(void) | ||
1223 | { | ||
1224 | } | ||
1225 | |||
1226 | static inline void init_hrtick(void) | 1138 | static inline void init_hrtick(void) |
1227 | { | 1139 | { |
1228 | } | 1140 | } |
1229 | #endif | 1141 | #endif /* CONFIG_SCHED_HRTICK */ |
1230 | 1142 | ||
1231 | /* | 1143 | /* |
1232 | * resched_task - mark a task 'to be rescheduled now'. | 1144 | * resched_task - mark a task 'to be rescheduled now'. |
@@ -1241,16 +1153,16 @@ static inline void init_hrtick(void) | |||
1241 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) | 1153 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) |
1242 | #endif | 1154 | #endif |
1243 | 1155 | ||
1244 | static void __resched_task(struct task_struct *p, int tif_bit) | 1156 | static void resched_task(struct task_struct *p) |
1245 | { | 1157 | { |
1246 | int cpu; | 1158 | int cpu; |
1247 | 1159 | ||
1248 | assert_spin_locked(&task_rq(p)->lock); | 1160 | assert_spin_locked(&task_rq(p)->lock); |
1249 | 1161 | ||
1250 | if (unlikely(test_tsk_thread_flag(p, tif_bit))) | 1162 | if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) |
1251 | return; | 1163 | return; |
1252 | 1164 | ||
1253 | set_tsk_thread_flag(p, tif_bit); | 1165 | set_tsk_thread_flag(p, TIF_NEED_RESCHED); |
1254 | 1166 | ||
1255 | cpu = task_cpu(p); | 1167 | cpu = task_cpu(p); |
1256 | if (cpu == smp_processor_id()) | 1168 | if (cpu == smp_processor_id()) |
@@ -1313,15 +1225,15 @@ void wake_up_idle_cpu(int cpu) | |||
1313 | if (!tsk_is_polling(rq->idle)) | 1225 | if (!tsk_is_polling(rq->idle)) |
1314 | smp_send_reschedule(cpu); | 1226 | smp_send_reschedule(cpu); |
1315 | } | 1227 | } |
1316 | #endif | 1228 | #endif /* CONFIG_NO_HZ */ |
1317 | 1229 | ||
1318 | #else | 1230 | #else /* !CONFIG_SMP */ |
1319 | static void __resched_task(struct task_struct *p, int tif_bit) | 1231 | static void resched_task(struct task_struct *p) |
1320 | { | 1232 | { |
1321 | assert_spin_locked(&task_rq(p)->lock); | 1233 | assert_spin_locked(&task_rq(p)->lock); |
1322 | set_tsk_thread_flag(p, tif_bit); | 1234 | set_tsk_need_resched(p); |
1323 | } | 1235 | } |
1324 | #endif | 1236 | #endif /* CONFIG_SMP */ |
1325 | 1237 | ||
1326 | #if BITS_PER_LONG == 32 | 1238 | #if BITS_PER_LONG == 32 |
1327 | # define WMULT_CONST (~0UL) | 1239 | # define WMULT_CONST (~0UL) |
@@ -1336,6 +1248,9 @@ static void __resched_task(struct task_struct *p, int tif_bit) | |||
1336 | */ | 1248 | */ |
1337 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) | 1249 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) |
1338 | 1250 | ||
1251 | /* | ||
1252 | * delta *= weight / lw | ||
1253 | */ | ||
1339 | static unsigned long | 1254 | static unsigned long |
1340 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, | 1255 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, |
1341 | struct load_weight *lw) | 1256 | struct load_weight *lw) |
@@ -1363,12 +1278,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, | |||
1363 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); | 1278 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); |
1364 | } | 1279 | } |
1365 | 1280 | ||
1366 | static inline unsigned long | ||
1367 | calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) | ||
1368 | { | ||
1369 | return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); | ||
1370 | } | ||
1371 | |||
1372 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | 1281 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) |
1373 | { | 1282 | { |
1374 | lw->weight += inc; | 1283 | lw->weight += inc; |
@@ -1476,20 +1385,227 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) | |||
1476 | update_load_sub(&rq->load, load); | 1385 | update_load_sub(&rq->load, load); |
1477 | } | 1386 | } |
1478 | 1387 | ||
1388 | #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) | ||
1389 | typedef int (*tg_visitor)(struct task_group *, void *); | ||
1390 | |||
1391 | /* | ||
1392 | * Iterate the full tree, calling @down when first entering a node and @up when | ||
1393 | * leaving it for the final time. | ||
1394 | */ | ||
1395 | static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) | ||
1396 | { | ||
1397 | struct task_group *parent, *child; | ||
1398 | int ret; | ||
1399 | |||
1400 | rcu_read_lock(); | ||
1401 | parent = &root_task_group; | ||
1402 | down: | ||
1403 | ret = (*down)(parent, data); | ||
1404 | if (ret) | ||
1405 | goto out_unlock; | ||
1406 | list_for_each_entry_rcu(child, &parent->children, siblings) { | ||
1407 | parent = child; | ||
1408 | goto down; | ||
1409 | |||
1410 | up: | ||
1411 | continue; | ||
1412 | } | ||
1413 | ret = (*up)(parent, data); | ||
1414 | if (ret) | ||
1415 | goto out_unlock; | ||
1416 | |||
1417 | child = parent; | ||
1418 | parent = parent->parent; | ||
1419 | if (parent) | ||
1420 | goto up; | ||
1421 | out_unlock: | ||
1422 | rcu_read_unlock(); | ||
1423 | |||
1424 | return ret; | ||
1425 | } | ||
1426 | |||
1427 | static int tg_nop(struct task_group *tg, void *data) | ||
1428 | { | ||
1429 | return 0; | ||
1430 | } | ||
1431 | #endif | ||
1432 | |||
1479 | #ifdef CONFIG_SMP | 1433 | #ifdef CONFIG_SMP |
1480 | static unsigned long source_load(int cpu, int type); | 1434 | static unsigned long source_load(int cpu, int type); |
1481 | static unsigned long target_load(int cpu, int type); | 1435 | static unsigned long target_load(int cpu, int type); |
1482 | static unsigned long cpu_avg_load_per_task(int cpu); | ||
1483 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | 1436 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); |
1484 | #else /* CONFIG_SMP */ | 1437 | |
1438 | static unsigned long cpu_avg_load_per_task(int cpu) | ||
1439 | { | ||
1440 | struct rq *rq = cpu_rq(cpu); | ||
1441 | |||
1442 | if (rq->nr_running) | ||
1443 | rq->avg_load_per_task = rq->load.weight / rq->nr_running; | ||
1444 | |||
1445 | return rq->avg_load_per_task; | ||
1446 | } | ||
1485 | 1447 | ||
1486 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1448 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1487 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | 1449 | |
1450 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | ||
1451 | |||
1452 | /* | ||
1453 | * Calculate and set the cpu's group shares. | ||
1454 | */ | ||
1455 | static void | ||
1456 | __update_group_shares_cpu(struct task_group *tg, int cpu, | ||
1457 | unsigned long sd_shares, unsigned long sd_rq_weight) | ||
1458 | { | ||
1459 | int boost = 0; | ||
1460 | unsigned long shares; | ||
1461 | unsigned long rq_weight; | ||
1462 | |||
1463 | if (!tg->se[cpu]) | ||
1464 | return; | ||
1465 | |||
1466 | rq_weight = tg->cfs_rq[cpu]->load.weight; | ||
1467 | |||
1468 | /* | ||
1469 | * If there are currently no tasks on the cpu pretend there is one of | ||
1470 | * average load so that when a new task gets to run here it will not | ||
1471 | * get delayed by group starvation. | ||
1472 | */ | ||
1473 | if (!rq_weight) { | ||
1474 | boost = 1; | ||
1475 | rq_weight = NICE_0_LOAD; | ||
1476 | } | ||
1477 | |||
1478 | if (unlikely(rq_weight > sd_rq_weight)) | ||
1479 | rq_weight = sd_rq_weight; | ||
1480 | |||
1481 | /* | ||
1482 | * \Sum shares * rq_weight | ||
1483 | * shares = ----------------------- | ||
1484 | * \Sum rq_weight | ||
1485 | * | ||
1486 | */ | ||
1487 | shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); | ||
1488 | |||
1489 | /* | ||
1490 | * record the actual number of shares, not the boosted amount. | ||
1491 | */ | ||
1492 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | ||
1493 | tg->cfs_rq[cpu]->rq_weight = rq_weight; | ||
1494 | |||
1495 | if (shares < MIN_SHARES) | ||
1496 | shares = MIN_SHARES; | ||
1497 | else if (shares > MAX_SHARES) | ||
1498 | shares = MAX_SHARES; | ||
1499 | |||
1500 | __set_se_shares(tg->se[cpu], shares); | ||
1501 | } | ||
1502 | |||
1503 | /* | ||
1504 | * Re-compute the task group their per cpu shares over the given domain. | ||
1505 | * This needs to be done in a bottom-up fashion because the rq weight of a | ||
1506 | * parent group depends on the shares of its child groups. | ||
1507 | */ | ||
1508 | static int tg_shares_up(struct task_group *tg, void *data) | ||
1509 | { | ||
1510 | unsigned long rq_weight = 0; | ||
1511 | unsigned long shares = 0; | ||
1512 | struct sched_domain *sd = data; | ||
1513 | int i; | ||
1514 | |||
1515 | for_each_cpu_mask(i, sd->span) { | ||
1516 | rq_weight += tg->cfs_rq[i]->load.weight; | ||
1517 | shares += tg->cfs_rq[i]->shares; | ||
1518 | } | ||
1519 | |||
1520 | if ((!shares && rq_weight) || shares > tg->shares) | ||
1521 | shares = tg->shares; | ||
1522 | |||
1523 | if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) | ||
1524 | shares = tg->shares; | ||
1525 | |||
1526 | if (!rq_weight) | ||
1527 | rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; | ||
1528 | |||
1529 | for_each_cpu_mask(i, sd->span) { | ||
1530 | struct rq *rq = cpu_rq(i); | ||
1531 | unsigned long flags; | ||
1532 | |||
1533 | spin_lock_irqsave(&rq->lock, flags); | ||
1534 | __update_group_shares_cpu(tg, i, shares, rq_weight); | ||
1535 | spin_unlock_irqrestore(&rq->lock, flags); | ||
1536 | } | ||
1537 | |||
1538 | return 0; | ||
1539 | } | ||
1540 | |||
1541 | /* | ||
1542 | * Compute the cpu's hierarchical load factor for each task group. | ||
1543 | * This needs to be done in a top-down fashion because the load of a child | ||
1544 | * group is a fraction of its parents load. | ||
1545 | */ | ||
1546 | static int tg_load_down(struct task_group *tg, void *data) | ||
1547 | { | ||
1548 | unsigned long load; | ||
1549 | long cpu = (long)data; | ||
1550 | |||
1551 | if (!tg->parent) { | ||
1552 | load = cpu_rq(cpu)->load.weight; | ||
1553 | } else { | ||
1554 | load = tg->parent->cfs_rq[cpu]->h_load; | ||
1555 | load *= tg->cfs_rq[cpu]->shares; | ||
1556 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; | ||
1557 | } | ||
1558 | |||
1559 | tg->cfs_rq[cpu]->h_load = load; | ||
1560 | |||
1561 | return 0; | ||
1562 | } | ||
1563 | |||
1564 | static void update_shares(struct sched_domain *sd) | ||
1565 | { | ||
1566 | u64 now = cpu_clock(raw_smp_processor_id()); | ||
1567 | s64 elapsed = now - sd->last_update; | ||
1568 | |||
1569 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | ||
1570 | sd->last_update = now; | ||
1571 | walk_tg_tree(tg_nop, tg_shares_up, sd); | ||
1572 | } | ||
1573 | } | ||
1574 | |||
1575 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | ||
1576 | { | ||
1577 | spin_unlock(&rq->lock); | ||
1578 | update_shares(sd); | ||
1579 | spin_lock(&rq->lock); | ||
1580 | } | ||
1581 | |||
1582 | static void update_h_load(long cpu) | ||
1583 | { | ||
1584 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | ||
1585 | } | ||
1586 | |||
1587 | #else | ||
1588 | |||
1589 | static inline void update_shares(struct sched_domain *sd) | ||
1590 | { | ||
1591 | } | ||
1592 | |||
1593 | static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) | ||
1488 | { | 1594 | { |
1489 | } | 1595 | } |
1596 | |||
1490 | #endif | 1597 | #endif |
1491 | 1598 | ||
1492 | #endif /* CONFIG_SMP */ | 1599 | #endif |
1600 | |||
1601 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1602 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | ||
1603 | { | ||
1604 | #ifdef CONFIG_SMP | ||
1605 | cfs_rq->shares = shares; | ||
1606 | #endif | ||
1607 | } | ||
1608 | #endif | ||
1493 | 1609 | ||
1494 | #include "sched_stats.h" | 1610 | #include "sched_stats.h" |
1495 | #include "sched_idletask.c" | 1611 | #include "sched_idletask.c" |
@@ -1500,27 +1616,17 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | |||
1500 | #endif | 1616 | #endif |
1501 | 1617 | ||
1502 | #define sched_class_highest (&rt_sched_class) | 1618 | #define sched_class_highest (&rt_sched_class) |
1619 | #define for_each_class(class) \ | ||
1620 | for (class = sched_class_highest; class; class = class->next) | ||
1503 | 1621 | ||
1504 | static inline void inc_load(struct rq *rq, const struct task_struct *p) | 1622 | static void inc_nr_running(struct rq *rq) |
1505 | { | ||
1506 | update_load_add(&rq->load, p->se.load.weight); | ||
1507 | } | ||
1508 | |||
1509 | static inline void dec_load(struct rq *rq, const struct task_struct *p) | ||
1510 | { | ||
1511 | update_load_sub(&rq->load, p->se.load.weight); | ||
1512 | } | ||
1513 | |||
1514 | static void inc_nr_running(struct task_struct *p, struct rq *rq) | ||
1515 | { | 1623 | { |
1516 | rq->nr_running++; | 1624 | rq->nr_running++; |
1517 | inc_load(rq, p); | ||
1518 | } | 1625 | } |
1519 | 1626 | ||
1520 | static void dec_nr_running(struct task_struct *p, struct rq *rq) | 1627 | static void dec_nr_running(struct rq *rq) |
1521 | { | 1628 | { |
1522 | rq->nr_running--; | 1629 | rq->nr_running--; |
1523 | dec_load(rq, p); | ||
1524 | } | 1630 | } |
1525 | 1631 | ||
1526 | static void set_load_weight(struct task_struct *p) | 1632 | static void set_load_weight(struct task_struct *p) |
@@ -1544,6 +1650,12 @@ static void set_load_weight(struct task_struct *p) | |||
1544 | p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; | 1650 | p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; |
1545 | } | 1651 | } |
1546 | 1652 | ||
1653 | static void update_avg(u64 *avg, u64 sample) | ||
1654 | { | ||
1655 | s64 diff = sample - *avg; | ||
1656 | *avg += diff >> 3; | ||
1657 | } | ||
1658 | |||
1547 | static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) | 1659 | static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) |
1548 | { | 1660 | { |
1549 | sched_info_queued(p); | 1661 | sched_info_queued(p); |
@@ -1553,6 +1665,13 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) | |||
1553 | 1665 | ||
1554 | static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) | 1666 | static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) |
1555 | { | 1667 | { |
1668 | if (sleep && p->se.last_wakeup) { | ||
1669 | update_avg(&p->se.avg_overlap, | ||
1670 | p->se.sum_exec_runtime - p->se.last_wakeup); | ||
1671 | p->se.last_wakeup = 0; | ||
1672 | } | ||
1673 | |||
1674 | sched_info_dequeued(p); | ||
1556 | p->sched_class->dequeue_task(rq, p, sleep); | 1675 | p->sched_class->dequeue_task(rq, p, sleep); |
1557 | p->se.on_rq = 0; | 1676 | p->se.on_rq = 0; |
1558 | } | 1677 | } |
@@ -1612,7 +1731,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | |||
1612 | rq->nr_uninterruptible--; | 1731 | rq->nr_uninterruptible--; |
1613 | 1732 | ||
1614 | enqueue_task(rq, p, wakeup); | 1733 | enqueue_task(rq, p, wakeup); |
1615 | inc_nr_running(p, rq); | 1734 | inc_nr_running(rq); |
1616 | } | 1735 | } |
1617 | 1736 | ||
1618 | /* | 1737 | /* |
@@ -1624,7 +1743,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | |||
1624 | rq->nr_uninterruptible++; | 1743 | rq->nr_uninterruptible++; |
1625 | 1744 | ||
1626 | dequeue_task(rq, p, sleep); | 1745 | dequeue_task(rq, p, sleep); |
1627 | dec_nr_running(p, rq); | 1746 | dec_nr_running(rq); |
1628 | } | 1747 | } |
1629 | 1748 | ||
1630 | /** | 1749 | /** |
@@ -1636,12 +1755,6 @@ inline int task_curr(const struct task_struct *p) | |||
1636 | return cpu_curr(task_cpu(p)) == p; | 1755 | return cpu_curr(task_cpu(p)) == p; |
1637 | } | 1756 | } |
1638 | 1757 | ||
1639 | /* Used instead of source_load when we know the type == 0 */ | ||
1640 | unsigned long weighted_cpuload(const int cpu) | ||
1641 | { | ||
1642 | return cpu_rq(cpu)->load.weight; | ||
1643 | } | ||
1644 | |||
1645 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | 1758 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) |
1646 | { | 1759 | { |
1647 | set_task_rq(p, cpu); | 1760 | set_task_rq(p, cpu); |
@@ -1670,6 +1783,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
1670 | 1783 | ||
1671 | #ifdef CONFIG_SMP | 1784 | #ifdef CONFIG_SMP |
1672 | 1785 | ||
1786 | /* Used instead of source_load when we know the type == 0 */ | ||
1787 | static unsigned long weighted_cpuload(const int cpu) | ||
1788 | { | ||
1789 | return cpu_rq(cpu)->load.weight; | ||
1790 | } | ||
1791 | |||
1673 | /* | 1792 | /* |
1674 | * Is this task likely cache-hot: | 1793 | * Is this task likely cache-hot: |
1675 | */ | 1794 | */ |
@@ -1765,16 +1884,24 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) | |||
1765 | /* | 1884 | /* |
1766 | * wait_task_inactive - wait for a thread to unschedule. | 1885 | * wait_task_inactive - wait for a thread to unschedule. |
1767 | * | 1886 | * |
1887 | * If @match_state is nonzero, it's the @p->state value just checked and | ||
1888 | * not expected to change. If it changes, i.e. @p might have woken up, | ||
1889 | * then return zero. When we succeed in waiting for @p to be off its CPU, | ||
1890 | * we return a positive number (its total switch count). If a second call | ||
1891 | * a short while later returns the same number, the caller can be sure that | ||
1892 | * @p has remained unscheduled the whole time. | ||
1893 | * | ||
1768 | * The caller must ensure that the task *will* unschedule sometime soon, | 1894 | * The caller must ensure that the task *will* unschedule sometime soon, |
1769 | * else this function might spin for a *long* time. This function can't | 1895 | * else this function might spin for a *long* time. This function can't |
1770 | * be called with interrupts off, or it may introduce deadlock with | 1896 | * be called with interrupts off, or it may introduce deadlock with |
1771 | * smp_call_function() if an IPI is sent by the same process we are | 1897 | * smp_call_function() if an IPI is sent by the same process we are |
1772 | * waiting to become inactive. | 1898 | * waiting to become inactive. |
1773 | */ | 1899 | */ |
1774 | void wait_task_inactive(struct task_struct *p) | 1900 | unsigned long wait_task_inactive(struct task_struct *p, long match_state) |
1775 | { | 1901 | { |
1776 | unsigned long flags; | 1902 | unsigned long flags; |
1777 | int running, on_rq; | 1903 | int running, on_rq; |
1904 | unsigned long ncsw; | ||
1778 | struct rq *rq; | 1905 | struct rq *rq; |
1779 | 1906 | ||
1780 | for (;;) { | 1907 | for (;;) { |
@@ -1797,8 +1924,11 @@ void wait_task_inactive(struct task_struct *p) | |||
1797 | * return false if the runqueue has changed and p | 1924 | * return false if the runqueue has changed and p |
1798 | * is actually now running somewhere else! | 1925 | * is actually now running somewhere else! |
1799 | */ | 1926 | */ |
1800 | while (task_running(rq, p)) | 1927 | while (task_running(rq, p)) { |
1928 | if (match_state && unlikely(p->state != match_state)) | ||
1929 | return 0; | ||
1801 | cpu_relax(); | 1930 | cpu_relax(); |
1931 | } | ||
1802 | 1932 | ||
1803 | /* | 1933 | /* |
1804 | * Ok, time to look more closely! We need the rq | 1934 | * Ok, time to look more closely! We need the rq |
@@ -1808,9 +1938,18 @@ void wait_task_inactive(struct task_struct *p) | |||
1808 | rq = task_rq_lock(p, &flags); | 1938 | rq = task_rq_lock(p, &flags); |
1809 | running = task_running(rq, p); | 1939 | running = task_running(rq, p); |
1810 | on_rq = p->se.on_rq; | 1940 | on_rq = p->se.on_rq; |
1941 | ncsw = 0; | ||
1942 | if (!match_state || p->state == match_state) | ||
1943 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ | ||
1811 | task_rq_unlock(rq, &flags); | 1944 | task_rq_unlock(rq, &flags); |
1812 | 1945 | ||
1813 | /* | 1946 | /* |
1947 | * If it changed from the expected state, bail out now. | ||
1948 | */ | ||
1949 | if (unlikely(!ncsw)) | ||
1950 | break; | ||
1951 | |||
1952 | /* | ||
1814 | * Was it really running after all now that we | 1953 | * Was it really running after all now that we |
1815 | * checked with the proper locks actually held? | 1954 | * checked with the proper locks actually held? |
1816 | * | 1955 | * |
@@ -1842,6 +1981,8 @@ void wait_task_inactive(struct task_struct *p) | |||
1842 | */ | 1981 | */ |
1843 | break; | 1982 | break; |
1844 | } | 1983 | } |
1984 | |||
1985 | return ncsw; | ||
1845 | } | 1986 | } |
1846 | 1987 | ||
1847 | /*** | 1988 | /*** |
@@ -1880,7 +2021,7 @@ static unsigned long source_load(int cpu, int type) | |||
1880 | struct rq *rq = cpu_rq(cpu); | 2021 | struct rq *rq = cpu_rq(cpu); |
1881 | unsigned long total = weighted_cpuload(cpu); | 2022 | unsigned long total = weighted_cpuload(cpu); |
1882 | 2023 | ||
1883 | if (type == 0) | 2024 | if (type == 0 || !sched_feat(LB_BIAS)) |
1884 | return total; | 2025 | return total; |
1885 | 2026 | ||
1886 | return min(rq->cpu_load[type-1], total); | 2027 | return min(rq->cpu_load[type-1], total); |
@@ -1895,25 +2036,13 @@ static unsigned long target_load(int cpu, int type) | |||
1895 | struct rq *rq = cpu_rq(cpu); | 2036 | struct rq *rq = cpu_rq(cpu); |
1896 | unsigned long total = weighted_cpuload(cpu); | 2037 | unsigned long total = weighted_cpuload(cpu); |
1897 | 2038 | ||
1898 | if (type == 0) | 2039 | if (type == 0 || !sched_feat(LB_BIAS)) |
1899 | return total; | 2040 | return total; |
1900 | 2041 | ||
1901 | return max(rq->cpu_load[type-1], total); | 2042 | return max(rq->cpu_load[type-1], total); |
1902 | } | 2043 | } |
1903 | 2044 | ||
1904 | /* | 2045 | /* |
1905 | * Return the average load per task on the cpu's run queue | ||
1906 | */ | ||
1907 | static unsigned long cpu_avg_load_per_task(int cpu) | ||
1908 | { | ||
1909 | struct rq *rq = cpu_rq(cpu); | ||
1910 | unsigned long total = weighted_cpuload(cpu); | ||
1911 | unsigned long n = rq->nr_running; | ||
1912 | |||
1913 | return n ? total / n : SCHED_LOAD_SCALE; | ||
1914 | } | ||
1915 | |||
1916 | /* | ||
1917 | * find_idlest_group finds and returns the least busy CPU group within the | 2046 | * find_idlest_group finds and returns the least busy CPU group within the |
1918 | * domain. | 2047 | * domain. |
1919 | */ | 2048 | */ |
@@ -1939,7 +2068,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | |||
1939 | /* Tally up the load of all CPUs in the group */ | 2068 | /* Tally up the load of all CPUs in the group */ |
1940 | avg_load = 0; | 2069 | avg_load = 0; |
1941 | 2070 | ||
1942 | for_each_cpu_mask(i, group->cpumask) { | 2071 | for_each_cpu_mask_nr(i, group->cpumask) { |
1943 | /* Bias balancing toward cpus of our domain */ | 2072 | /* Bias balancing toward cpus of our domain */ |
1944 | if (local_group) | 2073 | if (local_group) |
1945 | load = source_load(i, load_idx); | 2074 | load = source_load(i, load_idx); |
@@ -1981,7 +2110,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, | |||
1981 | /* Traverse only the allowed CPUs */ | 2110 | /* Traverse only the allowed CPUs */ |
1982 | cpus_and(*tmp, group->cpumask, p->cpus_allowed); | 2111 | cpus_and(*tmp, group->cpumask, p->cpus_allowed); |
1983 | 2112 | ||
1984 | for_each_cpu_mask(i, *tmp) { | 2113 | for_each_cpu_mask_nr(i, *tmp) { |
1985 | load = weighted_cpuload(i); | 2114 | load = weighted_cpuload(i); |
1986 | 2115 | ||
1987 | if (load < min_load || (load == min_load && i == this_cpu)) { | 2116 | if (load < min_load || (load == min_load && i == this_cpu)) { |
@@ -2019,6 +2148,9 @@ static int sched_balance_self(int cpu, int flag) | |||
2019 | sd = tmp; | 2148 | sd = tmp; |
2020 | } | 2149 | } |
2021 | 2150 | ||
2151 | if (sd) | ||
2152 | update_shares(sd); | ||
2153 | |||
2022 | while (sd) { | 2154 | while (sd) { |
2023 | cpumask_t span, tmpmask; | 2155 | cpumask_t span, tmpmask; |
2024 | struct sched_group *group; | 2156 | struct sched_group *group; |
@@ -2085,6 +2217,22 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
2085 | if (!sched_feat(SYNC_WAKEUPS)) | 2217 | if (!sched_feat(SYNC_WAKEUPS)) |
2086 | sync = 0; | 2218 | sync = 0; |
2087 | 2219 | ||
2220 | #ifdef CONFIG_SMP | ||
2221 | if (sched_feat(LB_WAKEUP_UPDATE)) { | ||
2222 | struct sched_domain *sd; | ||
2223 | |||
2224 | this_cpu = raw_smp_processor_id(); | ||
2225 | cpu = task_cpu(p); | ||
2226 | |||
2227 | for_each_domain(this_cpu, sd) { | ||
2228 | if (cpu_isset(cpu, sd->span)) { | ||
2229 | update_shares(sd); | ||
2230 | break; | ||
2231 | } | ||
2232 | } | ||
2233 | } | ||
2234 | #endif | ||
2235 | |||
2088 | smp_wmb(); | 2236 | smp_wmb(); |
2089 | rq = task_rq_lock(p, &flags); | 2237 | rq = task_rq_lock(p, &flags); |
2090 | old_state = p->state; | 2238 | old_state = p->state; |
@@ -2131,7 +2279,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
2131 | } | 2279 | } |
2132 | } | 2280 | } |
2133 | } | 2281 | } |
2134 | #endif | 2282 | #endif /* CONFIG_SCHEDSTATS */ |
2135 | 2283 | ||
2136 | out_activate: | 2284 | out_activate: |
2137 | #endif /* CONFIG_SMP */ | 2285 | #endif /* CONFIG_SMP */ |
@@ -2149,7 +2297,10 @@ out_activate: | |||
2149 | success = 1; | 2297 | success = 1; |
2150 | 2298 | ||
2151 | out_running: | 2299 | out_running: |
2152 | check_preempt_curr(rq, p); | 2300 | trace_mark(kernel_sched_wakeup, |
2301 | "pid %d state %ld ## rq %p task %p rq->curr %p", | ||
2302 | p->pid, p->state, rq, p, rq->curr); | ||
2303 | check_preempt_curr(rq, p, sync); | ||
2153 | 2304 | ||
2154 | p->state = TASK_RUNNING; | 2305 | p->state = TASK_RUNNING; |
2155 | #ifdef CONFIG_SMP | 2306 | #ifdef CONFIG_SMP |
@@ -2157,6 +2308,8 @@ out_running: | |||
2157 | p->sched_class->task_wake_up(rq, p); | 2308 | p->sched_class->task_wake_up(rq, p); |
2158 | #endif | 2309 | #endif |
2159 | out: | 2310 | out: |
2311 | current->se.last_wakeup = current->se.sum_exec_runtime; | ||
2312 | |||
2160 | task_rq_unlock(rq, &flags); | 2313 | task_rq_unlock(rq, &flags); |
2161 | 2314 | ||
2162 | return success; | 2315 | return success; |
@@ -2277,9 +2430,12 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
2277 | * management (if any): | 2430 | * management (if any): |
2278 | */ | 2431 | */ |
2279 | p->sched_class->task_new(rq, p); | 2432 | p->sched_class->task_new(rq, p); |
2280 | inc_nr_running(p, rq); | 2433 | inc_nr_running(rq); |
2281 | } | 2434 | } |
2282 | check_preempt_curr(rq, p); | 2435 | trace_mark(kernel_sched_wakeup_new, |
2436 | "pid %d state %ld ## rq %p task %p rq->curr %p", | ||
2437 | p->pid, p->state, rq, p, rq->curr); | ||
2438 | check_preempt_curr(rq, p, 0); | ||
2283 | #ifdef CONFIG_SMP | 2439 | #ifdef CONFIG_SMP |
2284 | if (p->sched_class->task_wake_up) | 2440 | if (p->sched_class->task_wake_up) |
2285 | p->sched_class->task_wake_up(rq, p); | 2441 | p->sched_class->task_wake_up(rq, p); |
@@ -2331,7 +2487,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, | |||
2331 | notifier->ops->sched_out(notifier, next); | 2487 | notifier->ops->sched_out(notifier, next); |
2332 | } | 2488 | } |
2333 | 2489 | ||
2334 | #else | 2490 | #else /* !CONFIG_PREEMPT_NOTIFIERS */ |
2335 | 2491 | ||
2336 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) | 2492 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) |
2337 | { | 2493 | { |
@@ -2343,7 +2499,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, | |||
2343 | { | 2499 | { |
2344 | } | 2500 | } |
2345 | 2501 | ||
2346 | #endif | 2502 | #endif /* CONFIG_PREEMPT_NOTIFIERS */ |
2347 | 2503 | ||
2348 | /** | 2504 | /** |
2349 | * prepare_task_switch - prepare to switch tasks | 2505 | * prepare_task_switch - prepare to switch tasks |
@@ -2451,6 +2607,11 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2451 | struct mm_struct *mm, *oldmm; | 2607 | struct mm_struct *mm, *oldmm; |
2452 | 2608 | ||
2453 | prepare_task_switch(rq, prev, next); | 2609 | prepare_task_switch(rq, prev, next); |
2610 | trace_mark(kernel_sched_schedule, | ||
2611 | "prev_pid %d next_pid %d prev_state %ld " | ||
2612 | "## rq %p prev %p next %p", | ||
2613 | prev->pid, next->pid, prev->state, | ||
2614 | rq, prev, next); | ||
2454 | mm = next->mm; | 2615 | mm = next->mm; |
2455 | oldmm = prev->active_mm; | 2616 | oldmm = prev->active_mm; |
2456 | /* | 2617 | /* |
@@ -2612,10 +2773,10 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) | |||
2612 | } else { | 2773 | } else { |
2613 | if (rq1 < rq2) { | 2774 | if (rq1 < rq2) { |
2614 | spin_lock(&rq1->lock); | 2775 | spin_lock(&rq1->lock); |
2615 | spin_lock(&rq2->lock); | 2776 | spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); |
2616 | } else { | 2777 | } else { |
2617 | spin_lock(&rq2->lock); | 2778 | spin_lock(&rq2->lock); |
2618 | spin_lock(&rq1->lock); | 2779 | spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); |
2619 | } | 2780 | } |
2620 | } | 2781 | } |
2621 | update_rq_clock(rq1); | 2782 | update_rq_clock(rq1); |
@@ -2658,14 +2819,21 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) | |||
2658 | if (busiest < this_rq) { | 2819 | if (busiest < this_rq) { |
2659 | spin_unlock(&this_rq->lock); | 2820 | spin_unlock(&this_rq->lock); |
2660 | spin_lock(&busiest->lock); | 2821 | spin_lock(&busiest->lock); |
2661 | spin_lock(&this_rq->lock); | 2822 | spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); |
2662 | ret = 1; | 2823 | ret = 1; |
2663 | } else | 2824 | } else |
2664 | spin_lock(&busiest->lock); | 2825 | spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); |
2665 | } | 2826 | } |
2666 | return ret; | 2827 | return ret; |
2667 | } | 2828 | } |
2668 | 2829 | ||
2830 | static void double_unlock_balance(struct rq *this_rq, struct rq *busiest) | ||
2831 | __releases(busiest->lock) | ||
2832 | { | ||
2833 | spin_unlock(&busiest->lock); | ||
2834 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); | ||
2835 | } | ||
2836 | |||
2669 | /* | 2837 | /* |
2670 | * If dest_cpu is allowed for this process, migrate the task to it. | 2838 | * If dest_cpu is allowed for this process, migrate the task to it. |
2671 | * This is accomplished by forcing the cpu_allowed mask to only | 2839 | * This is accomplished by forcing the cpu_allowed mask to only |
@@ -2680,7 +2848,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) | |||
2680 | 2848 | ||
2681 | rq = task_rq_lock(p, &flags); | 2849 | rq = task_rq_lock(p, &flags); |
2682 | if (!cpu_isset(dest_cpu, p->cpus_allowed) | 2850 | if (!cpu_isset(dest_cpu, p->cpus_allowed) |
2683 | || unlikely(cpu_is_offline(dest_cpu))) | 2851 | || unlikely(!cpu_active(dest_cpu))) |
2684 | goto out; | 2852 | goto out; |
2685 | 2853 | ||
2686 | /* force the process onto the specified CPU */ | 2854 | /* force the process onto the specified CPU */ |
@@ -2727,7 +2895,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, | |||
2727 | * Note that idle threads have a prio of MAX_PRIO, for this test | 2895 | * Note that idle threads have a prio of MAX_PRIO, for this test |
2728 | * to be always true for them. | 2896 | * to be always true for them. |
2729 | */ | 2897 | */ |
2730 | check_preempt_curr(this_rq, p); | 2898 | check_preempt_curr(this_rq, p, 0); |
2731 | } | 2899 | } |
2732 | 2900 | ||
2733 | /* | 2901 | /* |
@@ -2785,7 +2953,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2785 | enum cpu_idle_type idle, int *all_pinned, | 2953 | enum cpu_idle_type idle, int *all_pinned, |
2786 | int *this_best_prio, struct rq_iterator *iterator) | 2954 | int *this_best_prio, struct rq_iterator *iterator) |
2787 | { | 2955 | { |
2788 | int loops = 0, pulled = 0, pinned = 0, skip_for_load; | 2956 | int loops = 0, pulled = 0, pinned = 0; |
2789 | struct task_struct *p; | 2957 | struct task_struct *p; |
2790 | long rem_load_move = max_load_move; | 2958 | long rem_load_move = max_load_move; |
2791 | 2959 | ||
@@ -2801,14 +2969,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2801 | next: | 2969 | next: |
2802 | if (!p || loops++ > sysctl_sched_nr_migrate) | 2970 | if (!p || loops++ > sysctl_sched_nr_migrate) |
2803 | goto out; | 2971 | goto out; |
2804 | /* | 2972 | |
2805 | * To help distribute high priority tasks across CPUs we don't | 2973 | if ((p->se.load.weight >> 1) > rem_load_move || |
2806 | * skip a task if it will be the highest priority task (i.e. smallest | ||
2807 | * prio value) on its new queue regardless of its load weight | ||
2808 | */ | ||
2809 | skip_for_load = (p->se.load.weight >> 1) > rem_load_move + | ||
2810 | SCHED_LOAD_SCALE_FUZZ; | ||
2811 | if ((skip_for_load && p->prio >= *this_best_prio) || | ||
2812 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { | 2974 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { |
2813 | p = iterator->next(iterator->arg); | 2975 | p = iterator->next(iterator->arg); |
2814 | goto next; | 2976 | goto next; |
@@ -2863,6 +3025,10 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2863 | max_load_move - total_load_moved, | 3025 | max_load_move - total_load_moved, |
2864 | sd, idle, all_pinned, &this_best_prio); | 3026 | sd, idle, all_pinned, &this_best_prio); |
2865 | class = class->next; | 3027 | class = class->next; |
3028 | |||
3029 | if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) | ||
3030 | break; | ||
3031 | |||
2866 | } while (class && max_load_move > total_load_moved); | 3032 | } while (class && max_load_move > total_load_moved); |
2867 | 3033 | ||
2868 | return total_load_moved > 0; | 3034 | return total_load_moved > 0; |
@@ -2939,6 +3105,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2939 | max_load = this_load = total_load = total_pwr = 0; | 3105 | max_load = this_load = total_load = total_pwr = 0; |
2940 | busiest_load_per_task = busiest_nr_running = 0; | 3106 | busiest_load_per_task = busiest_nr_running = 0; |
2941 | this_load_per_task = this_nr_running = 0; | 3107 | this_load_per_task = this_nr_running = 0; |
3108 | |||
2942 | if (idle == CPU_NOT_IDLE) | 3109 | if (idle == CPU_NOT_IDLE) |
2943 | load_idx = sd->busy_idx; | 3110 | load_idx = sd->busy_idx; |
2944 | else if (idle == CPU_NEWLY_IDLE) | 3111 | else if (idle == CPU_NEWLY_IDLE) |
@@ -2953,6 +3120,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2953 | int __group_imb = 0; | 3120 | int __group_imb = 0; |
2954 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | 3121 | unsigned int balance_cpu = -1, first_idle_cpu = 0; |
2955 | unsigned long sum_nr_running, sum_weighted_load; | 3122 | unsigned long sum_nr_running, sum_weighted_load; |
3123 | unsigned long sum_avg_load_per_task; | ||
3124 | unsigned long avg_load_per_task; | ||
2956 | 3125 | ||
2957 | local_group = cpu_isset(this_cpu, group->cpumask); | 3126 | local_group = cpu_isset(this_cpu, group->cpumask); |
2958 | 3127 | ||
@@ -2961,10 +3130,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2961 | 3130 | ||
2962 | /* Tally up the load of all CPUs in the group */ | 3131 | /* Tally up the load of all CPUs in the group */ |
2963 | sum_weighted_load = sum_nr_running = avg_load = 0; | 3132 | sum_weighted_load = sum_nr_running = avg_load = 0; |
3133 | sum_avg_load_per_task = avg_load_per_task = 0; | ||
3134 | |||
2964 | max_cpu_load = 0; | 3135 | max_cpu_load = 0; |
2965 | min_cpu_load = ~0UL; | 3136 | min_cpu_load = ~0UL; |
2966 | 3137 | ||
2967 | for_each_cpu_mask(i, group->cpumask) { | 3138 | for_each_cpu_mask_nr(i, group->cpumask) { |
2968 | struct rq *rq; | 3139 | struct rq *rq; |
2969 | 3140 | ||
2970 | if (!cpu_isset(i, *cpus)) | 3141 | if (!cpu_isset(i, *cpus)) |
@@ -2994,6 +3165,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2994 | avg_load += load; | 3165 | avg_load += load; |
2995 | sum_nr_running += rq->nr_running; | 3166 | sum_nr_running += rq->nr_running; |
2996 | sum_weighted_load += weighted_cpuload(i); | 3167 | sum_weighted_load += weighted_cpuload(i); |
3168 | |||
3169 | sum_avg_load_per_task += cpu_avg_load_per_task(i); | ||
2997 | } | 3170 | } |
2998 | 3171 | ||
2999 | /* | 3172 | /* |
@@ -3015,7 +3188,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
3015 | avg_load = sg_div_cpu_power(group, | 3188 | avg_load = sg_div_cpu_power(group, |
3016 | avg_load * SCHED_LOAD_SCALE); | 3189 | avg_load * SCHED_LOAD_SCALE); |
3017 | 3190 | ||
3018 | if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE) | 3191 | |
3192 | /* | ||
3193 | * Consider the group unbalanced when the imbalance is larger | ||
3194 | * than the average weight of two tasks. | ||
3195 | * | ||
3196 | * APZ: with cgroup the avg task weight can vary wildly and | ||
3197 | * might not be a suitable number - should we keep a | ||
3198 | * normalized nr_running number somewhere that negates | ||
3199 | * the hierarchy? | ||
3200 | */ | ||
3201 | avg_load_per_task = sg_div_cpu_power(group, | ||
3202 | sum_avg_load_per_task * SCHED_LOAD_SCALE); | ||
3203 | |||
3204 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | ||
3019 | __group_imb = 1; | 3205 | __group_imb = 1; |
3020 | 3206 | ||
3021 | group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; | 3207 | group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; |
@@ -3156,9 +3342,9 @@ small_imbalance: | |||
3156 | if (busiest_load_per_task > this_load_per_task) | 3342 | if (busiest_load_per_task > this_load_per_task) |
3157 | imbn = 1; | 3343 | imbn = 1; |
3158 | } else | 3344 | } else |
3159 | this_load_per_task = SCHED_LOAD_SCALE; | 3345 | this_load_per_task = cpu_avg_load_per_task(this_cpu); |
3160 | 3346 | ||
3161 | if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= | 3347 | if (max_load - this_load + 2*busiest_load_per_task >= |
3162 | busiest_load_per_task * imbn) { | 3348 | busiest_load_per_task * imbn) { |
3163 | *imbalance = busiest_load_per_task; | 3349 | *imbalance = busiest_load_per_task; |
3164 | return busiest; | 3350 | return busiest; |
@@ -3228,7 +3414,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | |||
3228 | unsigned long max_load = 0; | 3414 | unsigned long max_load = 0; |
3229 | int i; | 3415 | int i; |
3230 | 3416 | ||
3231 | for_each_cpu_mask(i, group->cpumask) { | 3417 | for_each_cpu_mask_nr(i, group->cpumask) { |
3232 | unsigned long wl; | 3418 | unsigned long wl; |
3233 | 3419 | ||
3234 | if (!cpu_isset(i, *cpus)) | 3420 | if (!cpu_isset(i, *cpus)) |
@@ -3284,6 +3470,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
3284 | schedstat_inc(sd, lb_count[idle]); | 3470 | schedstat_inc(sd, lb_count[idle]); |
3285 | 3471 | ||
3286 | redo: | 3472 | redo: |
3473 | update_shares(sd); | ||
3287 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 3474 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, |
3288 | cpus, balance); | 3475 | cpus, balance); |
3289 | 3476 | ||
@@ -3386,8 +3573,9 @@ redo: | |||
3386 | 3573 | ||
3387 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 3574 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
3388 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 3575 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
3389 | return -1; | 3576 | ld_moved = -1; |
3390 | return ld_moved; | 3577 | |
3578 | goto out; | ||
3391 | 3579 | ||
3392 | out_balanced: | 3580 | out_balanced: |
3393 | schedstat_inc(sd, lb_balanced[idle]); | 3581 | schedstat_inc(sd, lb_balanced[idle]); |
@@ -3402,8 +3590,13 @@ out_one_pinned: | |||
3402 | 3590 | ||
3403 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 3591 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
3404 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 3592 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
3405 | return -1; | 3593 | ld_moved = -1; |
3406 | return 0; | 3594 | else |
3595 | ld_moved = 0; | ||
3596 | out: | ||
3597 | if (ld_moved) | ||
3598 | update_shares(sd); | ||
3599 | return ld_moved; | ||
3407 | } | 3600 | } |
3408 | 3601 | ||
3409 | /* | 3602 | /* |
@@ -3438,6 +3631,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, | |||
3438 | 3631 | ||
3439 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); | 3632 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); |
3440 | redo: | 3633 | redo: |
3634 | update_shares_locked(this_rq, sd); | ||
3441 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, | 3635 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, |
3442 | &sd_idle, cpus, NULL); | 3636 | &sd_idle, cpus, NULL); |
3443 | if (!group) { | 3637 | if (!group) { |
@@ -3464,7 +3658,7 @@ redo: | |||
3464 | ld_moved = move_tasks(this_rq, this_cpu, busiest, | 3658 | ld_moved = move_tasks(this_rq, this_cpu, busiest, |
3465 | imbalance, sd, CPU_NEWLY_IDLE, | 3659 | imbalance, sd, CPU_NEWLY_IDLE, |
3466 | &all_pinned); | 3660 | &all_pinned); |
3467 | spin_unlock(&busiest->lock); | 3661 | double_unlock_balance(this_rq, busiest); |
3468 | 3662 | ||
3469 | if (unlikely(all_pinned)) { | 3663 | if (unlikely(all_pinned)) { |
3470 | cpu_clear(cpu_of(busiest), *cpus); | 3664 | cpu_clear(cpu_of(busiest), *cpus); |
@@ -3481,6 +3675,7 @@ redo: | |||
3481 | } else | 3675 | } else |
3482 | sd->nr_balance_failed = 0; | 3676 | sd->nr_balance_failed = 0; |
3483 | 3677 | ||
3678 | update_shares_locked(this_rq, sd); | ||
3484 | return ld_moved; | 3679 | return ld_moved; |
3485 | 3680 | ||
3486 | out_balanced: | 3681 | out_balanced: |
@@ -3578,7 +3773,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
3578 | else | 3773 | else |
3579 | schedstat_inc(sd, alb_failed); | 3774 | schedstat_inc(sd, alb_failed); |
3580 | } | 3775 | } |
3581 | spin_unlock(&target_rq->lock); | 3776 | double_unlock_balance(busiest_rq, target_rq); |
3582 | } | 3777 | } |
3583 | 3778 | ||
3584 | #ifdef CONFIG_NO_HZ | 3779 | #ifdef CONFIG_NO_HZ |
@@ -3621,7 +3816,7 @@ int select_nohz_load_balancer(int stop_tick) | |||
3621 | /* | 3816 | /* |
3622 | * If we are going offline and still the leader, give up! | 3817 | * If we are going offline and still the leader, give up! |
3623 | */ | 3818 | */ |
3624 | if (cpu_is_offline(cpu) && | 3819 | if (!cpu_active(cpu) && |
3625 | atomic_read(&nohz.load_balancer) == cpu) { | 3820 | atomic_read(&nohz.load_balancer) == cpu) { |
3626 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | 3821 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) |
3627 | BUG(); | 3822 | BUG(); |
@@ -3672,6 +3867,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3672 | /* Earliest time when we have to do rebalance again */ | 3867 | /* Earliest time when we have to do rebalance again */ |
3673 | unsigned long next_balance = jiffies + 60*HZ; | 3868 | unsigned long next_balance = jiffies + 60*HZ; |
3674 | int update_next_balance = 0; | 3869 | int update_next_balance = 0; |
3870 | int need_serialize; | ||
3675 | cpumask_t tmp; | 3871 | cpumask_t tmp; |
3676 | 3872 | ||
3677 | for_each_domain(cpu, sd) { | 3873 | for_each_domain(cpu, sd) { |
@@ -3689,8 +3885,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3689 | if (interval > HZ*NR_CPUS/10) | 3885 | if (interval > HZ*NR_CPUS/10) |
3690 | interval = HZ*NR_CPUS/10; | 3886 | interval = HZ*NR_CPUS/10; |
3691 | 3887 | ||
3888 | need_serialize = sd->flags & SD_SERIALIZE; | ||
3692 | 3889 | ||
3693 | if (sd->flags & SD_SERIALIZE) { | 3890 | if (need_serialize) { |
3694 | if (!spin_trylock(&balancing)) | 3891 | if (!spin_trylock(&balancing)) |
3695 | goto out; | 3892 | goto out; |
3696 | } | 3893 | } |
@@ -3706,7 +3903,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3706 | } | 3903 | } |
3707 | sd->last_balance = jiffies; | 3904 | sd->last_balance = jiffies; |
3708 | } | 3905 | } |
3709 | if (sd->flags & SD_SERIALIZE) | 3906 | if (need_serialize) |
3710 | spin_unlock(&balancing); | 3907 | spin_unlock(&balancing); |
3711 | out: | 3908 | out: |
3712 | if (time_after(next_balance, sd->last_balance + interval)) { | 3909 | if (time_after(next_balance, sd->last_balance + interval)) { |
@@ -3759,7 +3956,7 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
3759 | int balance_cpu; | 3956 | int balance_cpu; |
3760 | 3957 | ||
3761 | cpu_clear(this_cpu, cpus); | 3958 | cpu_clear(this_cpu, cpus); |
3762 | for_each_cpu_mask(balance_cpu, cpus) { | 3959 | for_each_cpu_mask_nr(balance_cpu, cpus) { |
3763 | /* | 3960 | /* |
3764 | * If this cpu gets work to do, stop the load balancing | 3961 | * If this cpu gets work to do, stop the load balancing |
3765 | * work being done for other cpus. Next load | 3962 | * work being done for other cpus. Next load |
@@ -3895,6 +4092,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime) | |||
3895 | cpustat->nice = cputime64_add(cpustat->nice, tmp); | 4092 | cpustat->nice = cputime64_add(cpustat->nice, tmp); |
3896 | else | 4093 | else |
3897 | cpustat->user = cputime64_add(cpustat->user, tmp); | 4094 | cpustat->user = cputime64_add(cpustat->user, tmp); |
4095 | /* Account for user time used */ | ||
4096 | acct_update_integrals(p); | ||
3898 | } | 4097 | } |
3899 | 4098 | ||
3900 | /* | 4099 | /* |
@@ -3995,6 +4194,65 @@ void account_steal_time(struct task_struct *p, cputime_t steal) | |||
3995 | } | 4194 | } |
3996 | 4195 | ||
3997 | /* | 4196 | /* |
4197 | * Use precise platform statistics if available: | ||
4198 | */ | ||
4199 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | ||
4200 | cputime_t task_utime(struct task_struct *p) | ||
4201 | { | ||
4202 | return p->utime; | ||
4203 | } | ||
4204 | |||
4205 | cputime_t task_stime(struct task_struct *p) | ||
4206 | { | ||
4207 | return p->stime; | ||
4208 | } | ||
4209 | #else | ||
4210 | cputime_t task_utime(struct task_struct *p) | ||
4211 | { | ||
4212 | clock_t utime = cputime_to_clock_t(p->utime), | ||
4213 | total = utime + cputime_to_clock_t(p->stime); | ||
4214 | u64 temp; | ||
4215 | |||
4216 | /* | ||
4217 | * Use CFS's precise accounting: | ||
4218 | */ | ||
4219 | temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); | ||
4220 | |||
4221 | if (total) { | ||
4222 | temp *= utime; | ||
4223 | do_div(temp, total); | ||
4224 | } | ||
4225 | utime = (clock_t)temp; | ||
4226 | |||
4227 | p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); | ||
4228 | return p->prev_utime; | ||
4229 | } | ||
4230 | |||
4231 | cputime_t task_stime(struct task_struct *p) | ||
4232 | { | ||
4233 | clock_t stime; | ||
4234 | |||
4235 | /* | ||
4236 | * Use CFS's precise accounting. (we subtract utime from | ||
4237 | * the total, to make sure the total observed by userspace | ||
4238 | * grows monotonically - apps rely on that): | ||
4239 | */ | ||
4240 | stime = nsec_to_clock_t(p->se.sum_exec_runtime) - | ||
4241 | cputime_to_clock_t(task_utime(p)); | ||
4242 | |||
4243 | if (stime >= 0) | ||
4244 | p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); | ||
4245 | |||
4246 | return p->prev_stime; | ||
4247 | } | ||
4248 | #endif | ||
4249 | |||
4250 | inline cputime_t task_gtime(struct task_struct *p) | ||
4251 | { | ||
4252 | return p->gtime; | ||
4253 | } | ||
4254 | |||
4255 | /* | ||
3998 | * This function gets called by the timer code, with HZ frequency. | 4256 | * This function gets called by the timer code, with HZ frequency. |
3999 | * We call it with interrupts disabled. | 4257 | * We call it with interrupts disabled. |
4000 | * | 4258 | * |
@@ -4021,26 +4279,44 @@ void scheduler_tick(void) | |||
4021 | #endif | 4279 | #endif |
4022 | } | 4280 | } |
4023 | 4281 | ||
4024 | #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) | 4282 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ |
4283 | defined(CONFIG_PREEMPT_TRACER)) | ||
4284 | |||
4285 | static inline unsigned long get_parent_ip(unsigned long addr) | ||
4286 | { | ||
4287 | if (in_lock_functions(addr)) { | ||
4288 | addr = CALLER_ADDR2; | ||
4289 | if (in_lock_functions(addr)) | ||
4290 | addr = CALLER_ADDR3; | ||
4291 | } | ||
4292 | return addr; | ||
4293 | } | ||
4025 | 4294 | ||
4026 | void __kprobes add_preempt_count(int val) | 4295 | void __kprobes add_preempt_count(int val) |
4027 | { | 4296 | { |
4297 | #ifdef CONFIG_DEBUG_PREEMPT | ||
4028 | /* | 4298 | /* |
4029 | * Underflow? | 4299 | * Underflow? |
4030 | */ | 4300 | */ |
4031 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) | 4301 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) |
4032 | return; | 4302 | return; |
4303 | #endif | ||
4033 | preempt_count() += val; | 4304 | preempt_count() += val; |
4305 | #ifdef CONFIG_DEBUG_PREEMPT | ||
4034 | /* | 4306 | /* |
4035 | * Spinlock count overflowing soon? | 4307 | * Spinlock count overflowing soon? |
4036 | */ | 4308 | */ |
4037 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= | 4309 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= |
4038 | PREEMPT_MASK - 10); | 4310 | PREEMPT_MASK - 10); |
4311 | #endif | ||
4312 | if (preempt_count() == val) | ||
4313 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | ||
4039 | } | 4314 | } |
4040 | EXPORT_SYMBOL(add_preempt_count); | 4315 | EXPORT_SYMBOL(add_preempt_count); |
4041 | 4316 | ||
4042 | void __kprobes sub_preempt_count(int val) | 4317 | void __kprobes sub_preempt_count(int val) |
4043 | { | 4318 | { |
4319 | #ifdef CONFIG_DEBUG_PREEMPT | ||
4044 | /* | 4320 | /* |
4045 | * Underflow? | 4321 | * Underflow? |
4046 | */ | 4322 | */ |
@@ -4052,7 +4328,10 @@ void __kprobes sub_preempt_count(int val) | |||
4052 | if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && | 4328 | if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && |
4053 | !(preempt_count() & PREEMPT_MASK))) | 4329 | !(preempt_count() & PREEMPT_MASK))) |
4054 | return; | 4330 | return; |
4331 | #endif | ||
4055 | 4332 | ||
4333 | if (preempt_count() == val) | ||
4334 | trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | ||
4056 | preempt_count() -= val; | 4335 | preempt_count() -= val; |
4057 | } | 4336 | } |
4058 | EXPORT_SYMBOL(sub_preempt_count); | 4337 | EXPORT_SYMBOL(sub_preempt_count); |
@@ -4070,6 +4349,7 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
4070 | prev->comm, prev->pid, preempt_count()); | 4349 | prev->comm, prev->pid, preempt_count()); |
4071 | 4350 | ||
4072 | debug_show_held_locks(prev); | 4351 | debug_show_held_locks(prev); |
4352 | print_modules(); | ||
4073 | if (irqs_disabled()) | 4353 | if (irqs_disabled()) |
4074 | print_irqtrace_events(prev); | 4354 | print_irqtrace_events(prev); |
4075 | 4355 | ||
@@ -4158,7 +4438,8 @@ need_resched_nonpreemptible: | |||
4158 | 4438 | ||
4159 | schedule_debug(prev); | 4439 | schedule_debug(prev); |
4160 | 4440 | ||
4161 | hrtick_clear(rq); | 4441 | if (sched_feat(HRTICK)) |
4442 | hrtick_clear(rq); | ||
4162 | 4443 | ||
4163 | /* | 4444 | /* |
4164 | * Do the rq-clock update outside the rq lock: | 4445 | * Do the rq-clock update outside the rq lock: |
@@ -4204,8 +4485,6 @@ need_resched_nonpreemptible: | |||
4204 | } else | 4485 | } else |
4205 | spin_unlock_irq(&rq->lock); | 4486 | spin_unlock_irq(&rq->lock); |
4206 | 4487 | ||
4207 | hrtick_set(rq); | ||
4208 | |||
4209 | if (unlikely(reacquire_kernel_lock(current) < 0)) | 4488 | if (unlikely(reacquire_kernel_lock(current) < 0)) |
4210 | goto need_resched_nonpreemptible; | 4489 | goto need_resched_nonpreemptible; |
4211 | 4490 | ||
@@ -4363,6 +4642,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) | |||
4363 | } | 4642 | } |
4364 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ | 4643 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ |
4365 | 4644 | ||
4645 | /** | ||
4646 | * complete: - signals a single thread waiting on this completion | ||
4647 | * @x: holds the state of this particular completion | ||
4648 | * | ||
4649 | * This will wake up a single thread waiting on this completion. Threads will be | ||
4650 | * awakened in the same order in which they were queued. | ||
4651 | * | ||
4652 | * See also complete_all(), wait_for_completion() and related routines. | ||
4653 | */ | ||
4366 | void complete(struct completion *x) | 4654 | void complete(struct completion *x) |
4367 | { | 4655 | { |
4368 | unsigned long flags; | 4656 | unsigned long flags; |
@@ -4374,6 +4662,12 @@ void complete(struct completion *x) | |||
4374 | } | 4662 | } |
4375 | EXPORT_SYMBOL(complete); | 4663 | EXPORT_SYMBOL(complete); |
4376 | 4664 | ||
4665 | /** | ||
4666 | * complete_all: - signals all threads waiting on this completion | ||
4667 | * @x: holds the state of this particular completion | ||
4668 | * | ||
4669 | * This will wake up all threads waiting on this particular completion event. | ||
4670 | */ | ||
4377 | void complete_all(struct completion *x) | 4671 | void complete_all(struct completion *x) |
4378 | { | 4672 | { |
4379 | unsigned long flags; | 4673 | unsigned long flags; |
@@ -4394,10 +4688,7 @@ do_wait_for_common(struct completion *x, long timeout, int state) | |||
4394 | wait.flags |= WQ_FLAG_EXCLUSIVE; | 4688 | wait.flags |= WQ_FLAG_EXCLUSIVE; |
4395 | __add_wait_queue_tail(&x->wait, &wait); | 4689 | __add_wait_queue_tail(&x->wait, &wait); |
4396 | do { | 4690 | do { |
4397 | if ((state == TASK_INTERRUPTIBLE && | 4691 | if (signal_pending_state(state, current)) { |
4398 | signal_pending(current)) || | ||
4399 | (state == TASK_KILLABLE && | ||
4400 | fatal_signal_pending(current))) { | ||
4401 | timeout = -ERESTARTSYS; | 4692 | timeout = -ERESTARTSYS; |
4402 | break; | 4693 | break; |
4403 | } | 4694 | } |
@@ -4425,12 +4716,31 @@ wait_for_common(struct completion *x, long timeout, int state) | |||
4425 | return timeout; | 4716 | return timeout; |
4426 | } | 4717 | } |
4427 | 4718 | ||
4719 | /** | ||
4720 | * wait_for_completion: - waits for completion of a task | ||
4721 | * @x: holds the state of this particular completion | ||
4722 | * | ||
4723 | * This waits to be signaled for completion of a specific task. It is NOT | ||
4724 | * interruptible and there is no timeout. | ||
4725 | * | ||
4726 | * See also similar routines (i.e. wait_for_completion_timeout()) with timeout | ||
4727 | * and interrupt capability. Also see complete(). | ||
4728 | */ | ||
4428 | void __sched wait_for_completion(struct completion *x) | 4729 | void __sched wait_for_completion(struct completion *x) |
4429 | { | 4730 | { |
4430 | wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); | 4731 | wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); |
4431 | } | 4732 | } |
4432 | EXPORT_SYMBOL(wait_for_completion); | 4733 | EXPORT_SYMBOL(wait_for_completion); |
4433 | 4734 | ||
4735 | /** | ||
4736 | * wait_for_completion_timeout: - waits for completion of a task (w/timeout) | ||
4737 | * @x: holds the state of this particular completion | ||
4738 | * @timeout: timeout value in jiffies | ||
4739 | * | ||
4740 | * This waits for either a completion of a specific task to be signaled or for a | ||
4741 | * specified timeout to expire. The timeout is in jiffies. It is not | ||
4742 | * interruptible. | ||
4743 | */ | ||
4434 | unsigned long __sched | 4744 | unsigned long __sched |
4435 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) | 4745 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) |
4436 | { | 4746 | { |
@@ -4438,6 +4748,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout) | |||
4438 | } | 4748 | } |
4439 | EXPORT_SYMBOL(wait_for_completion_timeout); | 4749 | EXPORT_SYMBOL(wait_for_completion_timeout); |
4440 | 4750 | ||
4751 | /** | ||
4752 | * wait_for_completion_interruptible: - waits for completion of a task (w/intr) | ||
4753 | * @x: holds the state of this particular completion | ||
4754 | * | ||
4755 | * This waits for completion of a specific task to be signaled. It is | ||
4756 | * interruptible. | ||
4757 | */ | ||
4441 | int __sched wait_for_completion_interruptible(struct completion *x) | 4758 | int __sched wait_for_completion_interruptible(struct completion *x) |
4442 | { | 4759 | { |
4443 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); | 4760 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); |
@@ -4447,6 +4764,14 @@ int __sched wait_for_completion_interruptible(struct completion *x) | |||
4447 | } | 4764 | } |
4448 | EXPORT_SYMBOL(wait_for_completion_interruptible); | 4765 | EXPORT_SYMBOL(wait_for_completion_interruptible); |
4449 | 4766 | ||
4767 | /** | ||
4768 | * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) | ||
4769 | * @x: holds the state of this particular completion | ||
4770 | * @timeout: timeout value in jiffies | ||
4771 | * | ||
4772 | * This waits for either a completion of a specific task to be signaled or for a | ||
4773 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. | ||
4774 | */ | ||
4450 | unsigned long __sched | 4775 | unsigned long __sched |
4451 | wait_for_completion_interruptible_timeout(struct completion *x, | 4776 | wait_for_completion_interruptible_timeout(struct completion *x, |
4452 | unsigned long timeout) | 4777 | unsigned long timeout) |
@@ -4455,6 +4780,13 @@ wait_for_completion_interruptible_timeout(struct completion *x, | |||
4455 | } | 4780 | } |
4456 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | 4781 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); |
4457 | 4782 | ||
4783 | /** | ||
4784 | * wait_for_completion_killable: - waits for completion of a task (killable) | ||
4785 | * @x: holds the state of this particular completion | ||
4786 | * | ||
4787 | * This waits to be signaled for completion of a specific task. It can be | ||
4788 | * interrupted by a kill signal. | ||
4789 | */ | ||
4458 | int __sched wait_for_completion_killable(struct completion *x) | 4790 | int __sched wait_for_completion_killable(struct completion *x) |
4459 | { | 4791 | { |
4460 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); | 4792 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); |
@@ -4464,6 +4796,52 @@ int __sched wait_for_completion_killable(struct completion *x) | |||
4464 | } | 4796 | } |
4465 | EXPORT_SYMBOL(wait_for_completion_killable); | 4797 | EXPORT_SYMBOL(wait_for_completion_killable); |
4466 | 4798 | ||
4799 | /** | ||
4800 | * try_wait_for_completion - try to decrement a completion without blocking | ||
4801 | * @x: completion structure | ||
4802 | * | ||
4803 | * Returns: 0 if a decrement cannot be done without blocking | ||
4804 | * 1 if a decrement succeeded. | ||
4805 | * | ||
4806 | * If a completion is being used as a counting completion, | ||
4807 | * attempt to decrement the counter without blocking. This | ||
4808 | * enables us to avoid waiting if the resource the completion | ||
4809 | * is protecting is not available. | ||
4810 | */ | ||
4811 | bool try_wait_for_completion(struct completion *x) | ||
4812 | { | ||
4813 | int ret = 1; | ||
4814 | |||
4815 | spin_lock_irq(&x->wait.lock); | ||
4816 | if (!x->done) | ||
4817 | ret = 0; | ||
4818 | else | ||
4819 | x->done--; | ||
4820 | spin_unlock_irq(&x->wait.lock); | ||
4821 | return ret; | ||
4822 | } | ||
4823 | EXPORT_SYMBOL(try_wait_for_completion); | ||
4824 | |||
4825 | /** | ||
4826 | * completion_done - Test to see if a completion has any waiters | ||
4827 | * @x: completion structure | ||
4828 | * | ||
4829 | * Returns: 0 if there are waiters (wait_for_completion() in progress) | ||
4830 | * 1 if there are no waiters. | ||
4831 | * | ||
4832 | */ | ||
4833 | bool completion_done(struct completion *x) | ||
4834 | { | ||
4835 | int ret = 1; | ||
4836 | |||
4837 | spin_lock_irq(&x->wait.lock); | ||
4838 | if (!x->done) | ||
4839 | ret = 0; | ||
4840 | spin_unlock_irq(&x->wait.lock); | ||
4841 | return ret; | ||
4842 | } | ||
4843 | EXPORT_SYMBOL(completion_done); | ||
4844 | |||
4467 | static long __sched | 4845 | static long __sched |
4468 | sleep_on_common(wait_queue_head_t *q, int state, long timeout) | 4846 | sleep_on_common(wait_queue_head_t *q, int state, long timeout) |
4469 | { | 4847 | { |
@@ -4586,10 +4964,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4586 | goto out_unlock; | 4964 | goto out_unlock; |
4587 | } | 4965 | } |
4588 | on_rq = p->se.on_rq; | 4966 | on_rq = p->se.on_rq; |
4589 | if (on_rq) { | 4967 | if (on_rq) |
4590 | dequeue_task(rq, p, 0); | 4968 | dequeue_task(rq, p, 0); |
4591 | dec_load(rq, p); | ||
4592 | } | ||
4593 | 4969 | ||
4594 | p->static_prio = NICE_TO_PRIO(nice); | 4970 | p->static_prio = NICE_TO_PRIO(nice); |
4595 | set_load_weight(p); | 4971 | set_load_weight(p); |
@@ -4599,7 +4975,6 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4599 | 4975 | ||
4600 | if (on_rq) { | 4976 | if (on_rq) { |
4601 | enqueue_task(rq, p, 0); | 4977 | enqueue_task(rq, p, 0); |
4602 | inc_load(rq, p); | ||
4603 | /* | 4978 | /* |
4604 | * If the task increased its priority or is running and | 4979 | * If the task increased its priority or is running and |
4605 | * lowered its priority, then reschedule its CPU: | 4980 | * lowered its priority, then reschedule its CPU: |
@@ -4744,16 +5119,8 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | |||
4744 | set_load_weight(p); | 5119 | set_load_weight(p); |
4745 | } | 5120 | } |
4746 | 5121 | ||
4747 | /** | 5122 | static int __sched_setscheduler(struct task_struct *p, int policy, |
4748 | * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. | 5123 | struct sched_param *param, bool user) |
4749 | * @p: the task in question. | ||
4750 | * @policy: new policy. | ||
4751 | * @param: structure containing the new RT priority. | ||
4752 | * | ||
4753 | * NOTE that the task may be already dead. | ||
4754 | */ | ||
4755 | int sched_setscheduler(struct task_struct *p, int policy, | ||
4756 | struct sched_param *param) | ||
4757 | { | 5124 | { |
4758 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 5125 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
4759 | unsigned long flags; | 5126 | unsigned long flags; |
@@ -4785,7 +5152,7 @@ recheck: | |||
4785 | /* | 5152 | /* |
4786 | * Allow unprivileged RT tasks to decrease priority: | 5153 | * Allow unprivileged RT tasks to decrease priority: |
4787 | */ | 5154 | */ |
4788 | if (!capable(CAP_SYS_NICE)) { | 5155 | if (user && !capable(CAP_SYS_NICE)) { |
4789 | if (rt_policy(policy)) { | 5156 | if (rt_policy(policy)) { |
4790 | unsigned long rlim_rtprio; | 5157 | unsigned long rlim_rtprio; |
4791 | 5158 | ||
@@ -4816,18 +5183,22 @@ recheck: | |||
4816 | return -EPERM; | 5183 | return -EPERM; |
4817 | } | 5184 | } |
4818 | 5185 | ||
5186 | if (user) { | ||
4819 | #ifdef CONFIG_RT_GROUP_SCHED | 5187 | #ifdef CONFIG_RT_GROUP_SCHED |
4820 | /* | 5188 | /* |
4821 | * Do not allow realtime tasks into groups that have no runtime | 5189 | * Do not allow realtime tasks into groups that have no runtime |
4822 | * assigned. | 5190 | * assigned. |
4823 | */ | 5191 | */ |
4824 | if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) | 5192 | if (rt_bandwidth_enabled() && rt_policy(policy) && |
4825 | return -EPERM; | 5193 | task_group(p)->rt_bandwidth.rt_runtime == 0) |
5194 | return -EPERM; | ||
4826 | #endif | 5195 | #endif |
4827 | 5196 | ||
4828 | retval = security_task_setscheduler(p, policy, param); | 5197 | retval = security_task_setscheduler(p, policy, param); |
4829 | if (retval) | 5198 | if (retval) |
4830 | return retval; | 5199 | return retval; |
5200 | } | ||
5201 | |||
4831 | /* | 5202 | /* |
4832 | * make sure no PI-waiters arrive (or leave) while we are | 5203 | * make sure no PI-waiters arrive (or leave) while we are |
4833 | * changing the priority of the task: | 5204 | * changing the priority of the task: |
@@ -4870,8 +5241,39 @@ recheck: | |||
4870 | 5241 | ||
4871 | return 0; | 5242 | return 0; |
4872 | } | 5243 | } |
5244 | |||
5245 | /** | ||
5246 | * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. | ||
5247 | * @p: the task in question. | ||
5248 | * @policy: new policy. | ||
5249 | * @param: structure containing the new RT priority. | ||
5250 | * | ||
5251 | * NOTE that the task may be already dead. | ||
5252 | */ | ||
5253 | int sched_setscheduler(struct task_struct *p, int policy, | ||
5254 | struct sched_param *param) | ||
5255 | { | ||
5256 | return __sched_setscheduler(p, policy, param, true); | ||
5257 | } | ||
4873 | EXPORT_SYMBOL_GPL(sched_setscheduler); | 5258 | EXPORT_SYMBOL_GPL(sched_setscheduler); |
4874 | 5259 | ||
5260 | /** | ||
5261 | * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. | ||
5262 | * @p: the task in question. | ||
5263 | * @policy: new policy. | ||
5264 | * @param: structure containing the new RT priority. | ||
5265 | * | ||
5266 | * Just like sched_setscheduler, only don't bother checking if the | ||
5267 | * current context has permission. For example, this is needed in | ||
5268 | * stop_machine(): we create temporary high priority worker threads, | ||
5269 | * but our caller might not have that capability. | ||
5270 | */ | ||
5271 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, | ||
5272 | struct sched_param *param) | ||
5273 | { | ||
5274 | return __sched_setscheduler(p, policy, param, false); | ||
5275 | } | ||
5276 | |||
4875 | static int | 5277 | static int |
4876 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | 5278 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) |
4877 | { | 5279 | { |
@@ -5070,24 +5472,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, | |||
5070 | return sched_setaffinity(pid, &new_mask); | 5472 | return sched_setaffinity(pid, &new_mask); |
5071 | } | 5473 | } |
5072 | 5474 | ||
5073 | /* | ||
5074 | * Represents all cpu's present in the system | ||
5075 | * In systems capable of hotplug, this map could dynamically grow | ||
5076 | * as new cpu's are detected in the system via any platform specific | ||
5077 | * method, such as ACPI for e.g. | ||
5078 | */ | ||
5079 | |||
5080 | cpumask_t cpu_present_map __read_mostly; | ||
5081 | EXPORT_SYMBOL(cpu_present_map); | ||
5082 | |||
5083 | #ifndef CONFIG_SMP | ||
5084 | cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; | ||
5085 | EXPORT_SYMBOL(cpu_online_map); | ||
5086 | |||
5087 | cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; | ||
5088 | EXPORT_SYMBOL(cpu_possible_map); | ||
5089 | #endif | ||
5090 | |||
5091 | long sched_getaffinity(pid_t pid, cpumask_t *mask) | 5475 | long sched_getaffinity(pid_t pid, cpumask_t *mask) |
5092 | { | 5476 | { |
5093 | struct task_struct *p; | 5477 | struct task_struct *p; |
@@ -5384,7 +5768,7 @@ out_unlock: | |||
5384 | return retval; | 5768 | return retval; |
5385 | } | 5769 | } |
5386 | 5770 | ||
5387 | static const char stat_nam[] = "RSDTtZX"; | 5771 | static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; |
5388 | 5772 | ||
5389 | void sched_show_task(struct task_struct *p) | 5773 | void sched_show_task(struct task_struct *p) |
5390 | { | 5774 | { |
@@ -5525,6 +5909,8 @@ static inline void sched_init_granularity(void) | |||
5525 | sysctl_sched_latency = limit; | 5909 | sysctl_sched_latency = limit; |
5526 | 5910 | ||
5527 | sysctl_sched_wakeup_granularity *= factor; | 5911 | sysctl_sched_wakeup_granularity *= factor; |
5912 | |||
5913 | sysctl_sched_shares_ratelimit *= factor; | ||
5528 | } | 5914 | } |
5529 | 5915 | ||
5530 | #ifdef CONFIG_SMP | 5916 | #ifdef CONFIG_SMP |
@@ -5566,6 +5952,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) | |||
5566 | goto out; | 5952 | goto out; |
5567 | } | 5953 | } |
5568 | 5954 | ||
5955 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && | ||
5956 | !cpus_equal(p->cpus_allowed, *new_mask))) { | ||
5957 | ret = -EINVAL; | ||
5958 | goto out; | ||
5959 | } | ||
5960 | |||
5569 | if (p->sched_class->set_cpus_allowed) | 5961 | if (p->sched_class->set_cpus_allowed) |
5570 | p->sched_class->set_cpus_allowed(p, new_mask); | 5962 | p->sched_class->set_cpus_allowed(p, new_mask); |
5571 | else { | 5963 | else { |
@@ -5608,7 +6000,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5608 | struct rq *rq_dest, *rq_src; | 6000 | struct rq *rq_dest, *rq_src; |
5609 | int ret = 0, on_rq; | 6001 | int ret = 0, on_rq; |
5610 | 6002 | ||
5611 | if (unlikely(cpu_is_offline(dest_cpu))) | 6003 | if (unlikely(!cpu_active(dest_cpu))) |
5612 | return ret; | 6004 | return ret; |
5613 | 6005 | ||
5614 | rq_src = cpu_rq(src_cpu); | 6006 | rq_src = cpu_rq(src_cpu); |
@@ -5617,10 +6009,10 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5617 | double_rq_lock(rq_src, rq_dest); | 6009 | double_rq_lock(rq_src, rq_dest); |
5618 | /* Already moved. */ | 6010 | /* Already moved. */ |
5619 | if (task_cpu(p) != src_cpu) | 6011 | if (task_cpu(p) != src_cpu) |
5620 | goto out; | 6012 | goto done; |
5621 | /* Affinity changed (again). */ | 6013 | /* Affinity changed (again). */ |
5622 | if (!cpu_isset(dest_cpu, p->cpus_allowed)) | 6014 | if (!cpu_isset(dest_cpu, p->cpus_allowed)) |
5623 | goto out; | 6015 | goto fail; |
5624 | 6016 | ||
5625 | on_rq = p->se.on_rq; | 6017 | on_rq = p->se.on_rq; |
5626 | if (on_rq) | 6018 | if (on_rq) |
@@ -5629,10 +6021,11 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5629 | set_task_cpu(p, dest_cpu); | 6021 | set_task_cpu(p, dest_cpu); |
5630 | if (on_rq) { | 6022 | if (on_rq) { |
5631 | activate_task(rq_dest, p, 0); | 6023 | activate_task(rq_dest, p, 0); |
5632 | check_preempt_curr(rq_dest, p); | 6024 | check_preempt_curr(rq_dest, p, 0); |
5633 | } | 6025 | } |
6026 | done: | ||
5634 | ret = 1; | 6027 | ret = 1; |
5635 | out: | 6028 | fail: |
5636 | double_rq_unlock(rq_src, rq_dest); | 6029 | double_rq_unlock(rq_src, rq_dest); |
5637 | return ret; | 6030 | return ret; |
5638 | } | 6031 | } |
@@ -5882,6 +6275,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu) | |||
5882 | next = pick_next_task(rq, rq->curr); | 6275 | next = pick_next_task(rq, rq->curr); |
5883 | if (!next) | 6276 | if (!next) |
5884 | break; | 6277 | break; |
6278 | next->sched_class->put_prev_task(rq, next); | ||
5885 | migrate_dead(dead_cpu, next); | 6279 | migrate_dead(dead_cpu, next); |
5886 | 6280 | ||
5887 | } | 6281 | } |
@@ -5952,7 +6346,7 @@ set_table_entry(struct ctl_table *entry, | |||
5952 | static struct ctl_table * | 6346 | static struct ctl_table * |
5953 | sd_alloc_ctl_domain_table(struct sched_domain *sd) | 6347 | sd_alloc_ctl_domain_table(struct sched_domain *sd) |
5954 | { | 6348 | { |
5955 | struct ctl_table *table = sd_alloc_ctl_entry(12); | 6349 | struct ctl_table *table = sd_alloc_ctl_entry(13); |
5956 | 6350 | ||
5957 | if (table == NULL) | 6351 | if (table == NULL) |
5958 | return NULL; | 6352 | return NULL; |
@@ -5980,7 +6374,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) | |||
5980 | sizeof(int), 0644, proc_dointvec_minmax); | 6374 | sizeof(int), 0644, proc_dointvec_minmax); |
5981 | set_table_entry(&table[10], "flags", &sd->flags, | 6375 | set_table_entry(&table[10], "flags", &sd->flags, |
5982 | sizeof(int), 0644, proc_dointvec_minmax); | 6376 | sizeof(int), 0644, proc_dointvec_minmax); |
5983 | /* &table[11] is terminator */ | 6377 | set_table_entry(&table[11], "name", sd->name, |
6378 | CORENAME_MAX_SIZE, 0444, proc_dostring); | ||
6379 | /* &table[12] is terminator */ | ||
5984 | 6380 | ||
5985 | return table; | 6381 | return table; |
5986 | } | 6382 | } |
@@ -6053,6 +6449,36 @@ static void unregister_sched_domain_sysctl(void) | |||
6053 | } | 6449 | } |
6054 | #endif | 6450 | #endif |
6055 | 6451 | ||
6452 | static void set_rq_online(struct rq *rq) | ||
6453 | { | ||
6454 | if (!rq->online) { | ||
6455 | const struct sched_class *class; | ||
6456 | |||
6457 | cpu_set(rq->cpu, rq->rd->online); | ||
6458 | rq->online = 1; | ||
6459 | |||
6460 | for_each_class(class) { | ||
6461 | if (class->rq_online) | ||
6462 | class->rq_online(rq); | ||
6463 | } | ||
6464 | } | ||
6465 | } | ||
6466 | |||
6467 | static void set_rq_offline(struct rq *rq) | ||
6468 | { | ||
6469 | if (rq->online) { | ||
6470 | const struct sched_class *class; | ||
6471 | |||
6472 | for_each_class(class) { | ||
6473 | if (class->rq_offline) | ||
6474 | class->rq_offline(rq); | ||
6475 | } | ||
6476 | |||
6477 | cpu_clear(rq->cpu, rq->rd->online); | ||
6478 | rq->online = 0; | ||
6479 | } | ||
6480 | } | ||
6481 | |||
6056 | /* | 6482 | /* |
6057 | * migration_call - callback that gets triggered when a CPU is added. | 6483 | * migration_call - callback that gets triggered when a CPU is added. |
6058 | * Here we can start up the necessary migration thread for the new CPU. | 6484 | * Here we can start up the necessary migration thread for the new CPU. |
@@ -6090,7 +6516,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6090 | spin_lock_irqsave(&rq->lock, flags); | 6516 | spin_lock_irqsave(&rq->lock, flags); |
6091 | if (rq->rd) { | 6517 | if (rq->rd) { |
6092 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); | 6518 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); |
6093 | cpu_set(cpu, rq->rd->online); | 6519 | |
6520 | set_rq_online(rq); | ||
6094 | } | 6521 | } |
6095 | spin_unlock_irqrestore(&rq->lock, flags); | 6522 | spin_unlock_irqrestore(&rq->lock, flags); |
6096 | break; | 6523 | break; |
@@ -6151,7 +6578,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6151 | spin_lock_irqsave(&rq->lock, flags); | 6578 | spin_lock_irqsave(&rq->lock, flags); |
6152 | if (rq->rd) { | 6579 | if (rq->rd) { |
6153 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); | 6580 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); |
6154 | cpu_clear(cpu, rq->rd->online); | 6581 | set_rq_offline(rq); |
6155 | } | 6582 | } |
6156 | spin_unlock_irqrestore(&rq->lock, flags); | 6583 | spin_unlock_irqrestore(&rq->lock, flags); |
6157 | break; | 6584 | break; |
@@ -6168,7 +6595,7 @@ static struct notifier_block __cpuinitdata migration_notifier = { | |||
6168 | .priority = 10 | 6595 | .priority = 10 |
6169 | }; | 6596 | }; |
6170 | 6597 | ||
6171 | void __init migration_init(void) | 6598 | static int __init migration_init(void) |
6172 | { | 6599 | { |
6173 | void *cpu = (void *)(long)smp_processor_id(); | 6600 | void *cpu = (void *)(long)smp_processor_id(); |
6174 | int err; | 6601 | int err; |
@@ -6178,13 +6605,38 @@ void __init migration_init(void) | |||
6178 | BUG_ON(err == NOTIFY_BAD); | 6605 | BUG_ON(err == NOTIFY_BAD); |
6179 | migration_call(&migration_notifier, CPU_ONLINE, cpu); | 6606 | migration_call(&migration_notifier, CPU_ONLINE, cpu); |
6180 | register_cpu_notifier(&migration_notifier); | 6607 | register_cpu_notifier(&migration_notifier); |
6608 | |||
6609 | return err; | ||
6181 | } | 6610 | } |
6611 | early_initcall(migration_init); | ||
6182 | #endif | 6612 | #endif |
6183 | 6613 | ||
6184 | #ifdef CONFIG_SMP | 6614 | #ifdef CONFIG_SMP |
6185 | 6615 | ||
6186 | #ifdef CONFIG_SCHED_DEBUG | 6616 | #ifdef CONFIG_SCHED_DEBUG |
6187 | 6617 | ||
6618 | static inline const char *sd_level_to_string(enum sched_domain_level lvl) | ||
6619 | { | ||
6620 | switch (lvl) { | ||
6621 | case SD_LV_NONE: | ||
6622 | return "NONE"; | ||
6623 | case SD_LV_SIBLING: | ||
6624 | return "SIBLING"; | ||
6625 | case SD_LV_MC: | ||
6626 | return "MC"; | ||
6627 | case SD_LV_CPU: | ||
6628 | return "CPU"; | ||
6629 | case SD_LV_NODE: | ||
6630 | return "NODE"; | ||
6631 | case SD_LV_ALLNODES: | ||
6632 | return "ALLNODES"; | ||
6633 | case SD_LV_MAX: | ||
6634 | return "MAX"; | ||
6635 | |||
6636 | } | ||
6637 | return "MAX"; | ||
6638 | } | ||
6639 | |||
6188 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | 6640 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, |
6189 | cpumask_t *groupmask) | 6641 | cpumask_t *groupmask) |
6190 | { | 6642 | { |
@@ -6204,7 +6656,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
6204 | return -1; | 6656 | return -1; |
6205 | } | 6657 | } |
6206 | 6658 | ||
6207 | printk(KERN_CONT "span %s\n", str); | 6659 | printk(KERN_CONT "span %s level %s\n", |
6660 | str, sd_level_to_string(sd->level)); | ||
6208 | 6661 | ||
6209 | if (!cpu_isset(cpu, sd->span)) { | 6662 | if (!cpu_isset(cpu, sd->span)) { |
6210 | printk(KERN_ERR "ERROR: domain->span does not contain " | 6663 | printk(KERN_ERR "ERROR: domain->span does not contain " |
@@ -6288,9 +6741,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
6288 | } | 6741 | } |
6289 | kfree(groupmask); | 6742 | kfree(groupmask); |
6290 | } | 6743 | } |
6291 | #else | 6744 | #else /* !CONFIG_SCHED_DEBUG */ |
6292 | # define sched_domain_debug(sd, cpu) do { } while (0) | 6745 | # define sched_domain_debug(sd, cpu) do { } while (0) |
6293 | #endif | 6746 | #endif /* CONFIG_SCHED_DEBUG */ |
6294 | 6747 | ||
6295 | static int sd_degenerate(struct sched_domain *sd) | 6748 | static int sd_degenerate(struct sched_domain *sd) |
6296 | { | 6749 | { |
@@ -6350,20 +6803,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
6350 | static void rq_attach_root(struct rq *rq, struct root_domain *rd) | 6803 | static void rq_attach_root(struct rq *rq, struct root_domain *rd) |
6351 | { | 6804 | { |
6352 | unsigned long flags; | 6805 | unsigned long flags; |
6353 | const struct sched_class *class; | ||
6354 | 6806 | ||
6355 | spin_lock_irqsave(&rq->lock, flags); | 6807 | spin_lock_irqsave(&rq->lock, flags); |
6356 | 6808 | ||
6357 | if (rq->rd) { | 6809 | if (rq->rd) { |
6358 | struct root_domain *old_rd = rq->rd; | 6810 | struct root_domain *old_rd = rq->rd; |
6359 | 6811 | ||
6360 | for (class = sched_class_highest; class; class = class->next) { | 6812 | if (cpu_isset(rq->cpu, old_rd->online)) |
6361 | if (class->leave_domain) | 6813 | set_rq_offline(rq); |
6362 | class->leave_domain(rq); | ||
6363 | } | ||
6364 | 6814 | ||
6365 | cpu_clear(rq->cpu, old_rd->span); | 6815 | cpu_clear(rq->cpu, old_rd->span); |
6366 | cpu_clear(rq->cpu, old_rd->online); | ||
6367 | 6816 | ||
6368 | if (atomic_dec_and_test(&old_rd->refcount)) | 6817 | if (atomic_dec_and_test(&old_rd->refcount)) |
6369 | kfree(old_rd); | 6818 | kfree(old_rd); |
@@ -6374,12 +6823,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
6374 | 6823 | ||
6375 | cpu_set(rq->cpu, rd->span); | 6824 | cpu_set(rq->cpu, rd->span); |
6376 | if (cpu_isset(rq->cpu, cpu_online_map)) | 6825 | if (cpu_isset(rq->cpu, cpu_online_map)) |
6377 | cpu_set(rq->cpu, rd->online); | 6826 | set_rq_online(rq); |
6378 | |||
6379 | for (class = sched_class_highest; class; class = class->next) { | ||
6380 | if (class->join_domain) | ||
6381 | class->join_domain(rq); | ||
6382 | } | ||
6383 | 6827 | ||
6384 | spin_unlock_irqrestore(&rq->lock, flags); | 6828 | spin_unlock_irqrestore(&rq->lock, flags); |
6385 | } | 6829 | } |
@@ -6390,6 +6834,8 @@ static void init_rootdomain(struct root_domain *rd) | |||
6390 | 6834 | ||
6391 | cpus_clear(rd->span); | 6835 | cpus_clear(rd->span); |
6392 | cpus_clear(rd->online); | 6836 | cpus_clear(rd->online); |
6837 | |||
6838 | cpupri_init(&rd->cpupri); | ||
6393 | } | 6839 | } |
6394 | 6840 | ||
6395 | static void init_defrootdomain(void) | 6841 | static void init_defrootdomain(void) |
@@ -6451,7 +6897,8 @@ static cpumask_t cpu_isolated_map = CPU_MASK_NONE; | |||
6451 | /* Setup the mask of cpus configured for isolated domains */ | 6897 | /* Setup the mask of cpus configured for isolated domains */ |
6452 | static int __init isolated_cpu_setup(char *str) | 6898 | static int __init isolated_cpu_setup(char *str) |
6453 | { | 6899 | { |
6454 | int ints[NR_CPUS], i; | 6900 | static int __initdata ints[NR_CPUS]; |
6901 | int i; | ||
6455 | 6902 | ||
6456 | str = get_options(str, ARRAY_SIZE(ints), ints); | 6903 | str = get_options(str, ARRAY_SIZE(ints), ints); |
6457 | cpus_clear(cpu_isolated_map); | 6904 | cpus_clear(cpu_isolated_map); |
@@ -6485,7 +6932,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, | |||
6485 | 6932 | ||
6486 | cpus_clear(*covered); | 6933 | cpus_clear(*covered); |
6487 | 6934 | ||
6488 | for_each_cpu_mask(i, *span) { | 6935 | for_each_cpu_mask_nr(i, *span) { |
6489 | struct sched_group *sg; | 6936 | struct sched_group *sg; |
6490 | int group = group_fn(i, cpu_map, &sg, tmpmask); | 6937 | int group = group_fn(i, cpu_map, &sg, tmpmask); |
6491 | int j; | 6938 | int j; |
@@ -6496,7 +6943,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, | |||
6496 | cpus_clear(sg->cpumask); | 6943 | cpus_clear(sg->cpumask); |
6497 | sg->__cpu_power = 0; | 6944 | sg->__cpu_power = 0; |
6498 | 6945 | ||
6499 | for_each_cpu_mask(j, *span) { | 6946 | for_each_cpu_mask_nr(j, *span) { |
6500 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) | 6947 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) |
6501 | continue; | 6948 | continue; |
6502 | 6949 | ||
@@ -6532,9 +6979,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) | |||
6532 | 6979 | ||
6533 | min_val = INT_MAX; | 6980 | min_val = INT_MAX; |
6534 | 6981 | ||
6535 | for (i = 0; i < MAX_NUMNODES; i++) { | 6982 | for (i = 0; i < nr_node_ids; i++) { |
6536 | /* Start at @node */ | 6983 | /* Start at @node */ |
6537 | n = (node + i) % MAX_NUMNODES; | 6984 | n = (node + i) % nr_node_ids; |
6538 | 6985 | ||
6539 | if (!nr_cpus_node(n)) | 6986 | if (!nr_cpus_node(n)) |
6540 | continue; | 6987 | continue; |
@@ -6584,7 +7031,7 @@ static void sched_domain_node_span(int node, cpumask_t *span) | |||
6584 | cpus_or(*span, *span, *nodemask); | 7031 | cpus_or(*span, *span, *nodemask); |
6585 | } | 7032 | } |
6586 | } | 7033 | } |
6587 | #endif | 7034 | #endif /* CONFIG_NUMA */ |
6588 | 7035 | ||
6589 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | 7036 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; |
6590 | 7037 | ||
@@ -6603,7 +7050,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, | |||
6603 | *sg = &per_cpu(sched_group_cpus, cpu); | 7050 | *sg = &per_cpu(sched_group_cpus, cpu); |
6604 | return cpu; | 7051 | return cpu; |
6605 | } | 7052 | } |
6606 | #endif | 7053 | #endif /* CONFIG_SCHED_SMT */ |
6607 | 7054 | ||
6608 | /* | 7055 | /* |
6609 | * multi-core sched-domains: | 7056 | * multi-core sched-domains: |
@@ -6611,7 +7058,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, | |||
6611 | #ifdef CONFIG_SCHED_MC | 7058 | #ifdef CONFIG_SCHED_MC |
6612 | static DEFINE_PER_CPU(struct sched_domain, core_domains); | 7059 | static DEFINE_PER_CPU(struct sched_domain, core_domains); |
6613 | static DEFINE_PER_CPU(struct sched_group, sched_group_core); | 7060 | static DEFINE_PER_CPU(struct sched_group, sched_group_core); |
6614 | #endif | 7061 | #endif /* CONFIG_SCHED_MC */ |
6615 | 7062 | ||
6616 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 7063 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) |
6617 | static int | 7064 | static int |
@@ -6696,7 +7143,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) | |||
6696 | if (!sg) | 7143 | if (!sg) |
6697 | return; | 7144 | return; |
6698 | do { | 7145 | do { |
6699 | for_each_cpu_mask(j, sg->cpumask) { | 7146 | for_each_cpu_mask_nr(j, sg->cpumask) { |
6700 | struct sched_domain *sd; | 7147 | struct sched_domain *sd; |
6701 | 7148 | ||
6702 | sd = &per_cpu(phys_domains, j); | 7149 | sd = &per_cpu(phys_domains, j); |
@@ -6713,7 +7160,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) | |||
6713 | sg = sg->next; | 7160 | sg = sg->next; |
6714 | } while (sg != group_head); | 7161 | } while (sg != group_head); |
6715 | } | 7162 | } |
6716 | #endif | 7163 | #endif /* CONFIG_NUMA */ |
6717 | 7164 | ||
6718 | #ifdef CONFIG_NUMA | 7165 | #ifdef CONFIG_NUMA |
6719 | /* Free memory allocated for various sched_group structures */ | 7166 | /* Free memory allocated for various sched_group structures */ |
@@ -6721,14 +7168,14 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) | |||
6721 | { | 7168 | { |
6722 | int cpu, i; | 7169 | int cpu, i; |
6723 | 7170 | ||
6724 | for_each_cpu_mask(cpu, *cpu_map) { | 7171 | for_each_cpu_mask_nr(cpu, *cpu_map) { |
6725 | struct sched_group **sched_group_nodes | 7172 | struct sched_group **sched_group_nodes |
6726 | = sched_group_nodes_bycpu[cpu]; | 7173 | = sched_group_nodes_bycpu[cpu]; |
6727 | 7174 | ||
6728 | if (!sched_group_nodes) | 7175 | if (!sched_group_nodes) |
6729 | continue; | 7176 | continue; |
6730 | 7177 | ||
6731 | for (i = 0; i < MAX_NUMNODES; i++) { | 7178 | for (i = 0; i < nr_node_ids; i++) { |
6732 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | 7179 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; |
6733 | 7180 | ||
6734 | *nodemask = node_to_cpumask(i); | 7181 | *nodemask = node_to_cpumask(i); |
@@ -6750,11 +7197,11 @@ next_sg: | |||
6750 | sched_group_nodes_bycpu[cpu] = NULL; | 7197 | sched_group_nodes_bycpu[cpu] = NULL; |
6751 | } | 7198 | } |
6752 | } | 7199 | } |
6753 | #else | 7200 | #else /* !CONFIG_NUMA */ |
6754 | static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) | 7201 | static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) |
6755 | { | 7202 | { |
6756 | } | 7203 | } |
6757 | #endif | 7204 | #endif /* CONFIG_NUMA */ |
6758 | 7205 | ||
6759 | /* | 7206 | /* |
6760 | * Initialize sched groups cpu_power. | 7207 | * Initialize sched groups cpu_power. |
@@ -6813,13 +7260,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
6813 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains() | 7260 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains() |
6814 | */ | 7261 | */ |
6815 | 7262 | ||
7263 | #ifdef CONFIG_SCHED_DEBUG | ||
7264 | # define SD_INIT_NAME(sd, type) sd->name = #type | ||
7265 | #else | ||
7266 | # define SD_INIT_NAME(sd, type) do { } while (0) | ||
7267 | #endif | ||
7268 | |||
6816 | #define SD_INIT(sd, type) sd_init_##type(sd) | 7269 | #define SD_INIT(sd, type) sd_init_##type(sd) |
7270 | |||
6817 | #define SD_INIT_FUNC(type) \ | 7271 | #define SD_INIT_FUNC(type) \ |
6818 | static noinline void sd_init_##type(struct sched_domain *sd) \ | 7272 | static noinline void sd_init_##type(struct sched_domain *sd) \ |
6819 | { \ | 7273 | { \ |
6820 | memset(sd, 0, sizeof(*sd)); \ | 7274 | memset(sd, 0, sizeof(*sd)); \ |
6821 | *sd = SD_##type##_INIT; \ | 7275 | *sd = SD_##type##_INIT; \ |
6822 | sd->level = SD_LV_##type; \ | 7276 | sd->level = SD_LV_##type; \ |
7277 | SD_INIT_NAME(sd, type); \ | ||
6823 | } | 7278 | } |
6824 | 7279 | ||
6825 | SD_INIT_FUNC(CPU) | 7280 | SD_INIT_FUNC(CPU) |
@@ -6921,7 +7376,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
6921 | /* | 7376 | /* |
6922 | * Allocate the per-node list of sched groups | 7377 | * Allocate the per-node list of sched groups |
6923 | */ | 7378 | */ |
6924 | sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), | 7379 | sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), |
6925 | GFP_KERNEL); | 7380 | GFP_KERNEL); |
6926 | if (!sched_group_nodes) { | 7381 | if (!sched_group_nodes) { |
6927 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 7382 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
@@ -6960,7 +7415,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
6960 | /* | 7415 | /* |
6961 | * Set up domains for cpus specified by the cpu_map. | 7416 | * Set up domains for cpus specified by the cpu_map. |
6962 | */ | 7417 | */ |
6963 | for_each_cpu_mask(i, *cpu_map) { | 7418 | for_each_cpu_mask_nr(i, *cpu_map) { |
6964 | struct sched_domain *sd = NULL, *p; | 7419 | struct sched_domain *sd = NULL, *p; |
6965 | SCHED_CPUMASK_VAR(nodemask, allmasks); | 7420 | SCHED_CPUMASK_VAR(nodemask, allmasks); |
6966 | 7421 | ||
@@ -7027,7 +7482,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7027 | 7482 | ||
7028 | #ifdef CONFIG_SCHED_SMT | 7483 | #ifdef CONFIG_SCHED_SMT |
7029 | /* Set up CPU (sibling) groups */ | 7484 | /* Set up CPU (sibling) groups */ |
7030 | for_each_cpu_mask(i, *cpu_map) { | 7485 | for_each_cpu_mask_nr(i, *cpu_map) { |
7031 | SCHED_CPUMASK_VAR(this_sibling_map, allmasks); | 7486 | SCHED_CPUMASK_VAR(this_sibling_map, allmasks); |
7032 | SCHED_CPUMASK_VAR(send_covered, allmasks); | 7487 | SCHED_CPUMASK_VAR(send_covered, allmasks); |
7033 | 7488 | ||
@@ -7044,7 +7499,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7044 | 7499 | ||
7045 | #ifdef CONFIG_SCHED_MC | 7500 | #ifdef CONFIG_SCHED_MC |
7046 | /* Set up multi-core groups */ | 7501 | /* Set up multi-core groups */ |
7047 | for_each_cpu_mask(i, *cpu_map) { | 7502 | for_each_cpu_mask_nr(i, *cpu_map) { |
7048 | SCHED_CPUMASK_VAR(this_core_map, allmasks); | 7503 | SCHED_CPUMASK_VAR(this_core_map, allmasks); |
7049 | SCHED_CPUMASK_VAR(send_covered, allmasks); | 7504 | SCHED_CPUMASK_VAR(send_covered, allmasks); |
7050 | 7505 | ||
@@ -7060,7 +7515,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7060 | #endif | 7515 | #endif |
7061 | 7516 | ||
7062 | /* Set up physical groups */ | 7517 | /* Set up physical groups */ |
7063 | for (i = 0; i < MAX_NUMNODES; i++) { | 7518 | for (i = 0; i < nr_node_ids; i++) { |
7064 | SCHED_CPUMASK_VAR(nodemask, allmasks); | 7519 | SCHED_CPUMASK_VAR(nodemask, allmasks); |
7065 | SCHED_CPUMASK_VAR(send_covered, allmasks); | 7520 | SCHED_CPUMASK_VAR(send_covered, allmasks); |
7066 | 7521 | ||
@@ -7084,7 +7539,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7084 | send_covered, tmpmask); | 7539 | send_covered, tmpmask); |
7085 | } | 7540 | } |
7086 | 7541 | ||
7087 | for (i = 0; i < MAX_NUMNODES; i++) { | 7542 | for (i = 0; i < nr_node_ids; i++) { |
7088 | /* Set up node groups */ | 7543 | /* Set up node groups */ |
7089 | struct sched_group *sg, *prev; | 7544 | struct sched_group *sg, *prev; |
7090 | SCHED_CPUMASK_VAR(nodemask, allmasks); | 7545 | SCHED_CPUMASK_VAR(nodemask, allmasks); |
@@ -7111,7 +7566,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7111 | goto error; | 7566 | goto error; |
7112 | } | 7567 | } |
7113 | sched_group_nodes[i] = sg; | 7568 | sched_group_nodes[i] = sg; |
7114 | for_each_cpu_mask(j, *nodemask) { | 7569 | for_each_cpu_mask_nr(j, *nodemask) { |
7115 | struct sched_domain *sd; | 7570 | struct sched_domain *sd; |
7116 | 7571 | ||
7117 | sd = &per_cpu(node_domains, j); | 7572 | sd = &per_cpu(node_domains, j); |
@@ -7123,9 +7578,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7123 | cpus_or(*covered, *covered, *nodemask); | 7578 | cpus_or(*covered, *covered, *nodemask); |
7124 | prev = sg; | 7579 | prev = sg; |
7125 | 7580 | ||
7126 | for (j = 0; j < MAX_NUMNODES; j++) { | 7581 | for (j = 0; j < nr_node_ids; j++) { |
7127 | SCHED_CPUMASK_VAR(notcovered, allmasks); | 7582 | SCHED_CPUMASK_VAR(notcovered, allmasks); |
7128 | int n = (i + j) % MAX_NUMNODES; | 7583 | int n = (i + j) % nr_node_ids; |
7129 | node_to_cpumask_ptr(pnodemask, n); | 7584 | node_to_cpumask_ptr(pnodemask, n); |
7130 | 7585 | ||
7131 | cpus_complement(*notcovered, *covered); | 7586 | cpus_complement(*notcovered, *covered); |
@@ -7157,28 +7612,28 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7157 | 7612 | ||
7158 | /* Calculate CPU power for physical packages and nodes */ | 7613 | /* Calculate CPU power for physical packages and nodes */ |
7159 | #ifdef CONFIG_SCHED_SMT | 7614 | #ifdef CONFIG_SCHED_SMT |
7160 | for_each_cpu_mask(i, *cpu_map) { | 7615 | for_each_cpu_mask_nr(i, *cpu_map) { |
7161 | struct sched_domain *sd = &per_cpu(cpu_domains, i); | 7616 | struct sched_domain *sd = &per_cpu(cpu_domains, i); |
7162 | 7617 | ||
7163 | init_sched_groups_power(i, sd); | 7618 | init_sched_groups_power(i, sd); |
7164 | } | 7619 | } |
7165 | #endif | 7620 | #endif |
7166 | #ifdef CONFIG_SCHED_MC | 7621 | #ifdef CONFIG_SCHED_MC |
7167 | for_each_cpu_mask(i, *cpu_map) { | 7622 | for_each_cpu_mask_nr(i, *cpu_map) { |
7168 | struct sched_domain *sd = &per_cpu(core_domains, i); | 7623 | struct sched_domain *sd = &per_cpu(core_domains, i); |
7169 | 7624 | ||
7170 | init_sched_groups_power(i, sd); | 7625 | init_sched_groups_power(i, sd); |
7171 | } | 7626 | } |
7172 | #endif | 7627 | #endif |
7173 | 7628 | ||
7174 | for_each_cpu_mask(i, *cpu_map) { | 7629 | for_each_cpu_mask_nr(i, *cpu_map) { |
7175 | struct sched_domain *sd = &per_cpu(phys_domains, i); | 7630 | struct sched_domain *sd = &per_cpu(phys_domains, i); |
7176 | 7631 | ||
7177 | init_sched_groups_power(i, sd); | 7632 | init_sched_groups_power(i, sd); |
7178 | } | 7633 | } |
7179 | 7634 | ||
7180 | #ifdef CONFIG_NUMA | 7635 | #ifdef CONFIG_NUMA |
7181 | for (i = 0; i < MAX_NUMNODES; i++) | 7636 | for (i = 0; i < nr_node_ids; i++) |
7182 | init_numa_sched_groups_power(sched_group_nodes[i]); | 7637 | init_numa_sched_groups_power(sched_group_nodes[i]); |
7183 | 7638 | ||
7184 | if (sd_allnodes) { | 7639 | if (sd_allnodes) { |
@@ -7191,7 +7646,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7191 | #endif | 7646 | #endif |
7192 | 7647 | ||
7193 | /* Attach the domains */ | 7648 | /* Attach the domains */ |
7194 | for_each_cpu_mask(i, *cpu_map) { | 7649 | for_each_cpu_mask_nr(i, *cpu_map) { |
7195 | struct sched_domain *sd; | 7650 | struct sched_domain *sd; |
7196 | #ifdef CONFIG_SCHED_SMT | 7651 | #ifdef CONFIG_SCHED_SMT |
7197 | sd = &per_cpu(cpu_domains, i); | 7652 | sd = &per_cpu(cpu_domains, i); |
@@ -7236,18 +7691,6 @@ void __attribute__((weak)) arch_update_cpu_topology(void) | |||
7236 | } | 7691 | } |
7237 | 7692 | ||
7238 | /* | 7693 | /* |
7239 | * Free current domain masks. | ||
7240 | * Called after all cpus are attached to NULL domain. | ||
7241 | */ | ||
7242 | static void free_sched_domains(void) | ||
7243 | { | ||
7244 | ndoms_cur = 0; | ||
7245 | if (doms_cur != &fallback_doms) | ||
7246 | kfree(doms_cur); | ||
7247 | doms_cur = &fallback_doms; | ||
7248 | } | ||
7249 | |||
7250 | /* | ||
7251 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 7694 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
7252 | * For now this just excludes isolated cpus, but could be used to | 7695 | * For now this just excludes isolated cpus, but could be used to |
7253 | * exclude other special cases in the future. | 7696 | * exclude other special cases in the future. |
@@ -7286,7 +7729,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
7286 | 7729 | ||
7287 | unregister_sched_domain_sysctl(); | 7730 | unregister_sched_domain_sysctl(); |
7288 | 7731 | ||
7289 | for_each_cpu_mask(i, *cpu_map) | 7732 | for_each_cpu_mask_nr(i, *cpu_map) |
7290 | cpu_attach_domain(NULL, &def_root_domain, i); | 7733 | cpu_attach_domain(NULL, &def_root_domain, i); |
7291 | synchronize_sched(); | 7734 | synchronize_sched(); |
7292 | arch_destroy_sched_domains(cpu_map, &tmpmask); | 7735 | arch_destroy_sched_domains(cpu_map, &tmpmask); |
@@ -7325,30 +7768,29 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | |||
7325 | * ownership of it and will kfree it when done with it. If the caller | 7768 | * ownership of it and will kfree it when done with it. If the caller |
7326 | * failed the kmalloc call, then it can pass in doms_new == NULL, | 7769 | * failed the kmalloc call, then it can pass in doms_new == NULL, |
7327 | * and partition_sched_domains() will fallback to the single partition | 7770 | * and partition_sched_domains() will fallback to the single partition |
7328 | * 'fallback_doms'. | 7771 | * 'fallback_doms', it also forces the domains to be rebuilt. |
7772 | * | ||
7773 | * If doms_new==NULL it will be replaced with cpu_online_map. | ||
7774 | * ndoms_new==0 is a special case for destroying existing domains. | ||
7775 | * It will not create the default domain. | ||
7329 | * | 7776 | * |
7330 | * Call with hotplug lock held | 7777 | * Call with hotplug lock held |
7331 | */ | 7778 | */ |
7332 | void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, | 7779 | void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, |
7333 | struct sched_domain_attr *dattr_new) | 7780 | struct sched_domain_attr *dattr_new) |
7334 | { | 7781 | { |
7335 | int i, j; | 7782 | int i, j, n; |
7336 | 7783 | ||
7337 | mutex_lock(&sched_domains_mutex); | 7784 | mutex_lock(&sched_domains_mutex); |
7338 | 7785 | ||
7339 | /* always unregister in case we don't destroy any domains */ | 7786 | /* always unregister in case we don't destroy any domains */ |
7340 | unregister_sched_domain_sysctl(); | 7787 | unregister_sched_domain_sysctl(); |
7341 | 7788 | ||
7342 | if (doms_new == NULL) { | 7789 | n = doms_new ? ndoms_new : 0; |
7343 | ndoms_new = 1; | ||
7344 | doms_new = &fallback_doms; | ||
7345 | cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); | ||
7346 | dattr_new = NULL; | ||
7347 | } | ||
7348 | 7790 | ||
7349 | /* Destroy deleted domains */ | 7791 | /* Destroy deleted domains */ |
7350 | for (i = 0; i < ndoms_cur; i++) { | 7792 | for (i = 0; i < ndoms_cur; i++) { |
7351 | for (j = 0; j < ndoms_new; j++) { | 7793 | for (j = 0; j < n; j++) { |
7352 | if (cpus_equal(doms_cur[i], doms_new[j]) | 7794 | if (cpus_equal(doms_cur[i], doms_new[j]) |
7353 | && dattrs_equal(dattr_cur, i, dattr_new, j)) | 7795 | && dattrs_equal(dattr_cur, i, dattr_new, j)) |
7354 | goto match1; | 7796 | goto match1; |
@@ -7359,6 +7801,13 @@ match1: | |||
7359 | ; | 7801 | ; |
7360 | } | 7802 | } |
7361 | 7803 | ||
7804 | if (doms_new == NULL) { | ||
7805 | ndoms_cur = 0; | ||
7806 | doms_new = &fallback_doms; | ||
7807 | cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); | ||
7808 | dattr_new = NULL; | ||
7809 | } | ||
7810 | |||
7362 | /* Build new domains */ | 7811 | /* Build new domains */ |
7363 | for (i = 0; i < ndoms_new; i++) { | 7812 | for (i = 0; i < ndoms_new; i++) { |
7364 | for (j = 0; j < ndoms_cur; j++) { | 7813 | for (j = 0; j < ndoms_cur; j++) { |
@@ -7389,17 +7838,15 @@ match2: | |||
7389 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 7838 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
7390 | int arch_reinit_sched_domains(void) | 7839 | int arch_reinit_sched_domains(void) |
7391 | { | 7840 | { |
7392 | int err; | ||
7393 | |||
7394 | get_online_cpus(); | 7841 | get_online_cpus(); |
7395 | mutex_lock(&sched_domains_mutex); | 7842 | |
7396 | detach_destroy_domains(&cpu_online_map); | 7843 | /* Destroy domains first to force the rebuild */ |
7397 | free_sched_domains(); | 7844 | partition_sched_domains(0, NULL, NULL); |
7398 | err = arch_init_sched_domains(&cpu_online_map); | 7845 | |
7399 | mutex_unlock(&sched_domains_mutex); | 7846 | rebuild_sched_domains(); |
7400 | put_online_cpus(); | 7847 | put_online_cpus(); |
7401 | 7848 | ||
7402 | return err; | 7849 | return 0; |
7403 | } | 7850 | } |
7404 | 7851 | ||
7405 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | 7852 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) |
@@ -7420,30 +7867,34 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | |||
7420 | } | 7867 | } |
7421 | 7868 | ||
7422 | #ifdef CONFIG_SCHED_MC | 7869 | #ifdef CONFIG_SCHED_MC |
7423 | static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) | 7870 | static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, |
7871 | char *page) | ||
7424 | { | 7872 | { |
7425 | return sprintf(page, "%u\n", sched_mc_power_savings); | 7873 | return sprintf(page, "%u\n", sched_mc_power_savings); |
7426 | } | 7874 | } |
7427 | static ssize_t sched_mc_power_savings_store(struct sys_device *dev, | 7875 | static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, |
7428 | const char *buf, size_t count) | 7876 | const char *buf, size_t count) |
7429 | { | 7877 | { |
7430 | return sched_power_savings_store(buf, count, 0); | 7878 | return sched_power_savings_store(buf, count, 0); |
7431 | } | 7879 | } |
7432 | static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, | 7880 | static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, |
7433 | sched_mc_power_savings_store); | 7881 | sched_mc_power_savings_show, |
7882 | sched_mc_power_savings_store); | ||
7434 | #endif | 7883 | #endif |
7435 | 7884 | ||
7436 | #ifdef CONFIG_SCHED_SMT | 7885 | #ifdef CONFIG_SCHED_SMT |
7437 | static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) | 7886 | static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, |
7887 | char *page) | ||
7438 | { | 7888 | { |
7439 | return sprintf(page, "%u\n", sched_smt_power_savings); | 7889 | return sprintf(page, "%u\n", sched_smt_power_savings); |
7440 | } | 7890 | } |
7441 | static ssize_t sched_smt_power_savings_store(struct sys_device *dev, | 7891 | static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, |
7442 | const char *buf, size_t count) | 7892 | const char *buf, size_t count) |
7443 | { | 7893 | { |
7444 | return sched_power_savings_store(buf, count, 1); | 7894 | return sched_power_savings_store(buf, count, 1); |
7445 | } | 7895 | } |
7446 | static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, | 7896 | static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, |
7897 | sched_smt_power_savings_show, | ||
7447 | sched_smt_power_savings_store); | 7898 | sched_smt_power_savings_store); |
7448 | #endif | 7899 | #endif |
7449 | 7900 | ||
@@ -7463,54 +7914,51 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | |||
7463 | #endif | 7914 | #endif |
7464 | return err; | 7915 | return err; |
7465 | } | 7916 | } |
7466 | #endif | 7917 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ |
7467 | 7918 | ||
7919 | #ifndef CONFIG_CPUSETS | ||
7468 | /* | 7920 | /* |
7469 | * Force a reinitialization of the sched domains hierarchy. The domains | 7921 | * Add online and remove offline CPUs from the scheduler domains. |
7470 | * and groups cannot be updated in place without racing with the balancing | 7922 | * When cpusets are enabled they take over this function. |
7471 | * code, so we temporarily attach all running cpus to the NULL domain | ||
7472 | * which will prevent rebalancing while the sched domains are recalculated. | ||
7473 | */ | 7923 | */ |
7474 | static int update_sched_domains(struct notifier_block *nfb, | 7924 | static int update_sched_domains(struct notifier_block *nfb, |
7475 | unsigned long action, void *hcpu) | 7925 | unsigned long action, void *hcpu) |
7476 | { | 7926 | { |
7477 | switch (action) { | 7927 | switch (action) { |
7478 | case CPU_UP_PREPARE: | 7928 | case CPU_ONLINE: |
7479 | case CPU_UP_PREPARE_FROZEN: | 7929 | case CPU_ONLINE_FROZEN: |
7930 | case CPU_DEAD: | ||
7931 | case CPU_DEAD_FROZEN: | ||
7932 | partition_sched_domains(1, NULL, NULL); | ||
7933 | return NOTIFY_OK; | ||
7934 | |||
7935 | default: | ||
7936 | return NOTIFY_DONE; | ||
7937 | } | ||
7938 | } | ||
7939 | #endif | ||
7940 | |||
7941 | static int update_runtime(struct notifier_block *nfb, | ||
7942 | unsigned long action, void *hcpu) | ||
7943 | { | ||
7944 | int cpu = (int)(long)hcpu; | ||
7945 | |||
7946 | switch (action) { | ||
7480 | case CPU_DOWN_PREPARE: | 7947 | case CPU_DOWN_PREPARE: |
7481 | case CPU_DOWN_PREPARE_FROZEN: | 7948 | case CPU_DOWN_PREPARE_FROZEN: |
7482 | detach_destroy_domains(&cpu_online_map); | 7949 | disable_runtime(cpu_rq(cpu)); |
7483 | free_sched_domains(); | ||
7484 | return NOTIFY_OK; | 7950 | return NOTIFY_OK; |
7485 | 7951 | ||
7486 | case CPU_UP_CANCELED: | ||
7487 | case CPU_UP_CANCELED_FROZEN: | ||
7488 | case CPU_DOWN_FAILED: | 7952 | case CPU_DOWN_FAILED: |
7489 | case CPU_DOWN_FAILED_FROZEN: | 7953 | case CPU_DOWN_FAILED_FROZEN: |
7490 | case CPU_ONLINE: | 7954 | case CPU_ONLINE: |
7491 | case CPU_ONLINE_FROZEN: | 7955 | case CPU_ONLINE_FROZEN: |
7492 | case CPU_DEAD: | 7956 | enable_runtime(cpu_rq(cpu)); |
7493 | case CPU_DEAD_FROZEN: | 7957 | return NOTIFY_OK; |
7494 | /* | 7958 | |
7495 | * Fall through and re-initialise the domains. | ||
7496 | */ | ||
7497 | break; | ||
7498 | default: | 7959 | default: |
7499 | return NOTIFY_DONE; | 7960 | return NOTIFY_DONE; |
7500 | } | 7961 | } |
7501 | |||
7502 | #ifndef CONFIG_CPUSETS | ||
7503 | /* | ||
7504 | * Create default domain partitioning if cpusets are disabled. | ||
7505 | * Otherwise we let cpusets rebuild the domains based on the | ||
7506 | * current setup. | ||
7507 | */ | ||
7508 | |||
7509 | /* The hotplug lock is already held by cpu_up/cpu_down */ | ||
7510 | arch_init_sched_domains(&cpu_online_map); | ||
7511 | #endif | ||
7512 | |||
7513 | return NOTIFY_OK; | ||
7514 | } | 7962 | } |
7515 | 7963 | ||
7516 | void __init sched_init_smp(void) | 7964 | void __init sched_init_smp(void) |
@@ -7530,8 +7978,15 @@ void __init sched_init_smp(void) | |||
7530 | cpu_set(smp_processor_id(), non_isolated_cpus); | 7978 | cpu_set(smp_processor_id(), non_isolated_cpus); |
7531 | mutex_unlock(&sched_domains_mutex); | 7979 | mutex_unlock(&sched_domains_mutex); |
7532 | put_online_cpus(); | 7980 | put_online_cpus(); |
7981 | |||
7982 | #ifndef CONFIG_CPUSETS | ||
7533 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | 7983 | /* XXX: Theoretical race here - CPU may be hotplugged now */ |
7534 | hotcpu_notifier(update_sched_domains, 0); | 7984 | hotcpu_notifier(update_sched_domains, 0); |
7985 | #endif | ||
7986 | |||
7987 | /* RT runtime code needs to handle some hotplug events */ | ||
7988 | hotcpu_notifier(update_runtime, 0); | ||
7989 | |||
7535 | init_hrtick(); | 7990 | init_hrtick(); |
7536 | 7991 | ||
7537 | /* Move init over to a non-isolated CPU */ | 7992 | /* Move init over to a non-isolated CPU */ |
@@ -7688,8 +8143,8 @@ void __init sched_init(void) | |||
7688 | 8143 | ||
7689 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; | 8144 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; |
7690 | ptr += nr_cpu_ids * sizeof(void **); | 8145 | ptr += nr_cpu_ids * sizeof(void **); |
7691 | #endif | 8146 | #endif /* CONFIG_USER_SCHED */ |
7692 | #endif | 8147 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7693 | #ifdef CONFIG_RT_GROUP_SCHED | 8148 | #ifdef CONFIG_RT_GROUP_SCHED |
7694 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; | 8149 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; |
7695 | ptr += nr_cpu_ids * sizeof(void **); | 8150 | ptr += nr_cpu_ids * sizeof(void **); |
@@ -7703,8 +8158,8 @@ void __init sched_init(void) | |||
7703 | 8158 | ||
7704 | root_task_group.rt_rq = (struct rt_rq **)ptr; | 8159 | root_task_group.rt_rq = (struct rt_rq **)ptr; |
7705 | ptr += nr_cpu_ids * sizeof(void **); | 8160 | ptr += nr_cpu_ids * sizeof(void **); |
7706 | #endif | 8161 | #endif /* CONFIG_USER_SCHED */ |
7707 | #endif | 8162 | #endif /* CONFIG_RT_GROUP_SCHED */ |
7708 | } | 8163 | } |
7709 | 8164 | ||
7710 | #ifdef CONFIG_SMP | 8165 | #ifdef CONFIG_SMP |
@@ -7720,8 +8175,8 @@ void __init sched_init(void) | |||
7720 | #ifdef CONFIG_USER_SCHED | 8175 | #ifdef CONFIG_USER_SCHED |
7721 | init_rt_bandwidth(&root_task_group.rt_bandwidth, | 8176 | init_rt_bandwidth(&root_task_group.rt_bandwidth, |
7722 | global_rt_period(), RUNTIME_INF); | 8177 | global_rt_period(), RUNTIME_INF); |
7723 | #endif | 8178 | #endif /* CONFIG_USER_SCHED */ |
7724 | #endif | 8179 | #endif /* CONFIG_RT_GROUP_SCHED */ |
7725 | 8180 | ||
7726 | #ifdef CONFIG_GROUP_SCHED | 8181 | #ifdef CONFIG_GROUP_SCHED |
7727 | list_add(&init_task_group.list, &task_groups); | 8182 | list_add(&init_task_group.list, &task_groups); |
@@ -7731,15 +8186,14 @@ void __init sched_init(void) | |||
7731 | INIT_LIST_HEAD(&root_task_group.children); | 8186 | INIT_LIST_HEAD(&root_task_group.children); |
7732 | init_task_group.parent = &root_task_group; | 8187 | init_task_group.parent = &root_task_group; |
7733 | list_add(&init_task_group.siblings, &root_task_group.children); | 8188 | list_add(&init_task_group.siblings, &root_task_group.children); |
7734 | #endif | 8189 | #endif /* CONFIG_USER_SCHED */ |
7735 | #endif | 8190 | #endif /* CONFIG_GROUP_SCHED */ |
7736 | 8191 | ||
7737 | for_each_possible_cpu(i) { | 8192 | for_each_possible_cpu(i) { |
7738 | struct rq *rq; | 8193 | struct rq *rq; |
7739 | 8194 | ||
7740 | rq = cpu_rq(i); | 8195 | rq = cpu_rq(i); |
7741 | spin_lock_init(&rq->lock); | 8196 | spin_lock_init(&rq->lock); |
7742 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); | ||
7743 | rq->nr_running = 0; | 8197 | rq->nr_running = 0; |
7744 | init_cfs_rq(&rq->cfs, rq); | 8198 | init_cfs_rq(&rq->cfs, rq); |
7745 | init_rt_rq(&rq->rt, rq); | 8199 | init_rt_rq(&rq->rt, rq); |
@@ -7812,6 +8266,7 @@ void __init sched_init(void) | |||
7812 | rq->next_balance = jiffies; | 8266 | rq->next_balance = jiffies; |
7813 | rq->push_cpu = 0; | 8267 | rq->push_cpu = 0; |
7814 | rq->cpu = i; | 8268 | rq->cpu = i; |
8269 | rq->online = 0; | ||
7815 | rq->migration_thread = NULL; | 8270 | rq->migration_thread = NULL; |
7816 | INIT_LIST_HEAD(&rq->migration_queue); | 8271 | INIT_LIST_HEAD(&rq->migration_queue); |
7817 | rq_attach_root(rq, &def_root_domain); | 8272 | rq_attach_root(rq, &def_root_domain); |
@@ -7827,7 +8282,7 @@ void __init sched_init(void) | |||
7827 | #endif | 8282 | #endif |
7828 | 8283 | ||
7829 | #ifdef CONFIG_SMP | 8284 | #ifdef CONFIG_SMP |
7830 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); | 8285 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); |
7831 | #endif | 8286 | #endif |
7832 | 8287 | ||
7833 | #ifdef CONFIG_RT_MUTEXES | 8288 | #ifdef CONFIG_RT_MUTEXES |
@@ -7861,20 +8316,25 @@ void __might_sleep(char *file, int line) | |||
7861 | #ifdef in_atomic | 8316 | #ifdef in_atomic |
7862 | static unsigned long prev_jiffy; /* ratelimiting */ | 8317 | static unsigned long prev_jiffy; /* ratelimiting */ |
7863 | 8318 | ||
7864 | if ((in_atomic() || irqs_disabled()) && | 8319 | if ((!in_atomic() && !irqs_disabled()) || |
7865 | system_state == SYSTEM_RUNNING && !oops_in_progress) { | 8320 | system_state != SYSTEM_RUNNING || oops_in_progress) |
7866 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) | 8321 | return; |
7867 | return; | 8322 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) |
7868 | prev_jiffy = jiffies; | 8323 | return; |
7869 | printk(KERN_ERR "BUG: sleeping function called from invalid" | 8324 | prev_jiffy = jiffies; |
7870 | " context at %s:%d\n", file, line); | 8325 | |
7871 | printk("in_atomic():%d, irqs_disabled():%d\n", | 8326 | printk(KERN_ERR |
7872 | in_atomic(), irqs_disabled()); | 8327 | "BUG: sleeping function called from invalid context at %s:%d\n", |
7873 | debug_show_held_locks(current); | 8328 | file, line); |
7874 | if (irqs_disabled()) | 8329 | printk(KERN_ERR |
7875 | print_irqtrace_events(current); | 8330 | "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", |
7876 | dump_stack(); | 8331 | in_atomic(), irqs_disabled(), |
7877 | } | 8332 | current->pid, current->comm); |
8333 | |||
8334 | debug_show_held_locks(current); | ||
8335 | if (irqs_disabled()) | ||
8336 | print_irqtrace_events(current); | ||
8337 | dump_stack(); | ||
7878 | #endif | 8338 | #endif |
7879 | } | 8339 | } |
7880 | EXPORT_SYMBOL(__might_sleep); | 8340 | EXPORT_SYMBOL(__might_sleep); |
@@ -8051,7 +8511,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | |||
8051 | { | 8511 | { |
8052 | list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); | 8512 | list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); |
8053 | } | 8513 | } |
8054 | #else | 8514 | #else /* !CONFIG_FAIR_GROUP_SCHED */ |
8055 | static inline void free_fair_sched_group(struct task_group *tg) | 8515 | static inline void free_fair_sched_group(struct task_group *tg) |
8056 | { | 8516 | { |
8057 | } | 8517 | } |
@@ -8069,7 +8529,7 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu) | |||
8069 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | 8529 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) |
8070 | { | 8530 | { |
8071 | } | 8531 | } |
8072 | #endif | 8532 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8073 | 8533 | ||
8074 | #ifdef CONFIG_RT_GROUP_SCHED | 8534 | #ifdef CONFIG_RT_GROUP_SCHED |
8075 | static void free_rt_sched_group(struct task_group *tg) | 8535 | static void free_rt_sched_group(struct task_group *tg) |
@@ -8140,7 +8600,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | |||
8140 | { | 8600 | { |
8141 | list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); | 8601 | list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); |
8142 | } | 8602 | } |
8143 | #else | 8603 | #else /* !CONFIG_RT_GROUP_SCHED */ |
8144 | static inline void free_rt_sched_group(struct task_group *tg) | 8604 | static inline void free_rt_sched_group(struct task_group *tg) |
8145 | { | 8605 | { |
8146 | } | 8606 | } |
@@ -8158,7 +8618,7 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu) | |||
8158 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | 8618 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) |
8159 | { | 8619 | { |
8160 | } | 8620 | } |
8161 | #endif | 8621 | #endif /* CONFIG_RT_GROUP_SCHED */ |
8162 | 8622 | ||
8163 | #ifdef CONFIG_GROUP_SCHED | 8623 | #ifdef CONFIG_GROUP_SCHED |
8164 | static void free_sched_group(struct task_group *tg) | 8624 | static void free_sched_group(struct task_group *tg) |
@@ -8195,8 +8655,8 @@ struct task_group *sched_create_group(struct task_group *parent) | |||
8195 | WARN_ON(!parent); /* root should already exist */ | 8655 | WARN_ON(!parent); /* root should already exist */ |
8196 | 8656 | ||
8197 | tg->parent = parent; | 8657 | tg->parent = parent; |
8198 | list_add_rcu(&tg->siblings, &parent->children); | ||
8199 | INIT_LIST_HEAD(&tg->children); | 8658 | INIT_LIST_HEAD(&tg->children); |
8659 | list_add_rcu(&tg->siblings, &parent->children); | ||
8200 | spin_unlock_irqrestore(&task_group_lock, flags); | 8660 | spin_unlock_irqrestore(&task_group_lock, flags); |
8201 | 8661 | ||
8202 | return tg; | 8662 | return tg; |
@@ -8269,17 +8729,14 @@ void sched_move_task(struct task_struct *tsk) | |||
8269 | 8729 | ||
8270 | task_rq_unlock(rq, &flags); | 8730 | task_rq_unlock(rq, &flags); |
8271 | } | 8731 | } |
8272 | #endif | 8732 | #endif /* CONFIG_GROUP_SCHED */ |
8273 | 8733 | ||
8274 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8734 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8275 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | 8735 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) |
8276 | { | 8736 | { |
8277 | struct cfs_rq *cfs_rq = se->cfs_rq; | 8737 | struct cfs_rq *cfs_rq = se->cfs_rq; |
8278 | struct rq *rq = cfs_rq->rq; | ||
8279 | int on_rq; | 8738 | int on_rq; |
8280 | 8739 | ||
8281 | spin_lock_irq(&rq->lock); | ||
8282 | |||
8283 | on_rq = se->on_rq; | 8740 | on_rq = se->on_rq; |
8284 | if (on_rq) | 8741 | if (on_rq) |
8285 | dequeue_entity(cfs_rq, se, 0); | 8742 | dequeue_entity(cfs_rq, se, 0); |
@@ -8289,8 +8746,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) | |||
8289 | 8746 | ||
8290 | if (on_rq) | 8747 | if (on_rq) |
8291 | enqueue_entity(cfs_rq, se, 0); | 8748 | enqueue_entity(cfs_rq, se, 0); |
8749 | } | ||
8292 | 8750 | ||
8293 | spin_unlock_irq(&rq->lock); | 8751 | static void set_se_shares(struct sched_entity *se, unsigned long shares) |
8752 | { | ||
8753 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
8754 | struct rq *rq = cfs_rq->rq; | ||
8755 | unsigned long flags; | ||
8756 | |||
8757 | spin_lock_irqsave(&rq->lock, flags); | ||
8758 | __set_se_shares(se, shares); | ||
8759 | spin_unlock_irqrestore(&rq->lock, flags); | ||
8294 | } | 8760 | } |
8295 | 8761 | ||
8296 | static DEFINE_MUTEX(shares_mutex); | 8762 | static DEFINE_MUTEX(shares_mutex); |
@@ -8329,8 +8795,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
8329 | * w/o tripping rebalance_share or load_balance_fair. | 8795 | * w/o tripping rebalance_share or load_balance_fair. |
8330 | */ | 8796 | */ |
8331 | tg->shares = shares; | 8797 | tg->shares = shares; |
8332 | for_each_possible_cpu(i) | 8798 | for_each_possible_cpu(i) { |
8799 | /* | ||
8800 | * force a rebalance | ||
8801 | */ | ||
8802 | cfs_rq_set_shares(tg->cfs_rq[i], 0); | ||
8333 | set_se_shares(tg->se[i], shares); | 8803 | set_se_shares(tg->se[i], shares); |
8804 | } | ||
8334 | 8805 | ||
8335 | /* | 8806 | /* |
8336 | * Enable load balance activity on this group, by inserting it back on | 8807 | * Enable load balance activity on this group, by inserting it back on |
@@ -8361,73 +8832,95 @@ static DEFINE_MUTEX(rt_constraints_mutex); | |||
8361 | static unsigned long to_ratio(u64 period, u64 runtime) | 8832 | static unsigned long to_ratio(u64 period, u64 runtime) |
8362 | { | 8833 | { |
8363 | if (runtime == RUNTIME_INF) | 8834 | if (runtime == RUNTIME_INF) |
8364 | return 1ULL << 16; | 8835 | return 1ULL << 20; |
8365 | 8836 | ||
8366 | return div64_u64(runtime << 16, period); | 8837 | return div64_u64(runtime << 20, period); |
8367 | } | 8838 | } |
8368 | 8839 | ||
8369 | #ifdef CONFIG_CGROUP_SCHED | 8840 | /* Must be called with tasklist_lock held */ |
8370 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | 8841 | static inline int tg_has_rt_tasks(struct task_group *tg) |
8371 | { | 8842 | { |
8372 | struct task_group *tgi, *parent = tg ? tg->parent : NULL; | 8843 | struct task_struct *g, *p; |
8373 | unsigned long total = 0; | ||
8374 | 8844 | ||
8375 | if (!parent) { | 8845 | do_each_thread(g, p) { |
8376 | if (global_rt_period() < period) | 8846 | if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) |
8377 | return 0; | 8847 | return 1; |
8848 | } while_each_thread(g, p); | ||
8378 | 8849 | ||
8379 | return to_ratio(period, runtime) < | 8850 | return 0; |
8380 | to_ratio(global_rt_period(), global_rt_runtime()); | 8851 | } |
8381 | } | ||
8382 | 8852 | ||
8383 | if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) | 8853 | struct rt_schedulable_data { |
8384 | return 0; | 8854 | struct task_group *tg; |
8855 | u64 rt_period; | ||
8856 | u64 rt_runtime; | ||
8857 | }; | ||
8385 | 8858 | ||
8386 | rcu_read_lock(); | 8859 | static int tg_schedulable(struct task_group *tg, void *data) |
8387 | list_for_each_entry_rcu(tgi, &parent->children, siblings) { | 8860 | { |
8388 | if (tgi == tg) | 8861 | struct rt_schedulable_data *d = data; |
8389 | continue; | 8862 | struct task_group *child; |
8863 | unsigned long total, sum = 0; | ||
8864 | u64 period, runtime; | ||
8865 | |||
8866 | period = ktime_to_ns(tg->rt_bandwidth.rt_period); | ||
8867 | runtime = tg->rt_bandwidth.rt_runtime; | ||
8390 | 8868 | ||
8391 | total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), | 8869 | if (tg == d->tg) { |
8392 | tgi->rt_bandwidth.rt_runtime); | 8870 | period = d->rt_period; |
8871 | runtime = d->rt_runtime; | ||
8393 | } | 8872 | } |
8394 | rcu_read_unlock(); | ||
8395 | 8873 | ||
8396 | return total + to_ratio(period, runtime) < | 8874 | /* |
8397 | to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), | 8875 | * Cannot have more runtime than the period. |
8398 | parent->rt_bandwidth.rt_runtime); | 8876 | */ |
8399 | } | 8877 | if (runtime > period && runtime != RUNTIME_INF) |
8400 | #elif defined CONFIG_USER_SCHED | 8878 | return -EINVAL; |
8401 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | ||
8402 | { | ||
8403 | struct task_group *tgi; | ||
8404 | unsigned long total = 0; | ||
8405 | unsigned long global_ratio = | ||
8406 | to_ratio(global_rt_period(), global_rt_runtime()); | ||
8407 | 8879 | ||
8408 | rcu_read_lock(); | 8880 | /* |
8409 | list_for_each_entry_rcu(tgi, &task_groups, list) { | 8881 | * Ensure we don't starve existing RT tasks. |
8410 | if (tgi == tg) | 8882 | */ |
8411 | continue; | 8883 | if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) |
8884 | return -EBUSY; | ||
8885 | |||
8886 | total = to_ratio(period, runtime); | ||
8412 | 8887 | ||
8413 | total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), | 8888 | /* |
8414 | tgi->rt_bandwidth.rt_runtime); | 8889 | * Nobody can have more than the global setting allows. |
8890 | */ | ||
8891 | if (total > to_ratio(global_rt_period(), global_rt_runtime())) | ||
8892 | return -EINVAL; | ||
8893 | |||
8894 | /* | ||
8895 | * The sum of our children's runtime should not exceed our own. | ||
8896 | */ | ||
8897 | list_for_each_entry_rcu(child, &tg->children, siblings) { | ||
8898 | period = ktime_to_ns(child->rt_bandwidth.rt_period); | ||
8899 | runtime = child->rt_bandwidth.rt_runtime; | ||
8900 | |||
8901 | if (child == d->tg) { | ||
8902 | period = d->rt_period; | ||
8903 | runtime = d->rt_runtime; | ||
8904 | } | ||
8905 | |||
8906 | sum += to_ratio(period, runtime); | ||
8415 | } | 8907 | } |
8416 | rcu_read_unlock(); | ||
8417 | 8908 | ||
8418 | return total + to_ratio(period, runtime) < global_ratio; | 8909 | if (sum > total) |
8910 | return -EINVAL; | ||
8911 | |||
8912 | return 0; | ||
8419 | } | 8913 | } |
8420 | #endif | ||
8421 | 8914 | ||
8422 | /* Must be called with tasklist_lock held */ | 8915 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) |
8423 | static inline int tg_has_rt_tasks(struct task_group *tg) | ||
8424 | { | 8916 | { |
8425 | struct task_struct *g, *p; | 8917 | struct rt_schedulable_data data = { |
8426 | do_each_thread(g, p) { | 8918 | .tg = tg, |
8427 | if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) | 8919 | .rt_period = period, |
8428 | return 1; | 8920 | .rt_runtime = runtime, |
8429 | } while_each_thread(g, p); | 8921 | }; |
8430 | return 0; | 8922 | |
8923 | return walk_tg_tree(tg_schedulable, tg_nop, &data); | ||
8431 | } | 8924 | } |
8432 | 8925 | ||
8433 | static int tg_set_bandwidth(struct task_group *tg, | 8926 | static int tg_set_bandwidth(struct task_group *tg, |
@@ -8437,14 +8930,9 @@ static int tg_set_bandwidth(struct task_group *tg, | |||
8437 | 8930 | ||
8438 | mutex_lock(&rt_constraints_mutex); | 8931 | mutex_lock(&rt_constraints_mutex); |
8439 | read_lock(&tasklist_lock); | 8932 | read_lock(&tasklist_lock); |
8440 | if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { | 8933 | err = __rt_schedulable(tg, rt_period, rt_runtime); |
8441 | err = -EBUSY; | 8934 | if (err) |
8442 | goto unlock; | 8935 | goto unlock; |
8443 | } | ||
8444 | if (!__rt_schedulable(tg, rt_period, rt_runtime)) { | ||
8445 | err = -EINVAL; | ||
8446 | goto unlock; | ||
8447 | } | ||
8448 | 8936 | ||
8449 | spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); | 8937 | spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); |
8450 | tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); | 8938 | tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); |
@@ -8496,6 +8984,9 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) | |||
8496 | rt_period = (u64)rt_period_us * NSEC_PER_USEC; | 8984 | rt_period = (u64)rt_period_us * NSEC_PER_USEC; |
8497 | rt_runtime = tg->rt_bandwidth.rt_runtime; | 8985 | rt_runtime = tg->rt_bandwidth.rt_runtime; |
8498 | 8986 | ||
8987 | if (rt_period == 0) | ||
8988 | return -EINVAL; | ||
8989 | |||
8499 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | 8990 | return tg_set_bandwidth(tg, rt_period, rt_runtime); |
8500 | } | 8991 | } |
8501 | 8992 | ||
@@ -8510,21 +9001,38 @@ long sched_group_rt_period(struct task_group *tg) | |||
8510 | 9001 | ||
8511 | static int sched_rt_global_constraints(void) | 9002 | static int sched_rt_global_constraints(void) |
8512 | { | 9003 | { |
9004 | u64 runtime, period; | ||
8513 | int ret = 0; | 9005 | int ret = 0; |
8514 | 9006 | ||
9007 | if (sysctl_sched_rt_period <= 0) | ||
9008 | return -EINVAL; | ||
9009 | |||
9010 | runtime = global_rt_runtime(); | ||
9011 | period = global_rt_period(); | ||
9012 | |||
9013 | /* | ||
9014 | * Sanity check on the sysctl variables. | ||
9015 | */ | ||
9016 | if (runtime > period && runtime != RUNTIME_INF) | ||
9017 | return -EINVAL; | ||
9018 | |||
8515 | mutex_lock(&rt_constraints_mutex); | 9019 | mutex_lock(&rt_constraints_mutex); |
8516 | if (!__rt_schedulable(NULL, 1, 0)) | 9020 | read_lock(&tasklist_lock); |
8517 | ret = -EINVAL; | 9021 | ret = __rt_schedulable(NULL, 0, 0); |
9022 | read_unlock(&tasklist_lock); | ||
8518 | mutex_unlock(&rt_constraints_mutex); | 9023 | mutex_unlock(&rt_constraints_mutex); |
8519 | 9024 | ||
8520 | return ret; | 9025 | return ret; |
8521 | } | 9026 | } |
8522 | #else | 9027 | #else /* !CONFIG_RT_GROUP_SCHED */ |
8523 | static int sched_rt_global_constraints(void) | 9028 | static int sched_rt_global_constraints(void) |
8524 | { | 9029 | { |
8525 | unsigned long flags; | 9030 | unsigned long flags; |
8526 | int i; | 9031 | int i; |
8527 | 9032 | ||
9033 | if (sysctl_sched_rt_period <= 0) | ||
9034 | return -EINVAL; | ||
9035 | |||
8528 | spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); | 9036 | spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); |
8529 | for_each_possible_cpu(i) { | 9037 | for_each_possible_cpu(i) { |
8530 | struct rt_rq *rt_rq = &cpu_rq(i)->rt; | 9038 | struct rt_rq *rt_rq = &cpu_rq(i)->rt; |
@@ -8537,7 +9045,7 @@ static int sched_rt_global_constraints(void) | |||
8537 | 9045 | ||
8538 | return 0; | 9046 | return 0; |
8539 | } | 9047 | } |
8540 | #endif | 9048 | #endif /* CONFIG_RT_GROUP_SCHED */ |
8541 | 9049 | ||
8542 | int sched_rt_handler(struct ctl_table *table, int write, | 9050 | int sched_rt_handler(struct ctl_table *table, int write, |
8543 | struct file *filp, void __user *buffer, size_t *lenp, | 9051 | struct file *filp, void __user *buffer, size_t *lenp, |
@@ -8585,7 +9093,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
8585 | 9093 | ||
8586 | if (!cgrp->parent) { | 9094 | if (!cgrp->parent) { |
8587 | /* This is early initialization for the top cgroup */ | 9095 | /* This is early initialization for the top cgroup */ |
8588 | init_task_group.css.cgroup = cgrp; | ||
8589 | return &init_task_group.css; | 9096 | return &init_task_group.css; |
8590 | } | 9097 | } |
8591 | 9098 | ||
@@ -8594,9 +9101,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
8594 | if (IS_ERR(tg)) | 9101 | if (IS_ERR(tg)) |
8595 | return ERR_PTR(-ENOMEM); | 9102 | return ERR_PTR(-ENOMEM); |
8596 | 9103 | ||
8597 | /* Bind the cgroup to task_group object we just created */ | ||
8598 | tg->css.cgroup = cgrp; | ||
8599 | |||
8600 | return &tg->css; | 9104 | return &tg->css; |
8601 | } | 9105 | } |
8602 | 9106 | ||
@@ -8645,7 +9149,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) | |||
8645 | 9149 | ||
8646 | return (u64) tg->shares; | 9150 | return (u64) tg->shares; |
8647 | } | 9151 | } |
8648 | #endif | 9152 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8649 | 9153 | ||
8650 | #ifdef CONFIG_RT_GROUP_SCHED | 9154 | #ifdef CONFIG_RT_GROUP_SCHED |
8651 | static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, | 9155 | static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, |
@@ -8669,7 +9173,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) | |||
8669 | { | 9173 | { |
8670 | return sched_group_rt_period(cgroup_tg(cgrp)); | 9174 | return sched_group_rt_period(cgroup_tg(cgrp)); |
8671 | } | 9175 | } |
8672 | #endif | 9176 | #endif /* CONFIG_RT_GROUP_SCHED */ |
8673 | 9177 | ||
8674 | static struct cftype cpu_files[] = { | 9178 | static struct cftype cpu_files[] = { |
8675 | #ifdef CONFIG_FAIR_GROUP_SCHED | 9179 | #ifdef CONFIG_FAIR_GROUP_SCHED |