path: root/kernel/sched.c
author    Ingo Molnar <mingo@elte.hu>    2008-10-15 07:46:29 -0400
committer Ingo Molnar <mingo@elte.hu>    2008-10-15 07:46:29 -0400
commit    b2aaf8f74cdc84a9182f6cabf198b7763bcb9d40 (patch)
tree      53ccb1c2c14751fe69cf93102e76e97021f6df07 /kernel/sched.c
parent    4f962d4d65923d7b722192e729840cfb79af0a5a (diff)
parent    278429cff8809958d25415ba0ed32b59866ab1a8 (diff)

Merge branch 'linus' into stackprotector

Conflicts:
	arch/x86/kernel/Makefile
	include/asm-x86/pda.h

Diffstat (limited to 'kernel/sched.c')
 -rw-r--r--  kernel/sched.c  1684
 1 file changed, 1094 insertions(+), 590 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 0cdb50260dbf..d897a524e7d8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -70,10 +70,13 @@
70#include <linux/bootmem.h> 70#include <linux/bootmem.h>
71#include <linux/debugfs.h> 71#include <linux/debugfs.h>
72#include <linux/ctype.h> 72#include <linux/ctype.h>
73#include <linux/ftrace.h>
73 74
74#include <asm/tlb.h> 75#include <asm/tlb.h>
75#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
76 77
78#include "sched_cpupri.h"
79
77/* 80/*
78 * Convert user-nice values [ -20 ... 0 ... 19 ] 81 * Convert user-nice values [ -20 ... 0 ... 19 ]
79 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 82 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -198,14 +201,19 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
198 hrtimer_init(&rt_b->rt_period_timer, 201 hrtimer_init(&rt_b->rt_period_timer,
199 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 202 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
200 rt_b->rt_period_timer.function = sched_rt_period_timer; 203 rt_b->rt_period_timer.function = sched_rt_period_timer;
201 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 204 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
205}
206
207static inline int rt_bandwidth_enabled(void)
208{
209 return sysctl_sched_rt_runtime >= 0;
202} 210}
203 211
204static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 212static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
205{ 213{
206 ktime_t now; 214 ktime_t now;
207 215
208 if (rt_b->rt_runtime == RUNTIME_INF) 216 if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
209 return; 217 return;
210 218
211 if (hrtimer_active(&rt_b->rt_period_timer)) 219 if (hrtimer_active(&rt_b->rt_period_timer))
@@ -289,15 +297,15 @@ struct task_group root_task_group;
289static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 297static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
290/* Default task group's cfs_rq on each cpu */ 298/* Default task group's cfs_rq on each cpu */
291static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 299static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
292#endif 300#endif /* CONFIG_FAIR_GROUP_SCHED */
293 301
294#ifdef CONFIG_RT_GROUP_SCHED 302#ifdef CONFIG_RT_GROUP_SCHED
295static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 303static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
296static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 304static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
297#endif 305#endif /* CONFIG_RT_GROUP_SCHED */
298#else 306#else /* !CONFIG_USER_SCHED */
299#define root_task_group init_task_group 307#define root_task_group init_task_group
300#endif 308#endif /* CONFIG_USER_SCHED */
301 309
302/* task_group_lock serializes add/remove of task groups and also changes to 310/* task_group_lock serializes add/remove of task groups and also changes to
303 * a task group's cpu shares. 311 * a task group's cpu shares.
@@ -307,9 +315,9 @@ static DEFINE_SPINLOCK(task_group_lock);
307#ifdef CONFIG_FAIR_GROUP_SCHED 315#ifdef CONFIG_FAIR_GROUP_SCHED
308#ifdef CONFIG_USER_SCHED 316#ifdef CONFIG_USER_SCHED
309# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 317# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
310#else 318#else /* !CONFIG_USER_SCHED */
311# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 319# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
312#endif 320#endif /* CONFIG_USER_SCHED */
313 321
314/* 322/*
315 * A weight of 0 or 1 can cause arithmetics problems. 323 * A weight of 0 or 1 can cause arithmetics problems.
@@ -363,6 +371,10 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
363#else 371#else
364 372
365static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 373static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
374static inline struct task_group *task_group(struct task_struct *p)
375{
376 return NULL;
377}
366 378
367#endif /* CONFIG_GROUP_SCHED */ 379#endif /* CONFIG_GROUP_SCHED */
368 380
@@ -373,6 +385,7 @@ struct cfs_rq {
373 385
374 u64 exec_clock; 386 u64 exec_clock;
375 u64 min_vruntime; 387 u64 min_vruntime;
388 u64 pair_start;
376 389
377 struct rb_root tasks_timeline; 390 struct rb_root tasks_timeline;
378 struct rb_node *rb_leftmost; 391 struct rb_node *rb_leftmost;
@@ -401,6 +414,31 @@ struct cfs_rq {
401 */ 414 */
402 struct list_head leaf_cfs_rq_list; 415 struct list_head leaf_cfs_rq_list;
403 struct task_group *tg; /* group that "owns" this runqueue */ 416 struct task_group *tg; /* group that "owns" this runqueue */
417
418#ifdef CONFIG_SMP
419 /*
420 * the part of load.weight contributed by tasks
421 */
422 unsigned long task_weight;
423
424 /*
425 * h_load = weight * f(tg)
426 *
427 * Where f(tg) is the recursive weight fraction assigned to
428 * this group.
429 */
430 unsigned long h_load;
431
432 /*
433 * this cpu's part of tg->shares
434 */
435 unsigned long shares;
436
437 /*
438 * load.weight at the time we set shares
439 */
440 unsigned long rq_weight;
441#endif
404#endif 442#endif
405}; 443};
406 444
@@ -452,6 +490,9 @@ struct root_domain {
452 */ 490 */
453 cpumask_t rto_mask; 491 cpumask_t rto_mask;
454 atomic_t rto_count; 492 atomic_t rto_count;
493#ifdef CONFIG_SMP
494 struct cpupri cpupri;
495#endif
455}; 496};
456 497
457/* 498/*
@@ -526,14 +567,19 @@ struct rq {
526 int push_cpu; 567 int push_cpu;
527 /* cpu of this runqueue: */ 568 /* cpu of this runqueue: */
528 int cpu; 569 int cpu;
570 int online;
571
572 unsigned long avg_load_per_task;
529 573
530 struct task_struct *migration_thread; 574 struct task_struct *migration_thread;
531 struct list_head migration_queue; 575 struct list_head migration_queue;
532#endif 576#endif
533 577
534#ifdef CONFIG_SCHED_HRTICK 578#ifdef CONFIG_SCHED_HRTICK
535 unsigned long hrtick_flags; 579#ifdef CONFIG_SMP
536 ktime_t hrtick_expire; 580 int hrtick_csd_pending;
581 struct call_single_data hrtick_csd;
582#endif
537 struct hrtimer hrtick_timer; 583 struct hrtimer hrtick_timer;
538#endif 584#endif
539 585
@@ -559,14 +605,13 @@ struct rq {
559 /* BKL stats */ 605 /* BKL stats */
560 unsigned int bkl_count; 606 unsigned int bkl_count;
561#endif 607#endif
562 struct lock_class_key rq_lock_key;
563}; 608};
564 609
565static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 610static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
566 611
567static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) 612static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
568{ 613{
569 rq->curr->sched_class->check_preempt_curr(rq, p); 614 rq->curr->sched_class->check_preempt_curr(rq, p, sync);
570} 615}
571 616
572static inline int cpu_of(struct rq *rq) 617static inline int cpu_of(struct rq *rq)
@@ -607,6 +652,24 @@ static inline void update_rq_clock(struct rq *rq)
607# define const_debug static const 652# define const_debug static const
608#endif 653#endif
609 654
655/**
656 * runqueue_is_locked
657 *
658 * Returns true if the current cpu runqueue is locked.
659 * This interface allows printk to be called with the runqueue lock
660 * held and know whether or not it is OK to wake up the klogd.
661 */
662int runqueue_is_locked(void)
663{
664 int cpu = get_cpu();
665 struct rq *rq = cpu_rq(cpu);
666 int ret;
667
668 ret = spin_is_locked(&rq->lock);
669 put_cpu();
670 return ret;
671}
672
610/* 673/*
611 * Debugging: various feature bits 674 * Debugging: various feature bits
612 */ 675 */
@@ -749,6 +812,12 @@ late_initcall(sched_init_debug);
749const_debug unsigned int sysctl_sched_nr_migrate = 32; 812const_debug unsigned int sysctl_sched_nr_migrate = 32;
750 813
751/* 814/*
815 * ratelimit for updating the group shares.
816 * default: 0.25ms
817 */
818unsigned int sysctl_sched_shares_ratelimit = 250000;
819
820/*
752 * period over which we measure -rt task cpu usage in us. 821 * period over which we measure -rt task cpu usage in us.
753 * default: 1s 822 * default: 1s
754 */ 823 */
@@ -769,88 +838,12 @@ static inline u64 global_rt_period(void)
769 838
770static inline u64 global_rt_runtime(void) 839static inline u64 global_rt_runtime(void)
771{ 840{
772 if (sysctl_sched_rt_period < 0) 841 if (sysctl_sched_rt_runtime < 0)
773 return RUNTIME_INF; 842 return RUNTIME_INF;
774 843
775 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 844 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
776} 845}
777 846
778unsigned long long time_sync_thresh = 100000;
779
780static DEFINE_PER_CPU(unsigned long long, time_offset);
781static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
782
783/*
784 * Global lock which we take every now and then to synchronize
785 * the CPUs time. This method is not warp-safe, but it's good
786 * enough to synchronize slowly diverging time sources and thus
787 * it's good enough for tracing:
788 */
789static DEFINE_SPINLOCK(time_sync_lock);
790static unsigned long long prev_global_time;
791
792static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
793{
794 /*
795 * We want this inlined, to not get tracer function calls
796 * in this critical section:
797 */
798 spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
799 __raw_spin_lock(&time_sync_lock.raw_lock);
800
801 if (time < prev_global_time) {
802 per_cpu(time_offset, cpu) += prev_global_time - time;
803 time = prev_global_time;
804 } else {
805 prev_global_time = time;
806 }
807
808 __raw_spin_unlock(&time_sync_lock.raw_lock);
809 spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
810
811 return time;
812}
813
814static unsigned long long __cpu_clock(int cpu)
815{
816 unsigned long long now;
817
818 /*
819 * Only call sched_clock() if the scheduler has already been
820 * initialized (some code might call cpu_clock() very early):
821 */
822 if (unlikely(!scheduler_running))
823 return 0;
824
825 now = sched_clock_cpu(cpu);
826
827 return now;
828}
829
830/*
831 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
832 * clock constructed from sched_clock():
833 */
834unsigned long long cpu_clock(int cpu)
835{
836 unsigned long long prev_cpu_time, time, delta_time;
837 unsigned long flags;
838
839 local_irq_save(flags);
840 prev_cpu_time = per_cpu(prev_cpu_time, cpu);
841 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
842 delta_time = time-prev_cpu_time;
843
844 if (unlikely(delta_time > time_sync_thresh)) {
845 time = __sync_cpu_clock(time, cpu);
846 per_cpu(prev_cpu_time, cpu) = time;
847 }
848 local_irq_restore(flags);
849
850 return time;
851}
852EXPORT_SYMBOL_GPL(cpu_clock);
853
854#ifndef prepare_arch_switch 847#ifndef prepare_arch_switch
855# define prepare_arch_switch(next) do { } while (0) 848# define prepare_arch_switch(next) do { } while (0)
856#endif 849#endif
@@ -996,13 +989,6 @@ static struct rq *this_rq_lock(void)
996 return rq; 989 return rq;
997} 990}
998 991
999static void __resched_task(struct task_struct *p, int tif_bit);
1000
1001static inline void resched_task(struct task_struct *p)
1002{
1003 __resched_task(p, TIF_NEED_RESCHED);
1004}
1005
1006#ifdef CONFIG_SCHED_HRTICK 992#ifdef CONFIG_SCHED_HRTICK
1007/* 993/*
1008 * Use HR-timers to deliver accurate preemption points. 994 * Use HR-timers to deliver accurate preemption points.
@@ -1014,25 +1000,6 @@ static inline void resched_task(struct task_struct *p)
1014 * When we get rescheduled we reprogram the hrtick_timer outside of the 1000 * When we get rescheduled we reprogram the hrtick_timer outside of the
1015 * rq->lock. 1001 * rq->lock.
1016 */ 1002 */
1017static inline void resched_hrt(struct task_struct *p)
1018{
1019 __resched_task(p, TIF_HRTICK_RESCHED);
1020}
1021
1022static inline void resched_rq(struct rq *rq)
1023{
1024 unsigned long flags;
1025
1026 spin_lock_irqsave(&rq->lock, flags);
1027 resched_task(rq->curr);
1028 spin_unlock_irqrestore(&rq->lock, flags);
1029}
1030
1031enum {
1032 HRTICK_SET, /* re-programm hrtick_timer */
1033 HRTICK_RESET, /* not a new slice */
1034 HRTICK_BLOCK, /* stop hrtick operations */
1035};
1036 1003
1037/* 1004/*
1038 * Use hrtick when: 1005 * Use hrtick when:
@@ -1043,40 +1010,11 @@ static inline int hrtick_enabled(struct rq *rq)
1043{ 1010{
1044 if (!sched_feat(HRTICK)) 1011 if (!sched_feat(HRTICK))
1045 return 0; 1012 return 0;
1046 if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags))) 1013 if (!cpu_active(cpu_of(rq)))
1047 return 0; 1014 return 0;
1048 return hrtimer_is_hres_active(&rq->hrtick_timer); 1015 return hrtimer_is_hres_active(&rq->hrtick_timer);
1049} 1016}
1050 1017
1051/*
1052 * Called to set the hrtick timer state.
1053 *
1054 * called with rq->lock held and irqs disabled
1055 */
1056static void hrtick_start(struct rq *rq, u64 delay, int reset)
1057{
1058 assert_spin_locked(&rq->lock);
1059
1060 /*
1061 * preempt at: now + delay
1062 */
1063 rq->hrtick_expire =
1064 ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
1065 /*
1066 * indicate we need to program the timer
1067 */
1068 __set_bit(HRTICK_SET, &rq->hrtick_flags);
1069 if (reset)
1070 __set_bit(HRTICK_RESET, &rq->hrtick_flags);
1071
1072 /*
1073 * New slices are called from the schedule path and don't need a
1074 * forced reschedule.
1075 */
1076 if (reset)
1077 resched_hrt(rq->curr);
1078}
1079
1080static void hrtick_clear(struct rq *rq) 1018static void hrtick_clear(struct rq *rq)
1081{ 1019{
1082 if (hrtimer_active(&rq->hrtick_timer)) 1020 if (hrtimer_active(&rq->hrtick_timer))
@@ -1084,32 +1022,6 @@ static void hrtick_clear(struct rq *rq)
1084} 1022}
1085 1023
1086/* 1024/*
1087 * Update the timer from the possible pending state.
1088 */
1089static void hrtick_set(struct rq *rq)
1090{
1091 ktime_t time;
1092 int set, reset;
1093 unsigned long flags;
1094
1095 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1096
1097 spin_lock_irqsave(&rq->lock, flags);
1098 set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
1099 reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
1100 time = rq->hrtick_expire;
1101 clear_thread_flag(TIF_HRTICK_RESCHED);
1102 spin_unlock_irqrestore(&rq->lock, flags);
1103
1104 if (set) {
1105 hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
1106 if (reset && !hrtimer_active(&rq->hrtick_timer))
1107 resched_rq(rq);
1108 } else
1109 hrtick_clear(rq);
1110}
1111
1112/*
1113 * High-resolution timer tick. 1025 * High-resolution timer tick.
1114 * Runs from hardirq context with interrupts disabled. 1026 * Runs from hardirq context with interrupts disabled.
1115 */ 1027 */
@@ -1128,27 +1040,37 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
1128} 1040}
1129 1041
1130#ifdef CONFIG_SMP 1042#ifdef CONFIG_SMP
1131static void hotplug_hrtick_disable(int cpu) 1043/*
1044 * called from hardirq (IPI) context
1045 */
1046static void __hrtick_start(void *arg)
1132{ 1047{
1133 struct rq *rq = cpu_rq(cpu); 1048 struct rq *rq = arg;
1134 unsigned long flags;
1135 1049
1136 spin_lock_irqsave(&rq->lock, flags); 1050 spin_lock(&rq->lock);
1137 rq->hrtick_flags = 0; 1051 hrtimer_restart(&rq->hrtick_timer);
1138 __set_bit(HRTICK_BLOCK, &rq->hrtick_flags); 1052 rq->hrtick_csd_pending = 0;
1139 spin_unlock_irqrestore(&rq->lock, flags); 1053 spin_unlock(&rq->lock);
1140
1141 hrtick_clear(rq);
1142} 1054}
1143 1055
1144static void hotplug_hrtick_enable(int cpu) 1056/*
1057 * Called to set the hrtick timer state.
1058 *
1059 * called with rq->lock held and irqs disabled
1060 */
1061static void hrtick_start(struct rq *rq, u64 delay)
1145{ 1062{
1146 struct rq *rq = cpu_rq(cpu); 1063 struct hrtimer *timer = &rq->hrtick_timer;
1147 unsigned long flags; 1064 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1148 1065
1149 spin_lock_irqsave(&rq->lock, flags); 1066 timer->expires = time;
1150 __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags); 1067
1151 spin_unlock_irqrestore(&rq->lock, flags); 1068 if (rq == this_rq()) {
1069 hrtimer_restart(timer);
1070 } else if (!rq->hrtick_csd_pending) {
1071 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
1072 rq->hrtick_csd_pending = 1;
1073 }
1152} 1074}
1153 1075
1154static int 1076static int
@@ -1163,70 +1085,60 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1163 case CPU_DOWN_PREPARE_FROZEN: 1085 case CPU_DOWN_PREPARE_FROZEN:
1164 case CPU_DEAD: 1086 case CPU_DEAD:
1165 case CPU_DEAD_FROZEN: 1087 case CPU_DEAD_FROZEN:
1166 hotplug_hrtick_disable(cpu); 1088 hrtick_clear(cpu_rq(cpu));
1167 return NOTIFY_OK;
1168
1169 case CPU_UP_PREPARE:
1170 case CPU_UP_PREPARE_FROZEN:
1171 case CPU_DOWN_FAILED:
1172 case CPU_DOWN_FAILED_FROZEN:
1173 case CPU_ONLINE:
1174 case CPU_ONLINE_FROZEN:
1175 hotplug_hrtick_enable(cpu);
1176 return NOTIFY_OK; 1089 return NOTIFY_OK;
1177 } 1090 }
1178 1091
1179 return NOTIFY_DONE; 1092 return NOTIFY_DONE;
1180} 1093}
1181 1094
1182static void init_hrtick(void) 1095static __init void init_hrtick(void)
1183{ 1096{
1184 hotcpu_notifier(hotplug_hrtick, 0); 1097 hotcpu_notifier(hotplug_hrtick, 0);
1185} 1098}
1186#endif /* CONFIG_SMP */ 1099#else
1100/*
1101 * Called to set the hrtick timer state.
1102 *
1103 * called with rq->lock held and irqs disabled
1104 */
1105static void hrtick_start(struct rq *rq, u64 delay)
1106{
1107 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
1108}
1187 1109
1188static void init_rq_hrtick(struct rq *rq) 1110static inline void init_hrtick(void)
1189{ 1111{
1190 rq->hrtick_flags = 0;
1191 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1192 rq->hrtick_timer.function = hrtick;
1193 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
1194} 1112}
1113#endif /* CONFIG_SMP */
1195 1114
1196void hrtick_resched(void) 1115static void init_rq_hrtick(struct rq *rq)
1197{ 1116{
1198 struct rq *rq; 1117#ifdef CONFIG_SMP
1199 unsigned long flags; 1118 rq->hrtick_csd_pending = 0;
1200 1119
1201 if (!test_thread_flag(TIF_HRTICK_RESCHED)) 1120 rq->hrtick_csd.flags = 0;
1202 return; 1121 rq->hrtick_csd.func = __hrtick_start;
1122 rq->hrtick_csd.info = rq;
1123#endif
1203 1124
1204 local_irq_save(flags); 1125 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1205 rq = cpu_rq(smp_processor_id()); 1126 rq->hrtick_timer.function = hrtick;
1206 hrtick_set(rq); 1127 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1207 local_irq_restore(flags);
1208} 1128}
1209#else 1129#else /* CONFIG_SCHED_HRTICK */
1210static inline void hrtick_clear(struct rq *rq) 1130static inline void hrtick_clear(struct rq *rq)
1211{ 1131{
1212} 1132}
1213 1133
1214static inline void hrtick_set(struct rq *rq)
1215{
1216}
1217
1218static inline void init_rq_hrtick(struct rq *rq) 1134static inline void init_rq_hrtick(struct rq *rq)
1219{ 1135{
1220} 1136}
1221 1137
1222void hrtick_resched(void)
1223{
1224}
1225
1226static inline void init_hrtick(void) 1138static inline void init_hrtick(void)
1227{ 1139{
1228} 1140}
1229#endif 1141#endif /* CONFIG_SCHED_HRTICK */
1230 1142
1231/* 1143/*
1232 * resched_task - mark a task 'to be rescheduled now'. 1144 * resched_task - mark a task 'to be rescheduled now'.
@@ -1241,16 +1153,16 @@ static inline void init_hrtick(void)
1241#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 1153#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1242#endif 1154#endif
1243 1155
1244static void __resched_task(struct task_struct *p, int tif_bit) 1156static void resched_task(struct task_struct *p)
1245{ 1157{
1246 int cpu; 1158 int cpu;
1247 1159
1248 assert_spin_locked(&task_rq(p)->lock); 1160 assert_spin_locked(&task_rq(p)->lock);
1249 1161
1250 if (unlikely(test_tsk_thread_flag(p, tif_bit))) 1162 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
1251 return; 1163 return;
1252 1164
1253 set_tsk_thread_flag(p, tif_bit); 1165 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
1254 1166
1255 cpu = task_cpu(p); 1167 cpu = task_cpu(p);
1256 if (cpu == smp_processor_id()) 1168 if (cpu == smp_processor_id())
@@ -1313,15 +1225,15 @@ void wake_up_idle_cpu(int cpu)
1313 if (!tsk_is_polling(rq->idle)) 1225 if (!tsk_is_polling(rq->idle))
1314 smp_send_reschedule(cpu); 1226 smp_send_reschedule(cpu);
1315} 1227}
1316#endif 1228#endif /* CONFIG_NO_HZ */
1317 1229
1318#else 1230#else /* !CONFIG_SMP */
1319static void __resched_task(struct task_struct *p, int tif_bit) 1231static void resched_task(struct task_struct *p)
1320{ 1232{
1321 assert_spin_locked(&task_rq(p)->lock); 1233 assert_spin_locked(&task_rq(p)->lock);
1322 set_tsk_thread_flag(p, tif_bit); 1234 set_tsk_need_resched(p);
1323} 1235}
1324#endif 1236#endif /* CONFIG_SMP */
1325 1237
1326#if BITS_PER_LONG == 32 1238#if BITS_PER_LONG == 32
1327# define WMULT_CONST (~0UL) 1239# define WMULT_CONST (~0UL)
@@ -1336,6 +1248,9 @@ static void __resched_task(struct task_struct *p, int tif_bit)
1336 */ 1248 */
1337#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 1249#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1338 1250
1251/*
1252 * delta *= weight / lw
1253 */
1339static unsigned long 1254static unsigned long
1340calc_delta_mine(unsigned long delta_exec, unsigned long weight, 1255calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1341 struct load_weight *lw) 1256 struct load_weight *lw)
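[Editor's note] The comment added above documents calc_delta_mine() as computing delta *= weight / lw. Below is a minimal userspace sketch of that scaling under stated assumptions: it reuses the SRR() shift-and-round helper shown in the surrounding context, but deliberately simplifies away the inv_weight caching and the overflow special-casing of the real function, so it is an illustration of the formula, not the kernel implementation.

#include <stdio.h>

/* shift right and round, mirroring the SRR() macro in the context above */
#define SRR(x, y) (((x) + (1ULL << ((y) - 1))) >> (y))

/* delta *= weight / lw_weight, via a 32-bit fixed-point inverse */
static unsigned long long scale_delta(unsigned long long delta_exec,
				      unsigned long weight,
				      unsigned long lw_weight)
{
	unsigned long long inv = (1ULL << 32) / lw_weight;	/* roughly lw->inv_weight */
	unsigned long long tmp = delta_exec * weight;

	return SRR(SRR(tmp, 16) * inv, 16);	/* split the >>32 to limit overflow */
}

int main(void)
{
	/* 4ms of runtime for a NICE_0_LOAD (1024) task on a queue of total weight 3072 */
	printf("%llu\n", scale_delta(4000000ULL, 1024, 3072));	/* prints ~1333333 */
	return 0;
}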
@@ -1363,12 +1278,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1363 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 1278 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1364} 1279}
1365 1280
1366static inline unsigned long
1367calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
1368{
1369 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
1370}
1371
1372static inline void update_load_add(struct load_weight *lw, unsigned long inc) 1281static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1373{ 1282{
1374 lw->weight += inc; 1283 lw->weight += inc;
@@ -1476,20 +1385,227 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1476 update_load_sub(&rq->load, load); 1385 update_load_sub(&rq->load, load);
1477} 1386}
1478 1387
1388#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1389typedef int (*tg_visitor)(struct task_group *, void *);
1390
1391/*
1392 * Iterate the full tree, calling @down when first entering a node and @up when
1393 * leaving it for the final time.
1394 */
1395static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1396{
1397 struct task_group *parent, *child;
1398 int ret;
1399
1400 rcu_read_lock();
1401 parent = &root_task_group;
1402down:
1403 ret = (*down)(parent, data);
1404 if (ret)
1405 goto out_unlock;
1406 list_for_each_entry_rcu(child, &parent->children, siblings) {
1407 parent = child;
1408 goto down;
1409
1410up:
1411 continue;
1412 }
1413 ret = (*up)(parent, data);
1414 if (ret)
1415 goto out_unlock;
1416
1417 child = parent;
1418 parent = parent->parent;
1419 if (parent)
1420 goto up;
1421out_unlock:
1422 rcu_read_unlock();
1423
1424 return ret;
1425}
1426
1427static int tg_nop(struct task_group *tg, void *data)
1428{
1429 return 0;
1430}
1431#endif
1432
1479#ifdef CONFIG_SMP 1433#ifdef CONFIG_SMP
1480static unsigned long source_load(int cpu, int type); 1434static unsigned long source_load(int cpu, int type);
1481static unsigned long target_load(int cpu, int type); 1435static unsigned long target_load(int cpu, int type);
1482static unsigned long cpu_avg_load_per_task(int cpu);
1483static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1436static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1484#else /* CONFIG_SMP */ 1437
1438static unsigned long cpu_avg_load_per_task(int cpu)
1439{
1440 struct rq *rq = cpu_rq(cpu);
1441
1442 if (rq->nr_running)
1443 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1444
1445 return rq->avg_load_per_task;
1446}
1485 1447
1486#ifdef CONFIG_FAIR_GROUP_SCHED 1448#ifdef CONFIG_FAIR_GROUP_SCHED
1487static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) 1449
1450static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1451
1452/*
1453 * Calculate and set the cpu's group shares.
1454 */
1455static void
1456__update_group_shares_cpu(struct task_group *tg, int cpu,
1457 unsigned long sd_shares, unsigned long sd_rq_weight)
1458{
1459 int boost = 0;
1460 unsigned long shares;
1461 unsigned long rq_weight;
1462
1463 if (!tg->se[cpu])
1464 return;
1465
1466 rq_weight = tg->cfs_rq[cpu]->load.weight;
1467
1468 /*
1469 * If there are currently no tasks on the cpu pretend there is one of
1470 * average load so that when a new task gets to run here it will not
1471 * get delayed by group starvation.
1472 */
1473 if (!rq_weight) {
1474 boost = 1;
1475 rq_weight = NICE_0_LOAD;
1476 }
1477
1478 if (unlikely(rq_weight > sd_rq_weight))
1479 rq_weight = sd_rq_weight;
1480
1481 /*
1482 * \Sum shares * rq_weight
1483 * shares = -----------------------
1484 * \Sum rq_weight
1485 *
1486 */
1487 shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
1488
1489 /*
1490 * record the actual number of shares, not the boosted amount.
1491 */
1492 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1493 tg->cfs_rq[cpu]->rq_weight = rq_weight;
1494
1495 if (shares < MIN_SHARES)
1496 shares = MIN_SHARES;
1497 else if (shares > MAX_SHARES)
1498 shares = MAX_SHARES;
1499
1500 __set_se_shares(tg->se[cpu], shares);
1501}
1502
1503/*
1504 * Re-compute the task group their per cpu shares over the given domain.
1505 * This needs to be done in a bottom-up fashion because the rq weight of a
1506 * parent group depends on the shares of its child groups.
1507 */
1508static int tg_shares_up(struct task_group *tg, void *data)
1509{
1510 unsigned long rq_weight = 0;
1511 unsigned long shares = 0;
1512 struct sched_domain *sd = data;
1513 int i;
1514
1515 for_each_cpu_mask(i, sd->span) {
1516 rq_weight += tg->cfs_rq[i]->load.weight;
1517 shares += tg->cfs_rq[i]->shares;
1518 }
1519
1520 if ((!shares && rq_weight) || shares > tg->shares)
1521 shares = tg->shares;
1522
1523 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1524 shares = tg->shares;
1525
1526 if (!rq_weight)
1527 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
1528
1529 for_each_cpu_mask(i, sd->span) {
1530 struct rq *rq = cpu_rq(i);
1531 unsigned long flags;
1532
1533 spin_lock_irqsave(&rq->lock, flags);
1534 __update_group_shares_cpu(tg, i, shares, rq_weight);
1535 spin_unlock_irqrestore(&rq->lock, flags);
1536 }
1537
1538 return 0;
1539}
1540
1541/*
1542 * Compute the cpu's hierarchical load factor for each task group.
1543 * This needs to be done in a top-down fashion because the load of a child
1544 * group is a fraction of its parents load.
1545 */
1546static int tg_load_down(struct task_group *tg, void *data)
1547{
1548 unsigned long load;
1549 long cpu = (long)data;
1550
1551 if (!tg->parent) {
1552 load = cpu_rq(cpu)->load.weight;
1553 } else {
1554 load = tg->parent->cfs_rq[cpu]->h_load;
1555 load *= tg->cfs_rq[cpu]->shares;
1556 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1557 }
1558
1559 tg->cfs_rq[cpu]->h_load = load;
1560
1561 return 0;
1562}
1563
1564static void update_shares(struct sched_domain *sd)
1565{
1566 u64 now = cpu_clock(raw_smp_processor_id());
1567 s64 elapsed = now - sd->last_update;
1568
1569 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1570 sd->last_update = now;
1571 walk_tg_tree(tg_nop, tg_shares_up, sd);
1572 }
1573}
1574
1575static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1576{
1577 spin_unlock(&rq->lock);
1578 update_shares(sd);
1579 spin_lock(&rq->lock);
1580}
1581
1582static void update_h_load(long cpu)
1583{
1584 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1585}
1586
1587#else
1588
1589static inline void update_shares(struct sched_domain *sd)
1590{
1591}
1592
1593static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1488{ 1594{
1489} 1595}
1596
1490#endif 1597#endif
1491 1598
1492#endif /* CONFIG_SMP */ 1599#endif
1600
1601#ifdef CONFIG_FAIR_GROUP_SCHED
1602static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1603{
1604#ifdef CONFIG_SMP
1605 cfs_rq->shares = shares;
1606#endif
1607}
1608#endif
1493 1609
1494#include "sched_stats.h" 1610#include "sched_stats.h"
1495#include "sched_idletask.c" 1611#include "sched_idletask.c"
@@ -1500,27 +1616,17 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1500#endif 1616#endif
1501 1617
1502#define sched_class_highest (&rt_sched_class) 1618#define sched_class_highest (&rt_sched_class)
1619#define for_each_class(class) \
1620 for (class = sched_class_highest; class; class = class->next)
1503 1621
1504static inline void inc_load(struct rq *rq, const struct task_struct *p) 1622static void inc_nr_running(struct rq *rq)
1505{
1506 update_load_add(&rq->load, p->se.load.weight);
1507}
1508
1509static inline void dec_load(struct rq *rq, const struct task_struct *p)
1510{
1511 update_load_sub(&rq->load, p->se.load.weight);
1512}
1513
1514static void inc_nr_running(struct task_struct *p, struct rq *rq)
1515{ 1623{
1516 rq->nr_running++; 1624 rq->nr_running++;
1517 inc_load(rq, p);
1518} 1625}
1519 1626
1520static void dec_nr_running(struct task_struct *p, struct rq *rq) 1627static void dec_nr_running(struct rq *rq)
1521{ 1628{
1522 rq->nr_running--; 1629 rq->nr_running--;
1523 dec_load(rq, p);
1524} 1630}
1525 1631
1526static void set_load_weight(struct task_struct *p) 1632static void set_load_weight(struct task_struct *p)
@@ -1544,6 +1650,12 @@ static void set_load_weight(struct task_struct *p)
1544 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1650 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1545} 1651}
1546 1652
1653static void update_avg(u64 *avg, u64 sample)
1654{
1655 s64 diff = sample - *avg;
1656 *avg += diff >> 3;
1657}
1658
1547static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1659static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1548{ 1660{
1549 sched_info_queued(p); 1661 sched_info_queued(p);
@@ -1553,6 +1665,13 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1553 1665
1554static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1666static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1555{ 1667{
1668 if (sleep && p->se.last_wakeup) {
1669 update_avg(&p->se.avg_overlap,
1670 p->se.sum_exec_runtime - p->se.last_wakeup);
1671 p->se.last_wakeup = 0;
1672 }
1673
1674 sched_info_dequeued(p);
1556 p->sched_class->dequeue_task(rq, p, sleep); 1675 p->sched_class->dequeue_task(rq, p, sleep);
1557 p->se.on_rq = 0; 1676 p->se.on_rq = 0;
1558} 1677}
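[Editor's note] update_avg() added above is a simple exponential moving average with weight 1/8 (the >> 3), applied here to the overlap between a task's last wakeup and its dequeue. A quick userspace check of how the average tracks a stream of samples (the sample values are arbitrary):

#include <stdio.h>

typedef unsigned long long u64;
typedef long long s64;

static void update_avg(u64 *avg, u64 sample)
{
	s64 diff = sample - *avg;

	*avg += diff >> 3;	/* move 1/8 of the way toward each new sample */
}

int main(void)
{
	u64 avg_overlap = 0;
	u64 samples[] = { 800, 800, 800, 100, 100, 100, 100, 100 };

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		update_avg(&avg_overlap, samples[i]);
		printf("sample=%llu avg=%llu\n", samples[i], avg_overlap);
	}
	return 0;
}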
@@ -1612,7 +1731,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1612 rq->nr_uninterruptible--; 1731 rq->nr_uninterruptible--;
1613 1732
1614 enqueue_task(rq, p, wakeup); 1733 enqueue_task(rq, p, wakeup);
1615 inc_nr_running(p, rq); 1734 inc_nr_running(rq);
1616} 1735}
1617 1736
1618/* 1737/*
@@ -1624,7 +1743,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1624 rq->nr_uninterruptible++; 1743 rq->nr_uninterruptible++;
1625 1744
1626 dequeue_task(rq, p, sleep); 1745 dequeue_task(rq, p, sleep);
1627 dec_nr_running(p, rq); 1746 dec_nr_running(rq);
1628} 1747}
1629 1748
1630/** 1749/**
@@ -1636,12 +1755,6 @@ inline int task_curr(const struct task_struct *p)
1636 return cpu_curr(task_cpu(p)) == p; 1755 return cpu_curr(task_cpu(p)) == p;
1637} 1756}
1638 1757
1639/* Used instead of source_load when we know the type == 0 */
1640unsigned long weighted_cpuload(const int cpu)
1641{
1642 return cpu_rq(cpu)->load.weight;
1643}
1644
1645static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1758static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1646{ 1759{
1647 set_task_rq(p, cpu); 1760 set_task_rq(p, cpu);
@@ -1670,6 +1783,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1670 1783
1671#ifdef CONFIG_SMP 1784#ifdef CONFIG_SMP
1672 1785
1786/* Used instead of source_load when we know the type == 0 */
1787static unsigned long weighted_cpuload(const int cpu)
1788{
1789 return cpu_rq(cpu)->load.weight;
1790}
1791
1673/* 1792/*
1674 * Is this task likely cache-hot: 1793 * Is this task likely cache-hot:
1675 */ 1794 */
@@ -1765,16 +1884,24 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1765/* 1884/*
1766 * wait_task_inactive - wait for a thread to unschedule. 1885 * wait_task_inactive - wait for a thread to unschedule.
1767 * 1886 *
1887 * If @match_state is nonzero, it's the @p->state value just checked and
1888 * not expected to change. If it changes, i.e. @p might have woken up,
1889 * then return zero. When we succeed in waiting for @p to be off its CPU,
1890 * we return a positive number (its total switch count). If a second call
1891 * a short while later returns the same number, the caller can be sure that
1892 * @p has remained unscheduled the whole time.
1893 *
1768 * The caller must ensure that the task *will* unschedule sometime soon, 1894 * The caller must ensure that the task *will* unschedule sometime soon,
1769 * else this function might spin for a *long* time. This function can't 1895 * else this function might spin for a *long* time. This function can't
1770 * be called with interrupts off, or it may introduce deadlock with 1896 * be called with interrupts off, or it may introduce deadlock with
1771 * smp_call_function() if an IPI is sent by the same process we are 1897 * smp_call_function() if an IPI is sent by the same process we are
1772 * waiting to become inactive. 1898 * waiting to become inactive.
1773 */ 1899 */
1774void wait_task_inactive(struct task_struct *p) 1900unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1775{ 1901{
1776 unsigned long flags; 1902 unsigned long flags;
1777 int running, on_rq; 1903 int running, on_rq;
1904 unsigned long ncsw;
1778 struct rq *rq; 1905 struct rq *rq;
1779 1906
1780 for (;;) { 1907 for (;;) {
@@ -1797,8 +1924,11 @@ void wait_task_inactive(struct task_struct *p)
1797 * return false if the runqueue has changed and p 1924 * return false if the runqueue has changed and p
1798 * is actually now running somewhere else! 1925 * is actually now running somewhere else!
1799 */ 1926 */
1800 while (task_running(rq, p)) 1927 while (task_running(rq, p)) {
1928 if (match_state && unlikely(p->state != match_state))
1929 return 0;
1801 cpu_relax(); 1930 cpu_relax();
1931 }
1802 1932
1803 /* 1933 /*
1804 * Ok, time to look more closely! We need the rq 1934 * Ok, time to look more closely! We need the rq
@@ -1808,9 +1938,18 @@ void wait_task_inactive(struct task_struct *p)
1808 rq = task_rq_lock(p, &flags); 1938 rq = task_rq_lock(p, &flags);
1809 running = task_running(rq, p); 1939 running = task_running(rq, p);
1810 on_rq = p->se.on_rq; 1940 on_rq = p->se.on_rq;
1941 ncsw = 0;
1942 if (!match_state || p->state == match_state)
1943 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
1811 task_rq_unlock(rq, &flags); 1944 task_rq_unlock(rq, &flags);
1812 1945
1813 /* 1946 /*
1947 * If it changed from the expected state, bail out now.
1948 */
1949 if (unlikely(!ncsw))
1950 break;
1951
1952 /*
1814 * Was it really running after all now that we 1953 * Was it really running after all now that we
1815 * checked with the proper locks actually held? 1954 * checked with the proper locks actually held?
1816 * 1955 *
@@ -1842,6 +1981,8 @@ void wait_task_inactive(struct task_struct *p)
1842 */ 1981 */
1843 break; 1982 break;
1844 } 1983 }
1984
1985 return ncsw;
1845} 1986}
1846 1987
1847/*** 1988/***
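[Editor's note] wait_task_inactive() now returns the task's voluntary context-switch count with the sign bit forced on (ncsw = p->nvcsw | LONG_MIN), so a successful wait is nonzero even for a task that has never switched voluntarily, while a state mismatch returns 0. A tiny demonstration of why the MSB trick matters; the task_struct fields are mocked and only the cookie arithmetic is shown:

#include <stdio.h>
#include <limits.h>

int main(void)
{
	unsigned long nvcsw = 0;	/* a freshly forked task: no voluntary switches yet */
	unsigned long ncsw = nvcsw | LONG_MIN;	/* sets the MSB */

	/* 0 means "state changed, give up"; anything else is a switch-count cookie */
	printf("plain count: %lu (indistinguishable from failure)\n", nvcsw);
	printf("cookie:      %#lx (always nonzero)\n", ncsw);

	/* a second call returning the same cookie proves the task never ran in between */
	return 0;
}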
@@ -1880,7 +2021,7 @@ static unsigned long source_load(int cpu, int type)
1880 struct rq *rq = cpu_rq(cpu); 2021 struct rq *rq = cpu_rq(cpu);
1881 unsigned long total = weighted_cpuload(cpu); 2022 unsigned long total = weighted_cpuload(cpu);
1882 2023
1883 if (type == 0) 2024 if (type == 0 || !sched_feat(LB_BIAS))
1884 return total; 2025 return total;
1885 2026
1886 return min(rq->cpu_load[type-1], total); 2027 return min(rq->cpu_load[type-1], total);
@@ -1895,25 +2036,13 @@ static unsigned long target_load(int cpu, int type)
1895 struct rq *rq = cpu_rq(cpu); 2036 struct rq *rq = cpu_rq(cpu);
1896 unsigned long total = weighted_cpuload(cpu); 2037 unsigned long total = weighted_cpuload(cpu);
1897 2038
1898 if (type == 0) 2039 if (type == 0 || !sched_feat(LB_BIAS))
1899 return total; 2040 return total;
1900 2041
1901 return max(rq->cpu_load[type-1], total); 2042 return max(rq->cpu_load[type-1], total);
1902} 2043}
1903 2044
1904/* 2045/*
1905 * Return the average load per task on the cpu's run queue
1906 */
1907static unsigned long cpu_avg_load_per_task(int cpu)
1908{
1909 struct rq *rq = cpu_rq(cpu);
1910 unsigned long total = weighted_cpuload(cpu);
1911 unsigned long n = rq->nr_running;
1912
1913 return n ? total / n : SCHED_LOAD_SCALE;
1914}
1915
1916/*
1917 * find_idlest_group finds and returns the least busy CPU group within the 2046 * find_idlest_group finds and returns the least busy CPU group within the
1918 * domain. 2047 * domain.
1919 */ 2048 */
@@ -1939,7 +2068,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1939 /* Tally up the load of all CPUs in the group */ 2068 /* Tally up the load of all CPUs in the group */
1940 avg_load = 0; 2069 avg_load = 0;
1941 2070
1942 for_each_cpu_mask(i, group->cpumask) { 2071 for_each_cpu_mask_nr(i, group->cpumask) {
1943 /* Bias balancing toward cpus of our domain */ 2072 /* Bias balancing toward cpus of our domain */
1944 if (local_group) 2073 if (local_group)
1945 load = source_load(i, load_idx); 2074 load = source_load(i, load_idx);
@@ -1981,7 +2110,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
1981 /* Traverse only the allowed CPUs */ 2110 /* Traverse only the allowed CPUs */
1982 cpus_and(*tmp, group->cpumask, p->cpus_allowed); 2111 cpus_and(*tmp, group->cpumask, p->cpus_allowed);
1983 2112
1984 for_each_cpu_mask(i, *tmp) { 2113 for_each_cpu_mask_nr(i, *tmp) {
1985 load = weighted_cpuload(i); 2114 load = weighted_cpuload(i);
1986 2115
1987 if (load < min_load || (load == min_load && i == this_cpu)) { 2116 if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -2019,6 +2148,9 @@ static int sched_balance_self(int cpu, int flag)
2019 sd = tmp; 2148 sd = tmp;
2020 } 2149 }
2021 2150
2151 if (sd)
2152 update_shares(sd);
2153
2022 while (sd) { 2154 while (sd) {
2023 cpumask_t span, tmpmask; 2155 cpumask_t span, tmpmask;
2024 struct sched_group *group; 2156 struct sched_group *group;
@@ -2085,6 +2217,22 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2085 if (!sched_feat(SYNC_WAKEUPS)) 2217 if (!sched_feat(SYNC_WAKEUPS))
2086 sync = 0; 2218 sync = 0;
2087 2219
2220#ifdef CONFIG_SMP
2221 if (sched_feat(LB_WAKEUP_UPDATE)) {
2222 struct sched_domain *sd;
2223
2224 this_cpu = raw_smp_processor_id();
2225 cpu = task_cpu(p);
2226
2227 for_each_domain(this_cpu, sd) {
2228 if (cpu_isset(cpu, sd->span)) {
2229 update_shares(sd);
2230 break;
2231 }
2232 }
2233 }
2234#endif
2235
2088 smp_wmb(); 2236 smp_wmb();
2089 rq = task_rq_lock(p, &flags); 2237 rq = task_rq_lock(p, &flags);
2090 old_state = p->state; 2238 old_state = p->state;
@@ -2131,7 +2279,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2131 } 2279 }
2132 } 2280 }
2133 } 2281 }
2134#endif 2282#endif /* CONFIG_SCHEDSTATS */
2135 2283
2136out_activate: 2284out_activate:
2137#endif /* CONFIG_SMP */ 2285#endif /* CONFIG_SMP */
@@ -2149,7 +2297,10 @@ out_activate:
2149 success = 1; 2297 success = 1;
2150 2298
2151out_running: 2299out_running:
2152 check_preempt_curr(rq, p); 2300 trace_mark(kernel_sched_wakeup,
2301 "pid %d state %ld ## rq %p task %p rq->curr %p",
2302 p->pid, p->state, rq, p, rq->curr);
2303 check_preempt_curr(rq, p, sync);
2153 2304
2154 p->state = TASK_RUNNING; 2305 p->state = TASK_RUNNING;
2155#ifdef CONFIG_SMP 2306#ifdef CONFIG_SMP
@@ -2157,6 +2308,8 @@ out_running:
2157 p->sched_class->task_wake_up(rq, p); 2308 p->sched_class->task_wake_up(rq, p);
2158#endif 2309#endif
2159out: 2310out:
2311 current->se.last_wakeup = current->se.sum_exec_runtime;
2312
2160 task_rq_unlock(rq, &flags); 2313 task_rq_unlock(rq, &flags);
2161 2314
2162 return success; 2315 return success;
@@ -2277,9 +2430,12 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2277 * management (if any): 2430 * management (if any):
2278 */ 2431 */
2279 p->sched_class->task_new(rq, p); 2432 p->sched_class->task_new(rq, p);
2280 inc_nr_running(p, rq); 2433 inc_nr_running(rq);
2281 } 2434 }
2282 check_preempt_curr(rq, p); 2435 trace_mark(kernel_sched_wakeup_new,
2436 "pid %d state %ld ## rq %p task %p rq->curr %p",
2437 p->pid, p->state, rq, p, rq->curr);
2438 check_preempt_curr(rq, p, 0);
2283#ifdef CONFIG_SMP 2439#ifdef CONFIG_SMP
2284 if (p->sched_class->task_wake_up) 2440 if (p->sched_class->task_wake_up)
2285 p->sched_class->task_wake_up(rq, p); 2441 p->sched_class->task_wake_up(rq, p);
@@ -2331,7 +2487,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
2331 notifier->ops->sched_out(notifier, next); 2487 notifier->ops->sched_out(notifier, next);
2332} 2488}
2333 2489
2334#else 2490#else /* !CONFIG_PREEMPT_NOTIFIERS */
2335 2491
2336static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2492static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2337{ 2493{
@@ -2343,7 +2499,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
2343{ 2499{
2344} 2500}
2345 2501
2346#endif 2502#endif /* CONFIG_PREEMPT_NOTIFIERS */
2347 2503
2348/** 2504/**
2349 * prepare_task_switch - prepare to switch tasks 2505 * prepare_task_switch - prepare to switch tasks
@@ -2451,6 +2607,11 @@ context_switch(struct rq *rq, struct task_struct *prev,
2451 struct mm_struct *mm, *oldmm; 2607 struct mm_struct *mm, *oldmm;
2452 2608
2453 prepare_task_switch(rq, prev, next); 2609 prepare_task_switch(rq, prev, next);
2610 trace_mark(kernel_sched_schedule,
2611 "prev_pid %d next_pid %d prev_state %ld "
2612 "## rq %p prev %p next %p",
2613 prev->pid, next->pid, prev->state,
2614 rq, prev, next);
2454 mm = next->mm; 2615 mm = next->mm;
2455 oldmm = prev->active_mm; 2616 oldmm = prev->active_mm;
2456 /* 2617 /*
@@ -2612,10 +2773,10 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
2612 } else { 2773 } else {
2613 if (rq1 < rq2) { 2774 if (rq1 < rq2) {
2614 spin_lock(&rq1->lock); 2775 spin_lock(&rq1->lock);
2615 spin_lock(&rq2->lock); 2776 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
2616 } else { 2777 } else {
2617 spin_lock(&rq2->lock); 2778 spin_lock(&rq2->lock);
2618 spin_lock(&rq1->lock); 2779 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
2619 } 2780 }
2620 } 2781 }
2621 update_rq_clock(rq1); 2782 update_rq_clock(rq1);
@@ -2658,14 +2819,21 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2658 if (busiest < this_rq) { 2819 if (busiest < this_rq) {
2659 spin_unlock(&this_rq->lock); 2820 spin_unlock(&this_rq->lock);
2660 spin_lock(&busiest->lock); 2821 spin_lock(&busiest->lock);
2661 spin_lock(&this_rq->lock); 2822 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
2662 ret = 1; 2823 ret = 1;
2663 } else 2824 } else
2664 spin_lock(&busiest->lock); 2825 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
2665 } 2826 }
2666 return ret; 2827 return ret;
2667} 2828}
2668 2829
2830static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
2831 __releases(busiest->lock)
2832{
2833 spin_unlock(&busiest->lock);
2834 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
2835}
2836
2669/* 2837/*
2670 * If dest_cpu is allowed for this process, migrate the task to it. 2838 * If dest_cpu is allowed for this process, migrate the task to it.
2671 * This is accomplished by forcing the cpu_allowed mask to only 2839 * This is accomplished by forcing the cpu_allowed mask to only
@@ -2680,7 +2848,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2680 2848
2681 rq = task_rq_lock(p, &flags); 2849 rq = task_rq_lock(p, &flags);
2682 if (!cpu_isset(dest_cpu, p->cpus_allowed) 2850 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2683 || unlikely(cpu_is_offline(dest_cpu))) 2851 || unlikely(!cpu_active(dest_cpu)))
2684 goto out; 2852 goto out;
2685 2853
2686 /* force the process onto the specified CPU */ 2854 /* force the process onto the specified CPU */
@@ -2727,7 +2895,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
2727 * Note that idle threads have a prio of MAX_PRIO, for this test 2895 * Note that idle threads have a prio of MAX_PRIO, for this test
2728 * to be always true for them. 2896 * to be always true for them.
2729 */ 2897 */
2730 check_preempt_curr(this_rq, p); 2898 check_preempt_curr(this_rq, p, 0);
2731} 2899}
2732 2900
2733/* 2901/*
@@ -2785,7 +2953,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2785 enum cpu_idle_type idle, int *all_pinned, 2953 enum cpu_idle_type idle, int *all_pinned,
2786 int *this_best_prio, struct rq_iterator *iterator) 2954 int *this_best_prio, struct rq_iterator *iterator)
2787{ 2955{
2788 int loops = 0, pulled = 0, pinned = 0, skip_for_load; 2956 int loops = 0, pulled = 0, pinned = 0;
2789 struct task_struct *p; 2957 struct task_struct *p;
2790 long rem_load_move = max_load_move; 2958 long rem_load_move = max_load_move;
2791 2959
@@ -2801,14 +2969,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2801next: 2969next:
2802 if (!p || loops++ > sysctl_sched_nr_migrate) 2970 if (!p || loops++ > sysctl_sched_nr_migrate)
2803 goto out; 2971 goto out;
2804 /* 2972
2805 * To help distribute high priority tasks across CPUs we don't 2973 if ((p->se.load.weight >> 1) > rem_load_move ||
2806 * skip a task if it will be the highest priority task (i.e. smallest
2807 * prio value) on its new queue regardless of its load weight
2808 */
2809 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2810 SCHED_LOAD_SCALE_FUZZ;
2811 if ((skip_for_load && p->prio >= *this_best_prio) ||
2812 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { 2974 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2813 p = iterator->next(iterator->arg); 2975 p = iterator->next(iterator->arg);
2814 goto next; 2976 goto next;
@@ -2863,6 +3025,10 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2863 max_load_move - total_load_moved, 3025 max_load_move - total_load_moved,
2864 sd, idle, all_pinned, &this_best_prio); 3026 sd, idle, all_pinned, &this_best_prio);
2865 class = class->next; 3027 class = class->next;
3028
3029 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3030 break;
3031
2866 } while (class && max_load_move > total_load_moved); 3032 } while (class && max_load_move > total_load_moved);
2867 3033
2868 return total_load_moved > 0; 3034 return total_load_moved > 0;
@@ -2939,6 +3105,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2939 max_load = this_load = total_load = total_pwr = 0; 3105 max_load = this_load = total_load = total_pwr = 0;
2940 busiest_load_per_task = busiest_nr_running = 0; 3106 busiest_load_per_task = busiest_nr_running = 0;
2941 this_load_per_task = this_nr_running = 0; 3107 this_load_per_task = this_nr_running = 0;
3108
2942 if (idle == CPU_NOT_IDLE) 3109 if (idle == CPU_NOT_IDLE)
2943 load_idx = sd->busy_idx; 3110 load_idx = sd->busy_idx;
2944 else if (idle == CPU_NEWLY_IDLE) 3111 else if (idle == CPU_NEWLY_IDLE)
@@ -2953,6 +3120,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2953 int __group_imb = 0; 3120 int __group_imb = 0;
2954 unsigned int balance_cpu = -1, first_idle_cpu = 0; 3121 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2955 unsigned long sum_nr_running, sum_weighted_load; 3122 unsigned long sum_nr_running, sum_weighted_load;
3123 unsigned long sum_avg_load_per_task;
3124 unsigned long avg_load_per_task;
2956 3125
2957 local_group = cpu_isset(this_cpu, group->cpumask); 3126 local_group = cpu_isset(this_cpu, group->cpumask);
2958 3127
@@ -2961,10 +3130,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2961 3130
2962 /* Tally up the load of all CPUs in the group */ 3131 /* Tally up the load of all CPUs in the group */
2963 sum_weighted_load = sum_nr_running = avg_load = 0; 3132 sum_weighted_load = sum_nr_running = avg_load = 0;
3133 sum_avg_load_per_task = avg_load_per_task = 0;
3134
2964 max_cpu_load = 0; 3135 max_cpu_load = 0;
2965 min_cpu_load = ~0UL; 3136 min_cpu_load = ~0UL;
2966 3137
2967 for_each_cpu_mask(i, group->cpumask) { 3138 for_each_cpu_mask_nr(i, group->cpumask) {
2968 struct rq *rq; 3139 struct rq *rq;
2969 3140
2970 if (!cpu_isset(i, *cpus)) 3141 if (!cpu_isset(i, *cpus))
@@ -2994,6 +3165,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2994 avg_load += load; 3165 avg_load += load;
2995 sum_nr_running += rq->nr_running; 3166 sum_nr_running += rq->nr_running;
2996 sum_weighted_load += weighted_cpuload(i); 3167 sum_weighted_load += weighted_cpuload(i);
3168
3169 sum_avg_load_per_task += cpu_avg_load_per_task(i);
2997 } 3170 }
2998 3171
2999 /* 3172 /*
@@ -3015,7 +3188,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3015 avg_load = sg_div_cpu_power(group, 3188 avg_load = sg_div_cpu_power(group,
3016 avg_load * SCHED_LOAD_SCALE); 3189 avg_load * SCHED_LOAD_SCALE);
3017 3190
3018 if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE) 3191
3192 /*
3193 * Consider the group unbalanced when the imbalance is larger
3194 * than the average weight of two tasks.
3195 *
3196 * APZ: with cgroup the avg task weight can vary wildly and
3197 * might not be a suitable number - should we keep a
3198 * normalized nr_running number somewhere that negates
3199 * the hierarchy?
3200 */
3201 avg_load_per_task = sg_div_cpu_power(group,
3202 sum_avg_load_per_task * SCHED_LOAD_SCALE);
3203
3204 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3019 __group_imb = 1; 3205 __group_imb = 1;
3020 3206
3021 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 3207 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
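[Editor's note] The imbalance test above replaces the fixed SCHED_LOAD_SCALE threshold with twice the group's average per-task load. A small worked example of the new condition; the load figures are invented and sg_div_cpu_power() is approximated as plain division by the number of CPUs, which corresponds to the default single-unit cpu_power case:

#include <stdio.h>

int main(void)
{
	/* weighted_cpuload() of the two CPUs in one sched group */
	unsigned long max_cpu_load = 3584, min_cpu_load = 512;
	/* sum of cpu_avg_load_per_task() over the group, then averaged */
	unsigned long sum_avg_load_per_task = 1024 + 1024;
	unsigned long avg_load_per_task = sum_avg_load_per_task / 2;

	int group_imb = (max_cpu_load - min_cpu_load) > 2 * avg_load_per_task;

	printf("imbalance=%lu threshold=%lu -> group_imb=%d\n",
	       max_cpu_load - min_cpu_load, 2 * avg_load_per_task, group_imb);
	return 0;
}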
@@ -3156,9 +3342,9 @@ small_imbalance:
3156 if (busiest_load_per_task > this_load_per_task) 3342 if (busiest_load_per_task > this_load_per_task)
3157 imbn = 1; 3343 imbn = 1;
3158 } else 3344 } else
3159 this_load_per_task = SCHED_LOAD_SCALE; 3345 this_load_per_task = cpu_avg_load_per_task(this_cpu);
3160 3346
3161 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= 3347 if (max_load - this_load + 2*busiest_load_per_task >=
3162 busiest_load_per_task * imbn) { 3348 busiest_load_per_task * imbn) {
3163 *imbalance = busiest_load_per_task; 3349 *imbalance = busiest_load_per_task;
3164 return busiest; 3350 return busiest;
@@ -3228,7 +3414,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3228 unsigned long max_load = 0; 3414 unsigned long max_load = 0;
3229 int i; 3415 int i;
3230 3416
3231 for_each_cpu_mask(i, group->cpumask) { 3417 for_each_cpu_mask_nr(i, group->cpumask) {
3232 unsigned long wl; 3418 unsigned long wl;
3233 3419
3234 if (!cpu_isset(i, *cpus)) 3420 if (!cpu_isset(i, *cpus))
@@ -3284,6 +3470,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3284 schedstat_inc(sd, lb_count[idle]); 3470 schedstat_inc(sd, lb_count[idle]);
3285 3471
3286redo: 3472redo:
3473 update_shares(sd);
3287 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3474 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3288 cpus, balance); 3475 cpus, balance);
3289 3476
@@ -3386,8 +3573,9 @@ redo:
3386 3573
3387 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3574 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3388 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3575 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3389 return -1; 3576 ld_moved = -1;
3390 return ld_moved; 3577
3578 goto out;
3391 3579
3392out_balanced: 3580out_balanced:
3393 schedstat_inc(sd, lb_balanced[idle]); 3581 schedstat_inc(sd, lb_balanced[idle]);
@@ -3402,8 +3590,13 @@ out_one_pinned:
3402 3590
3403 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3591 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3404 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3592 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3405 return -1; 3593 ld_moved = -1;
3406 return 0; 3594 else
3595 ld_moved = 0;
3596out:
3597 if (ld_moved)
3598 update_shares(sd);
3599 return ld_moved;
3407} 3600}
3408 3601
3409/* 3602/*
@@ -3438,6 +3631,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3438 3631
3439 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); 3632 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
3440redo: 3633redo:
3634 update_shares_locked(this_rq, sd);
3441 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, 3635 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
3442 &sd_idle, cpus, NULL); 3636 &sd_idle, cpus, NULL);
3443 if (!group) { 3637 if (!group) {
@@ -3464,7 +3658,7 @@ redo:
3464 ld_moved = move_tasks(this_rq, this_cpu, busiest, 3658 ld_moved = move_tasks(this_rq, this_cpu, busiest,
3465 imbalance, sd, CPU_NEWLY_IDLE, 3659 imbalance, sd, CPU_NEWLY_IDLE,
3466 &all_pinned); 3660 &all_pinned);
3467 spin_unlock(&busiest->lock); 3661 double_unlock_balance(this_rq, busiest);
3468 3662
3469 if (unlikely(all_pinned)) { 3663 if (unlikely(all_pinned)) {
3470 cpu_clear(cpu_of(busiest), *cpus); 3664 cpu_clear(cpu_of(busiest), *cpus);
@@ -3481,6 +3675,7 @@ redo:
3481 } else 3675 } else
3482 sd->nr_balance_failed = 0; 3676 sd->nr_balance_failed = 0;
3483 3677
3678 update_shares_locked(this_rq, sd);
3484 return ld_moved; 3679 return ld_moved;
3485 3680
3486out_balanced: 3681out_balanced:
@@ -3578,7 +3773,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3578 else 3773 else
3579 schedstat_inc(sd, alb_failed); 3774 schedstat_inc(sd, alb_failed);
3580 } 3775 }
3581 spin_unlock(&target_rq->lock); 3776 double_unlock_balance(busiest_rq, target_rq);
3582} 3777}
3583 3778
3584#ifdef CONFIG_NO_HZ 3779#ifdef CONFIG_NO_HZ
@@ -3621,7 +3816,7 @@ int select_nohz_load_balancer(int stop_tick)
3621 /* 3816 /*
3622 * If we are going offline and still the leader, give up! 3817 * If we are going offline and still the leader, give up!
3623 */ 3818 */
3624 if (cpu_is_offline(cpu) && 3819 if (!cpu_active(cpu) &&
3625 atomic_read(&nohz.load_balancer) == cpu) { 3820 atomic_read(&nohz.load_balancer) == cpu) {
3626 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3821 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3627 BUG(); 3822 BUG();
@@ -3672,6 +3867,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3672 /* Earliest time when we have to do rebalance again */ 3867 /* Earliest time when we have to do rebalance again */
3673 unsigned long next_balance = jiffies + 60*HZ; 3868 unsigned long next_balance = jiffies + 60*HZ;
3674 int update_next_balance = 0; 3869 int update_next_balance = 0;
3870 int need_serialize;
3675 cpumask_t tmp; 3871 cpumask_t tmp;
3676 3872
3677 for_each_domain(cpu, sd) { 3873 for_each_domain(cpu, sd) {
@@ -3689,8 +3885,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3689 if (interval > HZ*NR_CPUS/10) 3885 if (interval > HZ*NR_CPUS/10)
3690 interval = HZ*NR_CPUS/10; 3886 interval = HZ*NR_CPUS/10;
3691 3887
3888 need_serialize = sd->flags & SD_SERIALIZE;
3692 3889
3693 if (sd->flags & SD_SERIALIZE) { 3890 if (need_serialize) {
3694 if (!spin_trylock(&balancing)) 3891 if (!spin_trylock(&balancing))
3695 goto out; 3892 goto out;
3696 } 3893 }
@@ -3706,7 +3903,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3706 } 3903 }
3707 sd->last_balance = jiffies; 3904 sd->last_balance = jiffies;
3708 } 3905 }
3709 if (sd->flags & SD_SERIALIZE) 3906 if (need_serialize)
3710 spin_unlock(&balancing); 3907 spin_unlock(&balancing);
3711out: 3908out:
3712 if (time_after(next_balance, sd->last_balance + interval)) { 3909 if (time_after(next_balance, sd->last_balance + interval)) {
@@ -3759,7 +3956,7 @@ static void run_rebalance_domains(struct softirq_action *h)
3759 int balance_cpu; 3956 int balance_cpu;
3760 3957
3761 cpu_clear(this_cpu, cpus); 3958 cpu_clear(this_cpu, cpus);
3762 for_each_cpu_mask(balance_cpu, cpus) { 3959 for_each_cpu_mask_nr(balance_cpu, cpus) {
3763 /* 3960 /*
3764 * If this cpu gets work to do, stop the load balancing 3961 * If this cpu gets work to do, stop the load balancing
3765 * work being done for other cpus. Next load 3962 * work being done for other cpus. Next load
@@ -3895,6 +4092,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
3895 cpustat->nice = cputime64_add(cpustat->nice, tmp); 4092 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3896 else 4093 else
3897 cpustat->user = cputime64_add(cpustat->user, tmp); 4094 cpustat->user = cputime64_add(cpustat->user, tmp);
4095 /* Account for user time used */
4096 acct_update_integrals(p);
3898} 4097}
3899 4098
3900/* 4099/*
@@ -3995,6 +4194,65 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
3995} 4194}
3996 4195
3997/* 4196/*
4197 * Use precise platform statistics if available:
4198 */
4199#ifdef CONFIG_VIRT_CPU_ACCOUNTING
4200cputime_t task_utime(struct task_struct *p)
4201{
4202 return p->utime;
4203}
4204
4205cputime_t task_stime(struct task_struct *p)
4206{
4207 return p->stime;
4208}
4209#else
4210cputime_t task_utime(struct task_struct *p)
4211{
4212 clock_t utime = cputime_to_clock_t(p->utime),
4213 total = utime + cputime_to_clock_t(p->stime);
4214 u64 temp;
4215
4216 /*
4217 * Use CFS's precise accounting:
4218 */
4219 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
4220
4221 if (total) {
4222 temp *= utime;
4223 do_div(temp, total);
4224 }
4225 utime = (clock_t)temp;
4226
4227 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
4228 return p->prev_utime;
4229}
4230
4231cputime_t task_stime(struct task_struct *p)
4232{
4233 clock_t stime;
4234
4235 /*
4236 * Use CFS's precise accounting. (we subtract utime from
4237 * the total, to make sure the total observed by userspace
4238 * grows monotonically - apps rely on that):
4239 */
4240 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
4241 cputime_to_clock_t(task_utime(p));
4242
4243 if (stime >= 0)
4244 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
4245
4246 return p->prev_stime;
4247}
4248#endif
4249
4250inline cputime_t task_gtime(struct task_struct *p)
4251{
4252 return p->gtime;
4253}
4254
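
A minimal sketch, not part of this patch, of the arithmetic the new task_utime() performs: the tick-based utime/stime counters only supply a ratio, which is used to split the precise CFS runtime. All numbers below are assumed sample values.

	#include <stdio.h>

	int main(void)
	{
		/* Assumed samples: 300 user ticks, 100 system ticks from the
		 * tick counters, 1000 ticks of precise CFS runtime. */
		unsigned long long utime = 300, stime = 100, sum_exec = 1000;
		unsigned long long total = utime + stime;	/* 400 */
		unsigned long long temp = sum_exec;

		if (total) {
			temp *= utime;		/* 300000 */
			temp /= total;		/* 750: same user/sys split, precise total */
		}
		printf("scaled utime = %llu ticks\n", temp);	/* prints 750 */
		return 0;
	}
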
4255/*
3998 * This function gets called by the timer code, with HZ frequency. 4256 * This function gets called by the timer code, with HZ frequency.
3999 * We call it with interrupts disabled. 4257 * We call it with interrupts disabled.
4000 * 4258 *
@@ -4021,26 +4279,44 @@ void scheduler_tick(void)
4021#endif 4279#endif
4022} 4280}
4023 4281
4024#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) 4282#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4283 defined(CONFIG_PREEMPT_TRACER))
4284
4285static inline unsigned long get_parent_ip(unsigned long addr)
4286{
4287 if (in_lock_functions(addr)) {
4288 addr = CALLER_ADDR2;
4289 if (in_lock_functions(addr))
4290 addr = CALLER_ADDR3;
4291 }
4292 return addr;
4293}
4025 4294
4026void __kprobes add_preempt_count(int val) 4295void __kprobes add_preempt_count(int val)
4027{ 4296{
4297#ifdef CONFIG_DEBUG_PREEMPT
4028 /* 4298 /*
4029 * Underflow? 4299 * Underflow?
4030 */ 4300 */
4031 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 4301 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4032 return; 4302 return;
4303#endif
4033 preempt_count() += val; 4304 preempt_count() += val;
4305#ifdef CONFIG_DEBUG_PREEMPT
4034 /* 4306 /*
4035 * Spinlock count overflowing soon? 4307 * Spinlock count overflowing soon?
4036 */ 4308 */
4037 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 4309 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4038 PREEMPT_MASK - 10); 4310 PREEMPT_MASK - 10);
4311#endif
4312 if (preempt_count() == val)
4313 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4039} 4314}
4040EXPORT_SYMBOL(add_preempt_count); 4315EXPORT_SYMBOL(add_preempt_count);
4041 4316
4042void __kprobes sub_preempt_count(int val) 4317void __kprobes sub_preempt_count(int val)
4043{ 4318{
4319#ifdef CONFIG_DEBUG_PREEMPT
4044 /* 4320 /*
4045 * Underflow? 4321 * Underflow?
4046 */ 4322 */
@@ -4052,7 +4328,10 @@ void __kprobes sub_preempt_count(int val)
4052 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 4328 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4053 !(preempt_count() & PREEMPT_MASK))) 4329 !(preempt_count() & PREEMPT_MASK)))
4054 return; 4330 return;
4331#endif
4055 4332
4333 if (preempt_count() == val)
4334 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4056 preempt_count() -= val; 4335 preempt_count() -= val;
4057} 4336}
4058EXPORT_SYMBOL(sub_preempt_count); 4337EXPORT_SYMBOL(sub_preempt_count);
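
A sketch, assuming CONFIG_PREEMPT_TRACER=y, of when the tracepoints added above fire: only the transitions through zero are traced, so nested preempt-disabled sections produce a single off/on pair.

	#include <linux/preempt.h>

	static void preempt_trace_example(void)	/* illustrative only */
	{
		preempt_disable();	/* count 0 -> 1: trace_preempt_off() fires */
		preempt_disable();	/* count 1 -> 2: no trace */

		preempt_enable();	/* count 2 -> 1: no trace */
		preempt_enable();	/* count 1 -> 0: trace_preempt_on() fires */
	}
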
@@ -4070,6 +4349,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
4070 prev->comm, prev->pid, preempt_count()); 4349 prev->comm, prev->pid, preempt_count());
4071 4350
4072 debug_show_held_locks(prev); 4351 debug_show_held_locks(prev);
4352 print_modules();
4073 if (irqs_disabled()) 4353 if (irqs_disabled())
4074 print_irqtrace_events(prev); 4354 print_irqtrace_events(prev);
4075 4355
@@ -4158,7 +4438,8 @@ need_resched_nonpreemptible:
4158 4438
4159 schedule_debug(prev); 4439 schedule_debug(prev);
4160 4440
4161 hrtick_clear(rq); 4441 if (sched_feat(HRTICK))
4442 hrtick_clear(rq);
4162 4443
4163 /* 4444 /*
4164 * Do the rq-clock update outside the rq lock: 4445 * Do the rq-clock update outside the rq lock:
@@ -4204,8 +4485,6 @@ need_resched_nonpreemptible:
4204 } else 4485 } else
4205 spin_unlock_irq(&rq->lock); 4486 spin_unlock_irq(&rq->lock);
4206 4487
4207 hrtick_set(rq);
4208
4209 if (unlikely(reacquire_kernel_lock(current) < 0)) 4488 if (unlikely(reacquire_kernel_lock(current) < 0))
4210 goto need_resched_nonpreemptible; 4489 goto need_resched_nonpreemptible;
4211 4490
@@ -4363,6 +4642,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4363} 4642}
4364EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 4643EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
4365 4644
4645/**
4646 * complete: - signals a single thread waiting on this completion
4647 * @x: holds the state of this particular completion
4648 *
4649 * This will wake up a single thread waiting on this completion. Threads will be
4650 * awakened in the same order in which they were queued.
4651 *
4652 * See also complete_all(), wait_for_completion() and related routines.
4653 */
4366void complete(struct completion *x) 4654void complete(struct completion *x)
4367{ 4655{
4368 unsigned long flags; 4656 unsigned long flags;
@@ -4374,6 +4662,12 @@ void complete(struct completion *x)
4374} 4662}
4375EXPORT_SYMBOL(complete); 4663EXPORT_SYMBOL(complete);
4376 4664
4665/**
4666 * complete_all: - signals all threads waiting on this completion
4667 * @x: holds the state of this particular completion
4668 *
4669 * This will wake up all threads waiting on this particular completion event.
4670 */
4377void complete_all(struct completion *x) 4671void complete_all(struct completion *x)
4378{ 4672{
4379 unsigned long flags; 4673 unsigned long flags;
@@ -4394,10 +4688,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
4394 wait.flags |= WQ_FLAG_EXCLUSIVE; 4688 wait.flags |= WQ_FLAG_EXCLUSIVE;
4395 __add_wait_queue_tail(&x->wait, &wait); 4689 __add_wait_queue_tail(&x->wait, &wait);
4396 do { 4690 do {
4397 if ((state == TASK_INTERRUPTIBLE && 4691 if (signal_pending_state(state, current)) {
4398 signal_pending(current)) ||
4399 (state == TASK_KILLABLE &&
4400 fatal_signal_pending(current))) {
4401 timeout = -ERESTARTSYS; 4692 timeout = -ERESTARTSYS;
4402 break; 4693 break;
4403 } 4694 }
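
The open-coded state/signal test removed above is now centralized in signal_pending_state(); a rough sketch of what that helper checks (the authoritative definition lives in the scheduler headers, not in this patch):

	#include <linux/sched.h>

	static inline int signal_pending_state_sketch(long state, struct task_struct *p)
	{
		if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
			return 0;
		if (!signal_pending(p))
			return 0;

		/* interruptible sleeps take any signal, killable only fatal ones */
		return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
	}
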
@@ -4425,12 +4716,31 @@ wait_for_common(struct completion *x, long timeout, int state)
4425 return timeout; 4716 return timeout;
4426} 4717}
4427 4718
4719/**
4720 * wait_for_completion: - waits for completion of a task
4721 * @x: holds the state of this particular completion
4722 *
4723 * This waits to be signaled for completion of a specific task. It is NOT
4724 * interruptible and there is no timeout.
4725 *
 4726 * See also similar routines (e.g. wait_for_completion_timeout()) with timeout
4727 * and interrupt capability. Also see complete().
4728 */
4428void __sched wait_for_completion(struct completion *x) 4729void __sched wait_for_completion(struct completion *x)
4429{ 4730{
4430 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 4731 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4431} 4732}
4432EXPORT_SYMBOL(wait_for_completion); 4733EXPORT_SYMBOL(wait_for_completion);
4433 4734
4735/**
4736 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
4737 * @x: holds the state of this particular completion
4738 * @timeout: timeout value in jiffies
4739 *
 4740 * This waits for either the completion of a specific task to be signaled
 4741 * or for a specified timeout to expire. The timeout is in jiffies. It is
 4742 * not interruptible.
4743 */
4434unsigned long __sched 4744unsigned long __sched
4435wait_for_completion_timeout(struct completion *x, unsigned long timeout) 4745wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4436{ 4746{
@@ -4438,6 +4748,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4438} 4748}
4439EXPORT_SYMBOL(wait_for_completion_timeout); 4749EXPORT_SYMBOL(wait_for_completion_timeout);
4440 4750
4751/**
4752 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
4753 * @x: holds the state of this particular completion
4754 *
4755 * This waits for completion of a specific task to be signaled. It is
4756 * interruptible.
4757 */
4441int __sched wait_for_completion_interruptible(struct completion *x) 4758int __sched wait_for_completion_interruptible(struct completion *x)
4442{ 4759{
4443 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 4760 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
@@ -4447,6 +4764,14 @@ int __sched wait_for_completion_interruptible(struct completion *x)
4447} 4764}
4448EXPORT_SYMBOL(wait_for_completion_interruptible); 4765EXPORT_SYMBOL(wait_for_completion_interruptible);
4449 4766
4767/**
4768 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
4769 * @x: holds the state of this particular completion
4770 * @timeout: timeout value in jiffies
4771 *
 4772 * This waits for either the completion of a specific task to be signaled or
 4773 * for a specified timeout to expire. It is interruptible. The timeout is in jiffies.
4774 */
4450unsigned long __sched 4775unsigned long __sched
4451wait_for_completion_interruptible_timeout(struct completion *x, 4776wait_for_completion_interruptible_timeout(struct completion *x,
4452 unsigned long timeout) 4777 unsigned long timeout)
@@ -4455,6 +4780,13 @@ wait_for_completion_interruptible_timeout(struct completion *x,
4455} 4780}
4456EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 4781EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4457 4782
4783/**
4784 * wait_for_completion_killable: - waits for completion of a task (killable)
4785 * @x: holds the state of this particular completion
4786 *
4787 * This waits to be signaled for completion of a specific task. It can be
4788 * interrupted by a kill signal.
4789 */
4458int __sched wait_for_completion_killable(struct completion *x) 4790int __sched wait_for_completion_killable(struct completion *x)
4459{ 4791{
4460 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 4792 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
@@ -4464,6 +4796,52 @@ int __sched wait_for_completion_killable(struct completion *x)
4464} 4796}
4465EXPORT_SYMBOL(wait_for_completion_killable); 4797EXPORT_SYMBOL(wait_for_completion_killable);
4466 4798
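
A hypothetical driver fragment illustrating the complete()/wait_for_completion() pairing the new kernel-doc above describes; the my_dev structure and handler names are made up for the example.

	#include <linux/completion.h>
	#include <linux/interrupt.h>

	struct my_dev {				/* hypothetical device */
		struct completion ready;	/* init_completion() runs in probe */
	};

	static irqreturn_t my_dev_irq(int irq, void *data)
	{
		struct my_dev *dev = data;

		complete(&dev->ready);		/* wakes one waiter, in queue order */
		return IRQ_HANDLED;
	}

	static void my_dev_wait_ready(struct my_dev *dev)
	{
		/* uninterruptible, no timeout - see the _timeout, _interruptible
		 * and _killable variants documented above for alternatives */
		wait_for_completion(&dev->ready);
	}
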
4799/**
4800 * try_wait_for_completion - try to decrement a completion without blocking
4801 * @x: completion structure
4802 *
4803 * Returns: 0 if a decrement cannot be done without blocking
4804 * 1 if a decrement succeeded.
4805 *
4806 * If a completion is being used as a counting completion,
4807 * attempt to decrement the counter without blocking. This
4808 * enables us to avoid waiting if the resource the completion
4809 * is protecting is not available.
4810 */
4811bool try_wait_for_completion(struct completion *x)
4812{
4813 int ret = 1;
4814
4815 spin_lock_irq(&x->wait.lock);
4816 if (!x->done)
4817 ret = 0;
4818 else
4819 x->done--;
4820 spin_unlock_irq(&x->wait.lock);
4821 return ret;
4822}
4823EXPORT_SYMBOL(try_wait_for_completion);
4824
4825/**
4826 * completion_done - Test to see if a completion has any waiters
4827 * @x: completion structure
4828 *
4829 * Returns: 0 if there are waiters (wait_for_completion() in progress)
4830 * 1 if there are no waiters.
4831 *
4832 */
4833bool completion_done(struct completion *x)
4834{
4835 int ret = 1;
4836
4837 spin_lock_irq(&x->wait.lock);
4838 if (!x->done)
4839 ret = 0;
4840 spin_unlock_irq(&x->wait.lock);
4841 return ret;
4842}
4843EXPORT_SYMBOL(completion_done);
4844
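
A sketch of the counting-completion pattern try_wait_for_completion() is meant for, with made-up pool/slot names; each complete() adds one unit that a later try_wait can consume without blocking.

	#include <linux/completion.h>

	struct my_pool {			/* hypothetical resource pool */
		struct completion slot_free;	/* ->done counts released slots */
	};

	static bool my_pool_try_get(struct my_pool *pool)
	{
		/* consume one released slot if available; never sleeps */
		return try_wait_for_completion(&pool->slot_free);
	}

	static void my_pool_put(struct my_pool *pool)
	{
		complete(&pool->slot_free);	/* release a slot */
	}
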
4467static long __sched 4845static long __sched
4468sleep_on_common(wait_queue_head_t *q, int state, long timeout) 4846sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4469{ 4847{
@@ -4586,10 +4964,8 @@ void set_user_nice(struct task_struct *p, long nice)
4586 goto out_unlock; 4964 goto out_unlock;
4587 } 4965 }
4588 on_rq = p->se.on_rq; 4966 on_rq = p->se.on_rq;
4589 if (on_rq) { 4967 if (on_rq)
4590 dequeue_task(rq, p, 0); 4968 dequeue_task(rq, p, 0);
4591 dec_load(rq, p);
4592 }
4593 4969
4594 p->static_prio = NICE_TO_PRIO(nice); 4970 p->static_prio = NICE_TO_PRIO(nice);
4595 set_load_weight(p); 4971 set_load_weight(p);
@@ -4599,7 +4975,6 @@ void set_user_nice(struct task_struct *p, long nice)
4599 4975
4600 if (on_rq) { 4976 if (on_rq) {
4601 enqueue_task(rq, p, 0); 4977 enqueue_task(rq, p, 0);
4602 inc_load(rq, p);
4603 /* 4978 /*
4604 * If the task increased its priority or is running and 4979 * If the task increased its priority or is running and
4605 * lowered its priority, then reschedule its CPU: 4980 * lowered its priority, then reschedule its CPU:
@@ -4744,16 +5119,8 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4744 set_load_weight(p); 5119 set_load_weight(p);
4745} 5120}
4746 5121
4747/** 5122static int __sched_setscheduler(struct task_struct *p, int policy,
4748 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 5123 struct sched_param *param, bool user)
4749 * @p: the task in question.
4750 * @policy: new policy.
4751 * @param: structure containing the new RT priority.
4752 *
4753 * NOTE that the task may be already dead.
4754 */
4755int sched_setscheduler(struct task_struct *p, int policy,
4756 struct sched_param *param)
4757{ 5124{
4758 int retval, oldprio, oldpolicy = -1, on_rq, running; 5125 int retval, oldprio, oldpolicy = -1, on_rq, running;
4759 unsigned long flags; 5126 unsigned long flags;
@@ -4785,7 +5152,7 @@ recheck:
4785 /* 5152 /*
4786 * Allow unprivileged RT tasks to decrease priority: 5153 * Allow unprivileged RT tasks to decrease priority:
4787 */ 5154 */
4788 if (!capable(CAP_SYS_NICE)) { 5155 if (user && !capable(CAP_SYS_NICE)) {
4789 if (rt_policy(policy)) { 5156 if (rt_policy(policy)) {
4790 unsigned long rlim_rtprio; 5157 unsigned long rlim_rtprio;
4791 5158
@@ -4816,18 +5183,22 @@ recheck:
4816 return -EPERM; 5183 return -EPERM;
4817 } 5184 }
4818 5185
5186 if (user) {
4819#ifdef CONFIG_RT_GROUP_SCHED 5187#ifdef CONFIG_RT_GROUP_SCHED
4820 /* 5188 /*
4821 * Do not allow realtime tasks into groups that have no runtime 5189 * Do not allow realtime tasks into groups that have no runtime
4822 * assigned. 5190 * assigned.
4823 */ 5191 */
4824 if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) 5192 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4825 return -EPERM; 5193 task_group(p)->rt_bandwidth.rt_runtime == 0)
5194 return -EPERM;
4826#endif 5195#endif
4827 5196
4828 retval = security_task_setscheduler(p, policy, param); 5197 retval = security_task_setscheduler(p, policy, param);
4829 if (retval) 5198 if (retval)
4830 return retval; 5199 return retval;
5200 }
5201
4831 /* 5202 /*
4832 * make sure no PI-waiters arrive (or leave) while we are 5203 * make sure no PI-waiters arrive (or leave) while we are
4833 * changing the priority of the task: 5204 * changing the priority of the task:
@@ -4870,8 +5241,39 @@ recheck:
4870 5241
4871 return 0; 5242 return 0;
4872} 5243}
5244
5245/**
5246 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
5247 * @p: the task in question.
5248 * @policy: new policy.
5249 * @param: structure containing the new RT priority.
5250 *
5251 * NOTE that the task may be already dead.
5252 */
5253int sched_setscheduler(struct task_struct *p, int policy,
5254 struct sched_param *param)
5255{
5256 return __sched_setscheduler(p, policy, param, true);
5257}
4873EXPORT_SYMBOL_GPL(sched_setscheduler); 5258EXPORT_SYMBOL_GPL(sched_setscheduler);
4874 5259
5260/**
5261 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
5262 * @p: the task in question.
5263 * @policy: new policy.
5264 * @param: structure containing the new RT priority.
5265 *
5266 * Just like sched_setscheduler, only don't bother checking if the
5267 * current context has permission. For example, this is needed in
5268 * stop_machine(): we create temporary high priority worker threads,
5269 * but our caller might not have that capability.
5270 */
5271int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5272 struct sched_param *param)
5273{
5274 return __sched_setscheduler(p, policy, param, false);
5275}
5276
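
A hypothetical caller of the new sched_setscheduler_nocheck(), along the lines of the stop_machine() case mentioned in the comment; the worker function below is illustrative only.

	#include <linux/kthread.h>
	#include <linux/sched.h>

	static int my_rt_worker(void *unused)	/* hypothetical kthread */
	{
		struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };

		/* promote ourselves to SCHED_FIFO without the CAP_SYS_NICE check,
		 * since the creating context may lack that capability */
		sched_setscheduler_nocheck(current, SCHED_FIFO, &param);

		set_current_state(TASK_INTERRUPTIBLE);
		while (!kthread_should_stop()) {
			schedule();
			set_current_state(TASK_INTERRUPTIBLE);
		}
		__set_current_state(TASK_RUNNING);
		return 0;
	}
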
4875static int 5277static int
4876do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 5278do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4877{ 5279{
@@ -5070,24 +5472,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
5070 return sched_setaffinity(pid, &new_mask); 5472 return sched_setaffinity(pid, &new_mask);
5071} 5473}
5072 5474
5073/*
5074 * Represents all cpu's present in the system
5075 * In systems capable of hotplug, this map could dynamically grow
5076 * as new cpu's are detected in the system via any platform specific
5077 * method, such as ACPI for e.g.
5078 */
5079
5080cpumask_t cpu_present_map __read_mostly;
5081EXPORT_SYMBOL(cpu_present_map);
5082
5083#ifndef CONFIG_SMP
5084cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
5085EXPORT_SYMBOL(cpu_online_map);
5086
5087cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
5088EXPORT_SYMBOL(cpu_possible_map);
5089#endif
5090
5091long sched_getaffinity(pid_t pid, cpumask_t *mask) 5475long sched_getaffinity(pid_t pid, cpumask_t *mask)
5092{ 5476{
5093 struct task_struct *p; 5477 struct task_struct *p;
@@ -5384,7 +5768,7 @@ out_unlock:
5384 return retval; 5768 return retval;
5385} 5769}
5386 5770
5387static const char stat_nam[] = "RSDTtZX"; 5771static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5388 5772
5389void sched_show_task(struct task_struct *p) 5773void sched_show_task(struct task_struct *p)
5390{ 5774{
@@ -5525,6 +5909,8 @@ static inline void sched_init_granularity(void)
5525 sysctl_sched_latency = limit; 5909 sysctl_sched_latency = limit;
5526 5910
5527 sysctl_sched_wakeup_granularity *= factor; 5911 sysctl_sched_wakeup_granularity *= factor;
5912
5913 sysctl_sched_shares_ratelimit *= factor;
5528} 5914}
5529 5915
5530#ifdef CONFIG_SMP 5916#ifdef CONFIG_SMP
@@ -5566,6 +5952,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5566 goto out; 5952 goto out;
5567 } 5953 }
5568 5954
5955 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5956 !cpus_equal(p->cpus_allowed, *new_mask))) {
5957 ret = -EINVAL;
5958 goto out;
5959 }
5960
5569 if (p->sched_class->set_cpus_allowed) 5961 if (p->sched_class->set_cpus_allowed)
5570 p->sched_class->set_cpus_allowed(p, new_mask); 5962 p->sched_class->set_cpus_allowed(p, new_mask);
5571 else { 5963 else {
@@ -5608,7 +6000,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5608 struct rq *rq_dest, *rq_src; 6000 struct rq *rq_dest, *rq_src;
5609 int ret = 0, on_rq; 6001 int ret = 0, on_rq;
5610 6002
5611 if (unlikely(cpu_is_offline(dest_cpu))) 6003 if (unlikely(!cpu_active(dest_cpu)))
5612 return ret; 6004 return ret;
5613 6005
5614 rq_src = cpu_rq(src_cpu); 6006 rq_src = cpu_rq(src_cpu);
@@ -5617,10 +6009,10 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5617 double_rq_lock(rq_src, rq_dest); 6009 double_rq_lock(rq_src, rq_dest);
5618 /* Already moved. */ 6010 /* Already moved. */
5619 if (task_cpu(p) != src_cpu) 6011 if (task_cpu(p) != src_cpu)
5620 goto out; 6012 goto done;
5621 /* Affinity changed (again). */ 6013 /* Affinity changed (again). */
5622 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 6014 if (!cpu_isset(dest_cpu, p->cpus_allowed))
5623 goto out; 6015 goto fail;
5624 6016
5625 on_rq = p->se.on_rq; 6017 on_rq = p->se.on_rq;
5626 if (on_rq) 6018 if (on_rq)
@@ -5629,10 +6021,11 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5629 set_task_cpu(p, dest_cpu); 6021 set_task_cpu(p, dest_cpu);
5630 if (on_rq) { 6022 if (on_rq) {
5631 activate_task(rq_dest, p, 0); 6023 activate_task(rq_dest, p, 0);
5632 check_preempt_curr(rq_dest, p); 6024 check_preempt_curr(rq_dest, p, 0);
5633 } 6025 }
6026done:
5634 ret = 1; 6027 ret = 1;
5635out: 6028fail:
5636 double_rq_unlock(rq_src, rq_dest); 6029 double_rq_unlock(rq_src, rq_dest);
5637 return ret; 6030 return ret;
5638} 6031}
@@ -5882,6 +6275,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
5882 next = pick_next_task(rq, rq->curr); 6275 next = pick_next_task(rq, rq->curr);
5883 if (!next) 6276 if (!next)
5884 break; 6277 break;
6278 next->sched_class->put_prev_task(rq, next);
5885 migrate_dead(dead_cpu, next); 6279 migrate_dead(dead_cpu, next);
5886 6280
5887 } 6281 }
@@ -5952,7 +6346,7 @@ set_table_entry(struct ctl_table *entry,
5952static struct ctl_table * 6346static struct ctl_table *
5953sd_alloc_ctl_domain_table(struct sched_domain *sd) 6347sd_alloc_ctl_domain_table(struct sched_domain *sd)
5954{ 6348{
5955 struct ctl_table *table = sd_alloc_ctl_entry(12); 6349 struct ctl_table *table = sd_alloc_ctl_entry(13);
5956 6350
5957 if (table == NULL) 6351 if (table == NULL)
5958 return NULL; 6352 return NULL;
@@ -5980,7 +6374,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
5980 sizeof(int), 0644, proc_dointvec_minmax); 6374 sizeof(int), 0644, proc_dointvec_minmax);
5981 set_table_entry(&table[10], "flags", &sd->flags, 6375 set_table_entry(&table[10], "flags", &sd->flags,
5982 sizeof(int), 0644, proc_dointvec_minmax); 6376 sizeof(int), 0644, proc_dointvec_minmax);
5983 /* &table[11] is terminator */ 6377 set_table_entry(&table[11], "name", sd->name,
6378 CORENAME_MAX_SIZE, 0444, proc_dostring);
6379 /* &table[12] is terminator */
5984 6380
5985 return table; 6381 return table;
5986} 6382}
@@ -6053,6 +6449,36 @@ static void unregister_sched_domain_sysctl(void)
6053} 6449}
6054#endif 6450#endif
6055 6451
6452static void set_rq_online(struct rq *rq)
6453{
6454 if (!rq->online) {
6455 const struct sched_class *class;
6456
6457 cpu_set(rq->cpu, rq->rd->online);
6458 rq->online = 1;
6459
6460 for_each_class(class) {
6461 if (class->rq_online)
6462 class->rq_online(rq);
6463 }
6464 }
6465}
6466
6467static void set_rq_offline(struct rq *rq)
6468{
6469 if (rq->online) {
6470 const struct sched_class *class;
6471
6472 for_each_class(class) {
6473 if (class->rq_offline)
6474 class->rq_offline(rq);
6475 }
6476
6477 cpu_clear(rq->cpu, rq->rd->online);
6478 rq->online = 0;
6479 }
6480}
6481
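
A sketch of how a scheduling class is expected to hook the new rq_online/rq_offline notifications that replace the removed join_domain/leave_domain callbacks; the names and bodies are placeholders, not the actual sched_rt.c implementation.

	static void rq_online_example(struct rq *rq)
	{
		/* e.g. re-add rq->cpu to the root domain's per-class state */
	}

	static void rq_offline_example(struct rq *rq)
	{
		/* e.g. drop rq->cpu from that state before the CPU goes away */
	}

	static const struct sched_class example_sched_class = {
		/* ... the usual enqueue/dequeue/pick_next callbacks ... */
		.rq_online	= rq_online_example,
		.rq_offline	= rq_offline_example,
	};
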
6056/* 6482/*
6057 * migration_call - callback that gets triggered when a CPU is added. 6483 * migration_call - callback that gets triggered when a CPU is added.
6058 * Here we can start up the necessary migration thread for the new CPU. 6484 * Here we can start up the necessary migration thread for the new CPU.
@@ -6090,7 +6516,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6090 spin_lock_irqsave(&rq->lock, flags); 6516 spin_lock_irqsave(&rq->lock, flags);
6091 if (rq->rd) { 6517 if (rq->rd) {
6092 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6518 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6093 cpu_set(cpu, rq->rd->online); 6519
6520 set_rq_online(rq);
6094 } 6521 }
6095 spin_unlock_irqrestore(&rq->lock, flags); 6522 spin_unlock_irqrestore(&rq->lock, flags);
6096 break; 6523 break;
@@ -6151,7 +6578,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6151 spin_lock_irqsave(&rq->lock, flags); 6578 spin_lock_irqsave(&rq->lock, flags);
6152 if (rq->rd) { 6579 if (rq->rd) {
6153 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6580 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6154 cpu_clear(cpu, rq->rd->online); 6581 set_rq_offline(rq);
6155 } 6582 }
6156 spin_unlock_irqrestore(&rq->lock, flags); 6583 spin_unlock_irqrestore(&rq->lock, flags);
6157 break; 6584 break;
@@ -6168,7 +6595,7 @@ static struct notifier_block __cpuinitdata migration_notifier = {
6168 .priority = 10 6595 .priority = 10
6169}; 6596};
6170 6597
6171void __init migration_init(void) 6598static int __init migration_init(void)
6172{ 6599{
6173 void *cpu = (void *)(long)smp_processor_id(); 6600 void *cpu = (void *)(long)smp_processor_id();
6174 int err; 6601 int err;
@@ -6178,13 +6605,38 @@ void __init migration_init(void)
6178 BUG_ON(err == NOTIFY_BAD); 6605 BUG_ON(err == NOTIFY_BAD);
6179 migration_call(&migration_notifier, CPU_ONLINE, cpu); 6606 migration_call(&migration_notifier, CPU_ONLINE, cpu);
6180 register_cpu_notifier(&migration_notifier); 6607 register_cpu_notifier(&migration_notifier);
6608
6609 return err;
6181} 6610}
6611early_initcall(migration_init);
6182#endif 6612#endif
6183 6613
6184#ifdef CONFIG_SMP 6614#ifdef CONFIG_SMP
6185 6615
6186#ifdef CONFIG_SCHED_DEBUG 6616#ifdef CONFIG_SCHED_DEBUG
6187 6617
6618static inline const char *sd_level_to_string(enum sched_domain_level lvl)
6619{
6620 switch (lvl) {
6621 case SD_LV_NONE:
6622 return "NONE";
6623 case SD_LV_SIBLING:
6624 return "SIBLING";
6625 case SD_LV_MC:
6626 return "MC";
6627 case SD_LV_CPU:
6628 return "CPU";
6629 case SD_LV_NODE:
6630 return "NODE";
6631 case SD_LV_ALLNODES:
6632 return "ALLNODES";
6633 case SD_LV_MAX:
6634 return "MAX";
6635
6636 }
6637 return "MAX";
6638}
6639
6188static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6640static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6189 cpumask_t *groupmask) 6641 cpumask_t *groupmask)
6190{ 6642{
@@ -6204,7 +6656,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6204 return -1; 6656 return -1;
6205 } 6657 }
6206 6658
6207 printk(KERN_CONT "span %s\n", str); 6659 printk(KERN_CONT "span %s level %s\n",
6660 str, sd_level_to_string(sd->level));
6208 6661
6209 if (!cpu_isset(cpu, sd->span)) { 6662 if (!cpu_isset(cpu, sd->span)) {
6210 printk(KERN_ERR "ERROR: domain->span does not contain " 6663 printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6288,9 +6741,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6288 } 6741 }
6289 kfree(groupmask); 6742 kfree(groupmask);
6290} 6743}
6291#else 6744#else /* !CONFIG_SCHED_DEBUG */
6292# define sched_domain_debug(sd, cpu) do { } while (0) 6745# define sched_domain_debug(sd, cpu) do { } while (0)
6293#endif 6746#endif /* CONFIG_SCHED_DEBUG */
6294 6747
6295static int sd_degenerate(struct sched_domain *sd) 6748static int sd_degenerate(struct sched_domain *sd)
6296{ 6749{
@@ -6350,20 +6803,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6350static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6803static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6351{ 6804{
6352 unsigned long flags; 6805 unsigned long flags;
6353 const struct sched_class *class;
6354 6806
6355 spin_lock_irqsave(&rq->lock, flags); 6807 spin_lock_irqsave(&rq->lock, flags);
6356 6808
6357 if (rq->rd) { 6809 if (rq->rd) {
6358 struct root_domain *old_rd = rq->rd; 6810 struct root_domain *old_rd = rq->rd;
6359 6811
6360 for (class = sched_class_highest; class; class = class->next) { 6812 if (cpu_isset(rq->cpu, old_rd->online))
6361 if (class->leave_domain) 6813 set_rq_offline(rq);
6362 class->leave_domain(rq);
6363 }
6364 6814
6365 cpu_clear(rq->cpu, old_rd->span); 6815 cpu_clear(rq->cpu, old_rd->span);
6366 cpu_clear(rq->cpu, old_rd->online);
6367 6816
6368 if (atomic_dec_and_test(&old_rd->refcount)) 6817 if (atomic_dec_and_test(&old_rd->refcount))
6369 kfree(old_rd); 6818 kfree(old_rd);
@@ -6374,12 +6823,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6374 6823
6375 cpu_set(rq->cpu, rd->span); 6824 cpu_set(rq->cpu, rd->span);
6376 if (cpu_isset(rq->cpu, cpu_online_map)) 6825 if (cpu_isset(rq->cpu, cpu_online_map))
6377 cpu_set(rq->cpu, rd->online); 6826 set_rq_online(rq);
6378
6379 for (class = sched_class_highest; class; class = class->next) {
6380 if (class->join_domain)
6381 class->join_domain(rq);
6382 }
6383 6827
6384 spin_unlock_irqrestore(&rq->lock, flags); 6828 spin_unlock_irqrestore(&rq->lock, flags);
6385} 6829}
@@ -6390,6 +6834,8 @@ static void init_rootdomain(struct root_domain *rd)
6390 6834
6391 cpus_clear(rd->span); 6835 cpus_clear(rd->span);
6392 cpus_clear(rd->online); 6836 cpus_clear(rd->online);
6837
6838 cpupri_init(&rd->cpupri);
6393} 6839}
6394 6840
6395static void init_defrootdomain(void) 6841static void init_defrootdomain(void)
@@ -6451,7 +6897,8 @@ static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
6451/* Setup the mask of cpus configured for isolated domains */ 6897/* Setup the mask of cpus configured for isolated domains */
6452static int __init isolated_cpu_setup(char *str) 6898static int __init isolated_cpu_setup(char *str)
6453{ 6899{
6454 int ints[NR_CPUS], i; 6900 static int __initdata ints[NR_CPUS];
6901 int i;
6455 6902
6456 str = get_options(str, ARRAY_SIZE(ints), ints); 6903 str = get_options(str, ARRAY_SIZE(ints), ints);
6457 cpus_clear(cpu_isolated_map); 6904 cpus_clear(cpu_isolated_map);
@@ -6485,7 +6932,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
6485 6932
6486 cpus_clear(*covered); 6933 cpus_clear(*covered);
6487 6934
6488 for_each_cpu_mask(i, *span) { 6935 for_each_cpu_mask_nr(i, *span) {
6489 struct sched_group *sg; 6936 struct sched_group *sg;
6490 int group = group_fn(i, cpu_map, &sg, tmpmask); 6937 int group = group_fn(i, cpu_map, &sg, tmpmask);
6491 int j; 6938 int j;
@@ -6496,7 +6943,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
6496 cpus_clear(sg->cpumask); 6943 cpus_clear(sg->cpumask);
6497 sg->__cpu_power = 0; 6944 sg->__cpu_power = 0;
6498 6945
6499 for_each_cpu_mask(j, *span) { 6946 for_each_cpu_mask_nr(j, *span) {
6500 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 6947 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6501 continue; 6948 continue;
6502 6949
@@ -6532,9 +6979,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
6532 6979
6533 min_val = INT_MAX; 6980 min_val = INT_MAX;
6534 6981
6535 for (i = 0; i < MAX_NUMNODES; i++) { 6982 for (i = 0; i < nr_node_ids; i++) {
6536 /* Start at @node */ 6983 /* Start at @node */
6537 n = (node + i) % MAX_NUMNODES; 6984 n = (node + i) % nr_node_ids;
6538 6985
6539 if (!nr_cpus_node(n)) 6986 if (!nr_cpus_node(n))
6540 continue; 6987 continue;
@@ -6584,7 +7031,7 @@ static void sched_domain_node_span(int node, cpumask_t *span)
6584 cpus_or(*span, *span, *nodemask); 7031 cpus_or(*span, *span, *nodemask);
6585 } 7032 }
6586} 7033}
6587#endif 7034#endif /* CONFIG_NUMA */
6588 7035
6589int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 7036int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6590 7037
@@ -6603,7 +7050,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6603 *sg = &per_cpu(sched_group_cpus, cpu); 7050 *sg = &per_cpu(sched_group_cpus, cpu);
6604 return cpu; 7051 return cpu;
6605} 7052}
6606#endif 7053#endif /* CONFIG_SCHED_SMT */
6607 7054
6608/* 7055/*
6609 * multi-core sched-domains: 7056 * multi-core sched-domains:
@@ -6611,7 +7058,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6611#ifdef CONFIG_SCHED_MC 7058#ifdef CONFIG_SCHED_MC
6612static DEFINE_PER_CPU(struct sched_domain, core_domains); 7059static DEFINE_PER_CPU(struct sched_domain, core_domains);
6613static DEFINE_PER_CPU(struct sched_group, sched_group_core); 7060static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6614#endif 7061#endif /* CONFIG_SCHED_MC */
6615 7062
6616#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 7063#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6617static int 7064static int
@@ -6696,7 +7143,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
6696 if (!sg) 7143 if (!sg)
6697 return; 7144 return;
6698 do { 7145 do {
6699 for_each_cpu_mask(j, sg->cpumask) { 7146 for_each_cpu_mask_nr(j, sg->cpumask) {
6700 struct sched_domain *sd; 7147 struct sched_domain *sd;
6701 7148
6702 sd = &per_cpu(phys_domains, j); 7149 sd = &per_cpu(phys_domains, j);
@@ -6713,7 +7160,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
6713 sg = sg->next; 7160 sg = sg->next;
6714 } while (sg != group_head); 7161 } while (sg != group_head);
6715} 7162}
6716#endif 7163#endif /* CONFIG_NUMA */
6717 7164
6718#ifdef CONFIG_NUMA 7165#ifdef CONFIG_NUMA
6719/* Free memory allocated for various sched_group structures */ 7166/* Free memory allocated for various sched_group structures */
@@ -6721,14 +7168,14 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6721{ 7168{
6722 int cpu, i; 7169 int cpu, i;
6723 7170
6724 for_each_cpu_mask(cpu, *cpu_map) { 7171 for_each_cpu_mask_nr(cpu, *cpu_map) {
6725 struct sched_group **sched_group_nodes 7172 struct sched_group **sched_group_nodes
6726 = sched_group_nodes_bycpu[cpu]; 7173 = sched_group_nodes_bycpu[cpu];
6727 7174
6728 if (!sched_group_nodes) 7175 if (!sched_group_nodes)
6729 continue; 7176 continue;
6730 7177
6731 for (i = 0; i < MAX_NUMNODES; i++) { 7178 for (i = 0; i < nr_node_ids; i++) {
6732 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7179 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6733 7180
6734 *nodemask = node_to_cpumask(i); 7181 *nodemask = node_to_cpumask(i);
@@ -6750,11 +7197,11 @@ next_sg:
6750 sched_group_nodes_bycpu[cpu] = NULL; 7197 sched_group_nodes_bycpu[cpu] = NULL;
6751 } 7198 }
6752} 7199}
6753#else 7200#else /* !CONFIG_NUMA */
6754static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 7201static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6755{ 7202{
6756} 7203}
6757#endif 7204#endif /* CONFIG_NUMA */
6758 7205
6759/* 7206/*
6760 * Initialize sched groups cpu_power. 7207 * Initialize sched groups cpu_power.
@@ -6813,13 +7260,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6813 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 7260 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
6814 */ 7261 */
6815 7262
7263#ifdef CONFIG_SCHED_DEBUG
7264# define SD_INIT_NAME(sd, type) sd->name = #type
7265#else
7266# define SD_INIT_NAME(sd, type) do { } while (0)
7267#endif
7268
6816#define SD_INIT(sd, type) sd_init_##type(sd) 7269#define SD_INIT(sd, type) sd_init_##type(sd)
7270
6817#define SD_INIT_FUNC(type) \ 7271#define SD_INIT_FUNC(type) \
6818static noinline void sd_init_##type(struct sched_domain *sd) \ 7272static noinline void sd_init_##type(struct sched_domain *sd) \
6819{ \ 7273{ \
6820 memset(sd, 0, sizeof(*sd)); \ 7274 memset(sd, 0, sizeof(*sd)); \
6821 *sd = SD_##type##_INIT; \ 7275 *sd = SD_##type##_INIT; \
6822 sd->level = SD_LV_##type; \ 7276 sd->level = SD_LV_##type; \
7277 SD_INIT_NAME(sd, type); \
6823} 7278}
6824 7279
6825SD_INIT_FUNC(CPU) 7280SD_INIT_FUNC(CPU)
@@ -6921,7 +7376,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
6921 /* 7376 /*
6922 * Allocate the per-node list of sched groups 7377 * Allocate the per-node list of sched groups
6923 */ 7378 */
6924 sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), 7379 sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
6925 GFP_KERNEL); 7380 GFP_KERNEL);
6926 if (!sched_group_nodes) { 7381 if (!sched_group_nodes) {
6927 printk(KERN_WARNING "Can not alloc sched group node list\n"); 7382 printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -6960,7 +7415,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
6960 /* 7415 /*
6961 * Set up domains for cpus specified by the cpu_map. 7416 * Set up domains for cpus specified by the cpu_map.
6962 */ 7417 */
6963 for_each_cpu_mask(i, *cpu_map) { 7418 for_each_cpu_mask_nr(i, *cpu_map) {
6964 struct sched_domain *sd = NULL, *p; 7419 struct sched_domain *sd = NULL, *p;
6965 SCHED_CPUMASK_VAR(nodemask, allmasks); 7420 SCHED_CPUMASK_VAR(nodemask, allmasks);
6966 7421
@@ -7027,7 +7482,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7027 7482
7028#ifdef CONFIG_SCHED_SMT 7483#ifdef CONFIG_SCHED_SMT
7029 /* Set up CPU (sibling) groups */ 7484 /* Set up CPU (sibling) groups */
7030 for_each_cpu_mask(i, *cpu_map) { 7485 for_each_cpu_mask_nr(i, *cpu_map) {
7031 SCHED_CPUMASK_VAR(this_sibling_map, allmasks); 7486 SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
7032 SCHED_CPUMASK_VAR(send_covered, allmasks); 7487 SCHED_CPUMASK_VAR(send_covered, allmasks);
7033 7488
@@ -7044,7 +7499,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7044 7499
7045#ifdef CONFIG_SCHED_MC 7500#ifdef CONFIG_SCHED_MC
7046 /* Set up multi-core groups */ 7501 /* Set up multi-core groups */
7047 for_each_cpu_mask(i, *cpu_map) { 7502 for_each_cpu_mask_nr(i, *cpu_map) {
7048 SCHED_CPUMASK_VAR(this_core_map, allmasks); 7503 SCHED_CPUMASK_VAR(this_core_map, allmasks);
7049 SCHED_CPUMASK_VAR(send_covered, allmasks); 7504 SCHED_CPUMASK_VAR(send_covered, allmasks);
7050 7505
@@ -7060,7 +7515,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7060#endif 7515#endif
7061 7516
7062 /* Set up physical groups */ 7517 /* Set up physical groups */
7063 for (i = 0; i < MAX_NUMNODES; i++) { 7518 for (i = 0; i < nr_node_ids; i++) {
7064 SCHED_CPUMASK_VAR(nodemask, allmasks); 7519 SCHED_CPUMASK_VAR(nodemask, allmasks);
7065 SCHED_CPUMASK_VAR(send_covered, allmasks); 7520 SCHED_CPUMASK_VAR(send_covered, allmasks);
7066 7521
@@ -7084,7 +7539,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7084 send_covered, tmpmask); 7539 send_covered, tmpmask);
7085 } 7540 }
7086 7541
7087 for (i = 0; i < MAX_NUMNODES; i++) { 7542 for (i = 0; i < nr_node_ids; i++) {
7088 /* Set up node groups */ 7543 /* Set up node groups */
7089 struct sched_group *sg, *prev; 7544 struct sched_group *sg, *prev;
7090 SCHED_CPUMASK_VAR(nodemask, allmasks); 7545 SCHED_CPUMASK_VAR(nodemask, allmasks);
@@ -7111,7 +7566,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7111 goto error; 7566 goto error;
7112 } 7567 }
7113 sched_group_nodes[i] = sg; 7568 sched_group_nodes[i] = sg;
7114 for_each_cpu_mask(j, *nodemask) { 7569 for_each_cpu_mask_nr(j, *nodemask) {
7115 struct sched_domain *sd; 7570 struct sched_domain *sd;
7116 7571
7117 sd = &per_cpu(node_domains, j); 7572 sd = &per_cpu(node_domains, j);
@@ -7123,9 +7578,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7123 cpus_or(*covered, *covered, *nodemask); 7578 cpus_or(*covered, *covered, *nodemask);
7124 prev = sg; 7579 prev = sg;
7125 7580
7126 for (j = 0; j < MAX_NUMNODES; j++) { 7581 for (j = 0; j < nr_node_ids; j++) {
7127 SCHED_CPUMASK_VAR(notcovered, allmasks); 7582 SCHED_CPUMASK_VAR(notcovered, allmasks);
7128 int n = (i + j) % MAX_NUMNODES; 7583 int n = (i + j) % nr_node_ids;
7129 node_to_cpumask_ptr(pnodemask, n); 7584 node_to_cpumask_ptr(pnodemask, n);
7130 7585
7131 cpus_complement(*notcovered, *covered); 7586 cpus_complement(*notcovered, *covered);
@@ -7157,28 +7612,28 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7157 7612
7158 /* Calculate CPU power for physical packages and nodes */ 7613 /* Calculate CPU power for physical packages and nodes */
7159#ifdef CONFIG_SCHED_SMT 7614#ifdef CONFIG_SCHED_SMT
7160 for_each_cpu_mask(i, *cpu_map) { 7615 for_each_cpu_mask_nr(i, *cpu_map) {
7161 struct sched_domain *sd = &per_cpu(cpu_domains, i); 7616 struct sched_domain *sd = &per_cpu(cpu_domains, i);
7162 7617
7163 init_sched_groups_power(i, sd); 7618 init_sched_groups_power(i, sd);
7164 } 7619 }
7165#endif 7620#endif
7166#ifdef CONFIG_SCHED_MC 7621#ifdef CONFIG_SCHED_MC
7167 for_each_cpu_mask(i, *cpu_map) { 7622 for_each_cpu_mask_nr(i, *cpu_map) {
7168 struct sched_domain *sd = &per_cpu(core_domains, i); 7623 struct sched_domain *sd = &per_cpu(core_domains, i);
7169 7624
7170 init_sched_groups_power(i, sd); 7625 init_sched_groups_power(i, sd);
7171 } 7626 }
7172#endif 7627#endif
7173 7628
7174 for_each_cpu_mask(i, *cpu_map) { 7629 for_each_cpu_mask_nr(i, *cpu_map) {
7175 struct sched_domain *sd = &per_cpu(phys_domains, i); 7630 struct sched_domain *sd = &per_cpu(phys_domains, i);
7176 7631
7177 init_sched_groups_power(i, sd); 7632 init_sched_groups_power(i, sd);
7178 } 7633 }
7179 7634
7180#ifdef CONFIG_NUMA 7635#ifdef CONFIG_NUMA
7181 for (i = 0; i < MAX_NUMNODES; i++) 7636 for (i = 0; i < nr_node_ids; i++)
7182 init_numa_sched_groups_power(sched_group_nodes[i]); 7637 init_numa_sched_groups_power(sched_group_nodes[i]);
7183 7638
7184 if (sd_allnodes) { 7639 if (sd_allnodes) {
@@ -7191,7 +7646,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7191#endif 7646#endif
7192 7647
7193 /* Attach the domains */ 7648 /* Attach the domains */
7194 for_each_cpu_mask(i, *cpu_map) { 7649 for_each_cpu_mask_nr(i, *cpu_map) {
7195 struct sched_domain *sd; 7650 struct sched_domain *sd;
7196#ifdef CONFIG_SCHED_SMT 7651#ifdef CONFIG_SCHED_SMT
7197 sd = &per_cpu(cpu_domains, i); 7652 sd = &per_cpu(cpu_domains, i);
@@ -7236,18 +7691,6 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
7236} 7691}
7237 7692
7238/* 7693/*
7239 * Free current domain masks.
7240 * Called after all cpus are attached to NULL domain.
7241 */
7242static void free_sched_domains(void)
7243{
7244 ndoms_cur = 0;
7245 if (doms_cur != &fallback_doms)
7246 kfree(doms_cur);
7247 doms_cur = &fallback_doms;
7248}
7249
7250/*
7251 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 7694 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
7252 * For now this just excludes isolated cpus, but could be used to 7695 * For now this just excludes isolated cpus, but could be used to
7253 * exclude other special cases in the future. 7696 * exclude other special cases in the future.
@@ -7286,7 +7729,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
7286 7729
7287 unregister_sched_domain_sysctl(); 7730 unregister_sched_domain_sysctl();
7288 7731
7289 for_each_cpu_mask(i, *cpu_map) 7732 for_each_cpu_mask_nr(i, *cpu_map)
7290 cpu_attach_domain(NULL, &def_root_domain, i); 7733 cpu_attach_domain(NULL, &def_root_domain, i);
7291 synchronize_sched(); 7734 synchronize_sched();
7292 arch_destroy_sched_domains(cpu_map, &tmpmask); 7735 arch_destroy_sched_domains(cpu_map, &tmpmask);
@@ -7325,30 +7768,29 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7325 * ownership of it and will kfree it when done with it. If the caller 7768 * ownership of it and will kfree it when done with it. If the caller
7326 * failed the kmalloc call, then it can pass in doms_new == NULL, 7769 * failed the kmalloc call, then it can pass in doms_new == NULL,
7327 * and partition_sched_domains() will fallback to the single partition 7770 * and partition_sched_domains() will fallback to the single partition
7328 * 'fallback_doms'. 7771 * 'fallback_doms', it also forces the domains to be rebuilt.
7772 *
7773 * If doms_new==NULL it will be replaced with cpu_online_map.
7774 * ndoms_new==0 is a special case for destroying existing domains.
7775 * It will not create the default domain.
7329 * 7776 *
7330 * Call with hotplug lock held 7777 * Call with hotplug lock held
7331 */ 7778 */
7332void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, 7779void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7333 struct sched_domain_attr *dattr_new) 7780 struct sched_domain_attr *dattr_new)
7334{ 7781{
7335 int i, j; 7782 int i, j, n;
7336 7783
7337 mutex_lock(&sched_domains_mutex); 7784 mutex_lock(&sched_domains_mutex);
7338 7785
7339 /* always unregister in case we don't destroy any domains */ 7786 /* always unregister in case we don't destroy any domains */
7340 unregister_sched_domain_sysctl(); 7787 unregister_sched_domain_sysctl();
7341 7788
7342 if (doms_new == NULL) { 7789 n = doms_new ? ndoms_new : 0;
7343 ndoms_new = 1;
7344 doms_new = &fallback_doms;
7345 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7346 dattr_new = NULL;
7347 }
7348 7790
7349 /* Destroy deleted domains */ 7791 /* Destroy deleted domains */
7350 for (i = 0; i < ndoms_cur; i++) { 7792 for (i = 0; i < ndoms_cur; i++) {
7351 for (j = 0; j < ndoms_new; j++) { 7793 for (j = 0; j < n; j++) {
7352 if (cpus_equal(doms_cur[i], doms_new[j]) 7794 if (cpus_equal(doms_cur[i], doms_new[j])
7353 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7795 && dattrs_equal(dattr_cur, i, dattr_new, j))
7354 goto match1; 7796 goto match1;
@@ -7359,6 +7801,13 @@ match1:
7359 ; 7801 ;
7360 } 7802 }
7361 7803
7804 if (doms_new == NULL) {
7805 ndoms_cur = 0;
7806 doms_new = &fallback_doms;
7807 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7808 dattr_new = NULL;
7809 }
7810
7362 /* Build new domains */ 7811 /* Build new domains */
7363 for (i = 0; i < ndoms_new; i++) { 7812 for (i = 0; i < ndoms_new; i++) {
7364 for (j = 0; j < ndoms_cur; j++) { 7813 for (j = 0; j < ndoms_cur; j++) {
@@ -7389,17 +7838,15 @@ match2:
7389#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7838#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7390int arch_reinit_sched_domains(void) 7839int arch_reinit_sched_domains(void)
7391{ 7840{
7392 int err;
7393
7394 get_online_cpus(); 7841 get_online_cpus();
7395 mutex_lock(&sched_domains_mutex); 7842
7396 detach_destroy_domains(&cpu_online_map); 7843 /* Destroy domains first to force the rebuild */
7397 free_sched_domains(); 7844 partition_sched_domains(0, NULL, NULL);
7398 err = arch_init_sched_domains(&cpu_online_map); 7845
7399 mutex_unlock(&sched_domains_mutex); 7846 rebuild_sched_domains();
7400 put_online_cpus(); 7847 put_online_cpus();
7401 7848
7402 return err; 7849 return 0;
7403} 7850}
7404 7851
7405static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 7852static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
@@ -7420,30 +7867,34 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7420} 7867}
7421 7868
7422#ifdef CONFIG_SCHED_MC 7869#ifdef CONFIG_SCHED_MC
7423static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) 7870static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7871 char *page)
7424{ 7872{
7425 return sprintf(page, "%u\n", sched_mc_power_savings); 7873 return sprintf(page, "%u\n", sched_mc_power_savings);
7426} 7874}
7427static ssize_t sched_mc_power_savings_store(struct sys_device *dev, 7875static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7428 const char *buf, size_t count) 7876 const char *buf, size_t count)
7429{ 7877{
7430 return sched_power_savings_store(buf, count, 0); 7878 return sched_power_savings_store(buf, count, 0);
7431} 7879}
7432static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, 7880static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
7433 sched_mc_power_savings_store); 7881 sched_mc_power_savings_show,
7882 sched_mc_power_savings_store);
7434#endif 7883#endif
7435 7884
7436#ifdef CONFIG_SCHED_SMT 7885#ifdef CONFIG_SCHED_SMT
7437static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) 7886static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7887 char *page)
7438{ 7888{
7439 return sprintf(page, "%u\n", sched_smt_power_savings); 7889 return sprintf(page, "%u\n", sched_smt_power_savings);
7440} 7890}
7441static ssize_t sched_smt_power_savings_store(struct sys_device *dev, 7891static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7442 const char *buf, size_t count) 7892 const char *buf, size_t count)
7443{ 7893{
7444 return sched_power_savings_store(buf, count, 1); 7894 return sched_power_savings_store(buf, count, 1);
7445} 7895}
7446static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, 7896static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
7897 sched_smt_power_savings_show,
7447 sched_smt_power_savings_store); 7898 sched_smt_power_savings_store);
7448#endif 7899#endif
7449 7900
@@ -7463,54 +7914,51 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7463#endif 7914#endif
7464 return err; 7915 return err;
7465} 7916}
7466#endif 7917#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7467 7918
7919#ifndef CONFIG_CPUSETS
7468/* 7920/*
7469 * Force a reinitialization of the sched domains hierarchy. The domains 7921 * Add online and remove offline CPUs from the scheduler domains.
7470 * and groups cannot be updated in place without racing with the balancing 7922 * When cpusets are enabled they take over this function.
7471 * code, so we temporarily attach all running cpus to the NULL domain
7472 * which will prevent rebalancing while the sched domains are recalculated.
7473 */ 7923 */
7474static int update_sched_domains(struct notifier_block *nfb, 7924static int update_sched_domains(struct notifier_block *nfb,
7475 unsigned long action, void *hcpu) 7925 unsigned long action, void *hcpu)
7476{ 7926{
7477 switch (action) { 7927 switch (action) {
7478 case CPU_UP_PREPARE: 7928 case CPU_ONLINE:
7479 case CPU_UP_PREPARE_FROZEN: 7929 case CPU_ONLINE_FROZEN:
7930 case CPU_DEAD:
7931 case CPU_DEAD_FROZEN:
7932 partition_sched_domains(1, NULL, NULL);
7933 return NOTIFY_OK;
7934
7935 default:
7936 return NOTIFY_DONE;
7937 }
7938}
7939#endif
7940
7941static int update_runtime(struct notifier_block *nfb,
7942 unsigned long action, void *hcpu)
7943{
7944 int cpu = (int)(long)hcpu;
7945
7946 switch (action) {
7480 case CPU_DOWN_PREPARE: 7947 case CPU_DOWN_PREPARE:
7481 case CPU_DOWN_PREPARE_FROZEN: 7948 case CPU_DOWN_PREPARE_FROZEN:
7482 detach_destroy_domains(&cpu_online_map); 7949 disable_runtime(cpu_rq(cpu));
7483 free_sched_domains();
7484 return NOTIFY_OK; 7950 return NOTIFY_OK;
7485 7951
7486 case CPU_UP_CANCELED:
7487 case CPU_UP_CANCELED_FROZEN:
7488 case CPU_DOWN_FAILED: 7952 case CPU_DOWN_FAILED:
7489 case CPU_DOWN_FAILED_FROZEN: 7953 case CPU_DOWN_FAILED_FROZEN:
7490 case CPU_ONLINE: 7954 case CPU_ONLINE:
7491 case CPU_ONLINE_FROZEN: 7955 case CPU_ONLINE_FROZEN:
7492 case CPU_DEAD: 7956 enable_runtime(cpu_rq(cpu));
7493 case CPU_DEAD_FROZEN: 7957 return NOTIFY_OK;
7494 /* 7958
7495 * Fall through and re-initialise the domains.
7496 */
7497 break;
7498 default: 7959 default:
7499 return NOTIFY_DONE; 7960 return NOTIFY_DONE;
7500 } 7961 }
7501
7502#ifndef CONFIG_CPUSETS
7503 /*
7504 * Create default domain partitioning if cpusets are disabled.
7505 * Otherwise we let cpusets rebuild the domains based on the
7506 * current setup.
7507 */
7508
7509 /* The hotplug lock is already held by cpu_up/cpu_down */
7510 arch_init_sched_domains(&cpu_online_map);
7511#endif
7512
7513 return NOTIFY_OK;
7514} 7962}
7515 7963
7516void __init sched_init_smp(void) 7964void __init sched_init_smp(void)
@@ -7530,8 +7978,15 @@ void __init sched_init_smp(void)
7530 cpu_set(smp_processor_id(), non_isolated_cpus); 7978 cpu_set(smp_processor_id(), non_isolated_cpus);
7531 mutex_unlock(&sched_domains_mutex); 7979 mutex_unlock(&sched_domains_mutex);
7532 put_online_cpus(); 7980 put_online_cpus();
7981
7982#ifndef CONFIG_CPUSETS
7533 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7983 /* XXX: Theoretical race here - CPU may be hotplugged now */
7534 hotcpu_notifier(update_sched_domains, 0); 7984 hotcpu_notifier(update_sched_domains, 0);
7985#endif
7986
7987 /* RT runtime code needs to handle some hotplug events */
7988 hotcpu_notifier(update_runtime, 0);
7989
7535 init_hrtick(); 7990 init_hrtick();
7536 7991
7537 /* Move init over to a non-isolated CPU */ 7992 /* Move init over to a non-isolated CPU */
@@ -7688,8 +8143,8 @@ void __init sched_init(void)
7688 8143
7689 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 8144 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7690 ptr += nr_cpu_ids * sizeof(void **); 8145 ptr += nr_cpu_ids * sizeof(void **);
7691#endif 8146#endif /* CONFIG_USER_SCHED */
7692#endif 8147#endif /* CONFIG_FAIR_GROUP_SCHED */
7693#ifdef CONFIG_RT_GROUP_SCHED 8148#ifdef CONFIG_RT_GROUP_SCHED
7694 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 8149 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
7695 ptr += nr_cpu_ids * sizeof(void **); 8150 ptr += nr_cpu_ids * sizeof(void **);
@@ -7703,8 +8158,8 @@ void __init sched_init(void)
7703 8158
7704 root_task_group.rt_rq = (struct rt_rq **)ptr; 8159 root_task_group.rt_rq = (struct rt_rq **)ptr;
7705 ptr += nr_cpu_ids * sizeof(void **); 8160 ptr += nr_cpu_ids * sizeof(void **);
7706#endif 8161#endif /* CONFIG_USER_SCHED */
7707#endif 8162#endif /* CONFIG_RT_GROUP_SCHED */
7708 } 8163 }
7709 8164
7710#ifdef CONFIG_SMP 8165#ifdef CONFIG_SMP
@@ -7720,8 +8175,8 @@ void __init sched_init(void)
7720#ifdef CONFIG_USER_SCHED 8175#ifdef CONFIG_USER_SCHED
7721 init_rt_bandwidth(&root_task_group.rt_bandwidth, 8176 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7722 global_rt_period(), RUNTIME_INF); 8177 global_rt_period(), RUNTIME_INF);
7723#endif 8178#endif /* CONFIG_USER_SCHED */
7724#endif 8179#endif /* CONFIG_RT_GROUP_SCHED */
7725 8180
7726#ifdef CONFIG_GROUP_SCHED 8181#ifdef CONFIG_GROUP_SCHED
7727 list_add(&init_task_group.list, &task_groups); 8182 list_add(&init_task_group.list, &task_groups);
@@ -7731,15 +8186,14 @@ void __init sched_init(void)
7731 INIT_LIST_HEAD(&root_task_group.children); 8186 INIT_LIST_HEAD(&root_task_group.children);
7732 init_task_group.parent = &root_task_group; 8187 init_task_group.parent = &root_task_group;
7733 list_add(&init_task_group.siblings, &root_task_group.children); 8188 list_add(&init_task_group.siblings, &root_task_group.children);
7734#endif 8189#endif /* CONFIG_USER_SCHED */
7735#endif 8190#endif /* CONFIG_GROUP_SCHED */
7736 8191
7737 for_each_possible_cpu(i) { 8192 for_each_possible_cpu(i) {
7738 struct rq *rq; 8193 struct rq *rq;
7739 8194
7740 rq = cpu_rq(i); 8195 rq = cpu_rq(i);
7741 spin_lock_init(&rq->lock); 8196 spin_lock_init(&rq->lock);
7742 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
7743 rq->nr_running = 0; 8197 rq->nr_running = 0;
7744 init_cfs_rq(&rq->cfs, rq); 8198 init_cfs_rq(&rq->cfs, rq);
7745 init_rt_rq(&rq->rt, rq); 8199 init_rt_rq(&rq->rt, rq);
@@ -7812,6 +8266,7 @@ void __init sched_init(void)
7812 rq->next_balance = jiffies; 8266 rq->next_balance = jiffies;
7813 rq->push_cpu = 0; 8267 rq->push_cpu = 0;
7814 rq->cpu = i; 8268 rq->cpu = i;
8269 rq->online = 0;
7815 rq->migration_thread = NULL; 8270 rq->migration_thread = NULL;
7816 INIT_LIST_HEAD(&rq->migration_queue); 8271 INIT_LIST_HEAD(&rq->migration_queue);
7817 rq_attach_root(rq, &def_root_domain); 8272 rq_attach_root(rq, &def_root_domain);
@@ -7827,7 +8282,7 @@ void __init sched_init(void)
7827#endif 8282#endif
7828 8283
7829#ifdef CONFIG_SMP 8284#ifdef CONFIG_SMP
7830 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); 8285 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
7831#endif 8286#endif
7832 8287
7833#ifdef CONFIG_RT_MUTEXES 8288#ifdef CONFIG_RT_MUTEXES
@@ -7861,20 +8316,25 @@ void __might_sleep(char *file, int line)
7861#ifdef in_atomic 8316#ifdef in_atomic
7862 static unsigned long prev_jiffy; /* ratelimiting */ 8317 static unsigned long prev_jiffy; /* ratelimiting */
7863 8318
7864 if ((in_atomic() || irqs_disabled()) && 8319 if ((!in_atomic() && !irqs_disabled()) ||
7865 system_state == SYSTEM_RUNNING && !oops_in_progress) { 8320 system_state != SYSTEM_RUNNING || oops_in_progress)
7866 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 8321 return;
7867 return; 8322 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7868 prev_jiffy = jiffies; 8323 return;
7869 printk(KERN_ERR "BUG: sleeping function called from invalid" 8324 prev_jiffy = jiffies;
7870 " context at %s:%d\n", file, line); 8325
7871 printk("in_atomic():%d, irqs_disabled():%d\n", 8326 printk(KERN_ERR
7872 in_atomic(), irqs_disabled()); 8327 "BUG: sleeping function called from invalid context at %s:%d\n",
7873 debug_show_held_locks(current); 8328 file, line);
7874 if (irqs_disabled()) 8329 printk(KERN_ERR
7875 print_irqtrace_events(current); 8330 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7876 dump_stack(); 8331 in_atomic(), irqs_disabled(),
7877 } 8332 current->pid, current->comm);
8333
8334 debug_show_held_locks(current);
8335 if (irqs_disabled())
8336 print_irqtrace_events(current);
8337 dump_stack();
7878#endif 8338#endif
7879} 8339}
7880EXPORT_SYMBOL(__might_sleep); 8340EXPORT_SYMBOL(__might_sleep);
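The rewritten __might_sleep() replaces the nested conditional with early-return guards and keeps the jiffies-based ratelimit of roughly one warning per HZ. The sketch below reproduces only that ratelimit shape in userspace, assuming a fake tick counter; demo_time_before() mirrors the wrap-safe comparison used by the kernel's time_before(), and all _demo/DEMO_ names are invented here.

#include <stdio.h>

/* Wrap-safe "a is before b" comparison in the spirit of time_before(). */
#define demo_time_before(a, b) ((long)((a) - (b)) < 0)

#define DEMO_HZ 100UL

static unsigned long prev_tick;   /* last tick at which we actually warned */

static void warn_ratelimited(unsigned long now_tick, const char *msg)
{
	/* Early-return guard, as in the restructured __might_sleep(). */
	if (demo_time_before(now_tick, prev_tick + DEMO_HZ) && prev_tick)
		return;                       /* already warned in this window */
	prev_tick = now_tick;

	printf("BUG-style warning: %s (tick %lu)\n", msg, now_tick);
}

int main(void)
{
	unsigned long tick;

	/* Prints at ticks 1, 101 and 201 only. */
	for (tick = 1; tick <= 250; tick++)
		warn_ratelimited(tick, "sleeping call from invalid context");
	return 0;
}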
@@ -8051,7 +8511,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8051{ 8511{
8052 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8512 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
8053} 8513}
8054#else 8514#else /* !CONFIG_FAIR_GROUP_SCHED */
8055static inline void free_fair_sched_group(struct task_group *tg) 8515static inline void free_fair_sched_group(struct task_group *tg)
8056{ 8516{
8057} 8517}
@@ -8069,7 +8529,7 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8069static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8529static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8070{ 8530{
8071} 8531}
8072#endif 8532#endif /* CONFIG_FAIR_GROUP_SCHED */
8073 8533
8074#ifdef CONFIG_RT_GROUP_SCHED 8534#ifdef CONFIG_RT_GROUP_SCHED
8075static void free_rt_sched_group(struct task_group *tg) 8535static void free_rt_sched_group(struct task_group *tg)
@@ -8140,7 +8600,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8140{ 8600{
8141 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); 8601 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8142} 8602}
8143#else 8603#else /* !CONFIG_RT_GROUP_SCHED */
8144static inline void free_rt_sched_group(struct task_group *tg) 8604static inline void free_rt_sched_group(struct task_group *tg)
8145{ 8605{
8146} 8606}
@@ -8158,7 +8618,7 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8158static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 8618static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8159{ 8619{
8160} 8620}
8161#endif 8621#endif /* CONFIG_RT_GROUP_SCHED */
8162 8622
8163#ifdef CONFIG_GROUP_SCHED 8623#ifdef CONFIG_GROUP_SCHED
8164static void free_sched_group(struct task_group *tg) 8624static void free_sched_group(struct task_group *tg)
@@ -8195,8 +8655,8 @@ struct task_group *sched_create_group(struct task_group *parent)
8195 WARN_ON(!parent); /* root should already exist */ 8655 WARN_ON(!parent); /* root should already exist */
8196 8656
8197 tg->parent = parent; 8657 tg->parent = parent;
8198 list_add_rcu(&tg->siblings, &parent->children);
8199 INIT_LIST_HEAD(&tg->children); 8658 INIT_LIST_HEAD(&tg->children);
8659 list_add_rcu(&tg->siblings, &parent->children);
8200 spin_unlock_irqrestore(&task_group_lock, flags); 8660 spin_unlock_irqrestore(&task_group_lock, flags);
8201 8661
8202 return tg; 8662 return tg;
@@ -8269,17 +8729,14 @@ void sched_move_task(struct task_struct *tsk)
8269 8729
8270 task_rq_unlock(rq, &flags); 8730 task_rq_unlock(rq, &flags);
8271} 8731}
8272#endif 8732#endif /* CONFIG_GROUP_SCHED */
8273 8733
8274#ifdef CONFIG_FAIR_GROUP_SCHED 8734#ifdef CONFIG_FAIR_GROUP_SCHED
8275static void set_se_shares(struct sched_entity *se, unsigned long shares) 8735static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8276{ 8736{
8277 struct cfs_rq *cfs_rq = se->cfs_rq; 8737 struct cfs_rq *cfs_rq = se->cfs_rq;
8278 struct rq *rq = cfs_rq->rq;
8279 int on_rq; 8738 int on_rq;
8280 8739
8281 spin_lock_irq(&rq->lock);
8282
8283 on_rq = se->on_rq; 8740 on_rq = se->on_rq;
8284 if (on_rq) 8741 if (on_rq)
8285 dequeue_entity(cfs_rq, se, 0); 8742 dequeue_entity(cfs_rq, se, 0);
@@ -8289,8 +8746,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
8289 8746
8290 if (on_rq) 8747 if (on_rq)
8291 enqueue_entity(cfs_rq, se, 0); 8748 enqueue_entity(cfs_rq, se, 0);
8749}
8292 8750
8293 spin_unlock_irq(&rq->lock); 8751static void set_se_shares(struct sched_entity *se, unsigned long shares)
8752{
8753 struct cfs_rq *cfs_rq = se->cfs_rq;
8754 struct rq *rq = cfs_rq->rq;
8755 unsigned long flags;
8756
8757 spin_lock_irqsave(&rq->lock, flags);
8758 __set_se_shares(se, shares);
8759 spin_unlock_irqrestore(&rq->lock, flags);
8294} 8760}
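set_se_shares() is split here into __set_se_shares(), which assumes the runqueue lock is already held, and a plain-named wrapper that takes rq->lock with spin_lock_irqsave() around it. A minimal pthread sketch of that double-underscore convention follows; demo_rq and the _demo names are illustrative, and the mutex merely stands in for the IRQ-safe spinlock.

#include <pthread.h>
#include <stdio.h>

struct demo_rq {
	pthread_mutex_t lock;
	unsigned long shares;
};

static void __demo_set_shares(struct demo_rq *rq, unsigned long shares)
{
	/* lock must already be held by the caller */
	rq->shares = shares;
}

static void demo_set_shares(struct demo_rq *rq, unsigned long shares)
{
	pthread_mutex_lock(&rq->lock);     /* stands in for spin_lock_irqsave() */
	__demo_set_shares(rq, shares);
	pthread_mutex_unlock(&rq->lock);
}

int main(void)
{
	struct demo_rq rq = { PTHREAD_MUTEX_INITIALIZER, 1024 };

	demo_set_shares(&rq, 2048);
	printf("shares now %lu\n", rq.shares);
	return 0;
}

The split lets callers that already hold rq->lock (or that batch several updates under one acquisition) use the unlocked helper directly.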
8295 8761
8296static DEFINE_MUTEX(shares_mutex); 8762static DEFINE_MUTEX(shares_mutex);
@@ -8329,8 +8795,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8329 * w/o tripping rebalance_share or load_balance_fair. 8795 * w/o tripping rebalance_share or load_balance_fair.
8330 */ 8796 */
8331 tg->shares = shares; 8797 tg->shares = shares;
8332 for_each_possible_cpu(i) 8798 for_each_possible_cpu(i) {
8799 /*
8800 * force a rebalance
8801 */
8802 cfs_rq_set_shares(tg->cfs_rq[i], 0);
8333 set_se_shares(tg->se[i], shares); 8803 set_se_shares(tg->se[i], shares);
8804 }
8334 8805
8335 /* 8806 /*
8336 * Enable load balance activity on this group, by inserting it back on 8807 * Enable load balance activity on this group, by inserting it back on
@@ -8361,73 +8832,95 @@ static DEFINE_MUTEX(rt_constraints_mutex);
8361static unsigned long to_ratio(u64 period, u64 runtime) 8832static unsigned long to_ratio(u64 period, u64 runtime)
8362{ 8833{
8363 if (runtime == RUNTIME_INF) 8834 if (runtime == RUNTIME_INF)
8364 return 1ULL << 16; 8835 return 1ULL << 20;
8365 8836
8366 return div64_u64(runtime << 16, period); 8837 return div64_u64(runtime << 20, period);
8367} 8838}
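to_ratio() now scales by 2^20 instead of 2^16, so the runtime/period fraction is kept with finer fixed-point resolution. The userspace sketch below mirrors that arithmetic under the assumption of a nonzero period (the patch separately rejects a zero rt_period); DEMO_RUNTIME_INF and the demo_ prefix are stand-ins, not kernel symbols.

#include <inttypes.h>
#include <stdio.h>

#define DEMO_RUNTIME_INF ((uint64_t)~0ULL)   /* stand-in for RUNTIME_INF */

/* Fixed-point runtime/period ratio scaled by 2^20, as in the updated
 * to_ratio(); period must be nonzero. */
static unsigned long demo_to_ratio(uint64_t period, uint64_t runtime)
{
	if (runtime == DEMO_RUNTIME_INF)
		return 1UL << 20;                 /* treated as "100%" */

	return (unsigned long)((runtime << 20) / period);
}

int main(void)
{
	/* The stock defaults: 0.95s of RT runtime out of a 1s period. */
	uint64_t period  = 1000000000ULL;     /* 1s in ns */
	uint64_t runtime =  950000000ULL;     /* 950ms in ns */
	unsigned long r = demo_to_ratio(period, runtime);

	printf("ratio = %lu / %lu (~%.4f)\n",
	       r, 1UL << 20, (double)r / (1 << 20));
	return 0;
}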
8368 8839
8369#ifdef CONFIG_CGROUP_SCHED 8840/* Must be called with tasklist_lock held */
8370static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8841static inline int tg_has_rt_tasks(struct task_group *tg)
8371{ 8842{
8372 struct task_group *tgi, *parent = tg ? tg->parent : NULL; 8843 struct task_struct *g, *p;
8373 unsigned long total = 0;
8374 8844
8375 if (!parent) { 8845 do_each_thread(g, p) {
8376 if (global_rt_period() < period) 8846 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
8377 return 0; 8847 return 1;
8848 } while_each_thread(g, p);
8378 8849
8379 return to_ratio(period, runtime) < 8850 return 0;
8380 to_ratio(global_rt_period(), global_rt_runtime()); 8851}
8381 }
8382 8852
8383 if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) 8853struct rt_schedulable_data {
8384 return 0; 8854 struct task_group *tg;
8855 u64 rt_period;
8856 u64 rt_runtime;
8857};
8385 8858
8386 rcu_read_lock(); 8859static int tg_schedulable(struct task_group *tg, void *data)
8387 list_for_each_entry_rcu(tgi, &parent->children, siblings) { 8860{
8388 if (tgi == tg) 8861 struct rt_schedulable_data *d = data;
8389 continue; 8862 struct task_group *child;
8863 unsigned long total, sum = 0;
8864 u64 period, runtime;
8865
8866 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8867 runtime = tg->rt_bandwidth.rt_runtime;
8390 8868
8391 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), 8869 if (tg == d->tg) {
8392 tgi->rt_bandwidth.rt_runtime); 8870 period = d->rt_period;
8871 runtime = d->rt_runtime;
8393 } 8872 }
8394 rcu_read_unlock();
8395 8873
8396 return total + to_ratio(period, runtime) < 8874 /*
8397 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), 8875 * Cannot have more runtime than the period.
8398 parent->rt_bandwidth.rt_runtime); 8876 */
8399} 8877 if (runtime > period && runtime != RUNTIME_INF)
8400#elif defined CONFIG_USER_SCHED 8878 return -EINVAL;
8401static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8402{
8403 struct task_group *tgi;
8404 unsigned long total = 0;
8405 unsigned long global_ratio =
8406 to_ratio(global_rt_period(), global_rt_runtime());
8407 8879
8408 rcu_read_lock(); 8880 /*
8409 list_for_each_entry_rcu(tgi, &task_groups, list) { 8881 * Ensure we don't starve existing RT tasks.
8410 if (tgi == tg) 8882 */
8411 continue; 8883 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8884 return -EBUSY;
8885
8886 total = to_ratio(period, runtime);
8412 8887
8413 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), 8888 /*
8414 tgi->rt_bandwidth.rt_runtime); 8889 * Nobody can have more than the global setting allows.
8890 */
8891 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8892 return -EINVAL;
8893
8894 /*
8895 * The sum of our children's runtime should not exceed our own.
8896 */
8897 list_for_each_entry_rcu(child, &tg->children, siblings) {
8898 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8899 runtime = child->rt_bandwidth.rt_runtime;
8900
8901 if (child == d->tg) {
8902 period = d->rt_period;
8903 runtime = d->rt_runtime;
8904 }
8905
8906 sum += to_ratio(period, runtime);
8415 } 8907 }
8416 rcu_read_unlock();
8417 8908
8418 return total + to_ratio(period, runtime) < global_ratio; 8909 if (sum > total)
8910 return -EINVAL;
8911
8912 return 0;
8419} 8913}
8420#endif
8421 8914
8422/* Must be called with tasklist_lock held */ 8915static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8423static inline int tg_has_rt_tasks(struct task_group *tg)
8424{ 8916{
8425 struct task_struct *g, *p; 8917 struct rt_schedulable_data data = {
8426 do_each_thread(g, p) { 8918 .tg = tg,
8427 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 8919 .rt_period = period,
8428 return 1; 8920 .rt_runtime = runtime,
8429 } while_each_thread(g, p); 8921 };
8430 return 0; 8922
8923 return walk_tg_tree(tg_schedulable, tg_nop, &data);
8431} 8924}
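The new tg_schedulable()/__rt_schedulable() pair, driven by walk_tg_tree(), enforces per group that runtime does not exceed the period, that a group still holding RT tasks cannot be set to zero runtime, that nothing exceeds the global cap, and that the children's bandwidth ratios sum to no more than the parent's. The toy program below checks only that last "children must fit inside the parent" rule with the same 2^20 fixed-point ratios; the hand-rolled demo_tg tree and names are invented here, whereas the kernel walks its real task-group tree.

#include <stdio.h>

struct demo_tg {
	unsigned long long period_ns;
	unsigned long long runtime_ns;
	const struct demo_tg *children[4];
	int nr_children;
};

static unsigned long ratio(unsigned long long period, unsigned long long runtime)
{
	return (unsigned long)((runtime << 20) / period);
}

static int demo_schedulable(const struct demo_tg *tg)
{
	unsigned long total = ratio(tg->period_ns, tg->runtime_ns);
	unsigned long sum = 0;
	int i;

	if (tg->runtime_ns > tg->period_ns)
		return -1;                        /* more runtime than period */

	for (i = 0; i < tg->nr_children; i++)
		sum += ratio(tg->children[i]->period_ns,
			     tg->children[i]->runtime_ns);

	return sum > total ? -1 : 0;          /* children must fit in the parent */
}

int main(void)
{
	struct demo_tg a = { 1000000000ULL, 300000000ULL, { 0 }, 0 };
	struct demo_tg b = { 1000000000ULL, 400000000ULL, { 0 }, 0 };
	struct demo_tg parent = { 1000000000ULL, 600000000ULL, { &a, &b }, 2 };

	/* 30% + 40% of the period does not fit in the parent's 60%: prints -1. */
	printf("schedulable: %d\n", demo_schedulable(&parent));
	return 0;
}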
8432 8925
8433static int tg_set_bandwidth(struct task_group *tg, 8926static int tg_set_bandwidth(struct task_group *tg,
@@ -8437,14 +8930,9 @@ static int tg_set_bandwidth(struct task_group *tg,
8437 8930
8438 mutex_lock(&rt_constraints_mutex); 8931 mutex_lock(&rt_constraints_mutex);
8439 read_lock(&tasklist_lock); 8932 read_lock(&tasklist_lock);
8440 if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { 8933 err = __rt_schedulable(tg, rt_period, rt_runtime);
8441 err = -EBUSY; 8934 if (err)
8442 goto unlock; 8935 goto unlock;
8443 }
8444 if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
8445 err = -EINVAL;
8446 goto unlock;
8447 }
8448 8936
8449 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8937 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8450 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 8938 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@ -8496,6 +8984,9 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8496 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 8984 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
8497 rt_runtime = tg->rt_bandwidth.rt_runtime; 8985 rt_runtime = tg->rt_bandwidth.rt_runtime;
8498 8986
8987 if (rt_period == 0)
8988 return -EINVAL;
8989
8499 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8990 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8500} 8991}
8501 8992
@@ -8510,21 +9001,38 @@ long sched_group_rt_period(struct task_group *tg)
8510 9001
8511static int sched_rt_global_constraints(void) 9002static int sched_rt_global_constraints(void)
8512{ 9003{
9004 u64 runtime, period;
8513 int ret = 0; 9005 int ret = 0;
8514 9006
9007 if (sysctl_sched_rt_period <= 0)
9008 return -EINVAL;
9009
9010 runtime = global_rt_runtime();
9011 period = global_rt_period();
9012
9013 /*
9014 * Sanity check on the sysctl variables.
9015 */
9016 if (runtime > period && runtime != RUNTIME_INF)
9017 return -EINVAL;
9018
8515 mutex_lock(&rt_constraints_mutex); 9019 mutex_lock(&rt_constraints_mutex);
8516 if (!__rt_schedulable(NULL, 1, 0)) 9020 read_lock(&tasklist_lock);
8517 ret = -EINVAL; 9021 ret = __rt_schedulable(NULL, 0, 0);
9022 read_unlock(&tasklist_lock);
8518 mutex_unlock(&rt_constraints_mutex); 9023 mutex_unlock(&rt_constraints_mutex);
8519 9024
8520 return ret; 9025 return ret;
8521} 9026}
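sched_rt_global_constraints() now rejects a non-positive period and a finite runtime larger than the period before running the tree-wide check. The sketch below applies the same sanity rules to candidate sysctl values in microseconds, with -1 meaning "unlimited" as with sched_rt_runtime_us; demo_check_rt_sysctls() is an invented helper, not a kernel interface.

#include <stdio.h>

static int demo_check_rt_sysctls(long period_us, long runtime_us)
{
	if (period_us <= 0)
		return -1;                        /* -EINVAL in the kernel */
	if (runtime_us >= 0 && runtime_us > period_us)
		return -1;                        /* runtime must fit in the period */
	return 0;
}

int main(void)
{
	printf("%d\n", demo_check_rt_sysctls(1000000,  950000));   /* ok: 0 */
	printf("%d\n", demo_check_rt_sysctls(1000000, 2000000));   /* bad: -1 */
	printf("%d\n", demo_check_rt_sysctls(1000000,      -1));   /* unlimited: 0 */
	printf("%d\n", demo_check_rt_sysctls(      0,  500000));   /* bad: -1 */
	return 0;
}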
8522#else 9027#else /* !CONFIG_RT_GROUP_SCHED */
8523static int sched_rt_global_constraints(void) 9028static int sched_rt_global_constraints(void)
8524{ 9029{
8525 unsigned long flags; 9030 unsigned long flags;
8526 int i; 9031 int i;
8527 9032
9033 if (sysctl_sched_rt_period <= 0)
9034 return -EINVAL;
9035
8528 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 9036 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
8529 for_each_possible_cpu(i) { 9037 for_each_possible_cpu(i) {
8530 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 9038 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
@@ -8537,7 +9045,7 @@ static int sched_rt_global_constraints(void)
8537 9045
8538 return 0; 9046 return 0;
8539} 9047}
8540#endif 9048#endif /* CONFIG_RT_GROUP_SCHED */
8541 9049
8542int sched_rt_handler(struct ctl_table *table, int write, 9050int sched_rt_handler(struct ctl_table *table, int write,
8543 struct file *filp, void __user *buffer, size_t *lenp, 9051 struct file *filp, void __user *buffer, size_t *lenp,
@@ -8585,7 +9093,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8585 9093
8586 if (!cgrp->parent) { 9094 if (!cgrp->parent) {
8587 /* This is early initialization for the top cgroup */ 9095 /* This is early initialization for the top cgroup */
8588 init_task_group.css.cgroup = cgrp;
8589 return &init_task_group.css; 9096 return &init_task_group.css;
8590 } 9097 }
8591 9098
@@ -8594,9 +9101,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8594 if (IS_ERR(tg)) 9101 if (IS_ERR(tg))
8595 return ERR_PTR(-ENOMEM); 9102 return ERR_PTR(-ENOMEM);
8596 9103
8597 /* Bind the cgroup to task_group object we just created */
8598 tg->css.cgroup = cgrp;
8599
8600 return &tg->css; 9104 return &tg->css;
8601} 9105}
8602 9106
@@ -8645,7 +9149,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8645 9149
8646 return (u64) tg->shares; 9150 return (u64) tg->shares;
8647} 9151}
8648#endif 9152#endif /* CONFIG_FAIR_GROUP_SCHED */
8649 9153
8650#ifdef CONFIG_RT_GROUP_SCHED 9154#ifdef CONFIG_RT_GROUP_SCHED
8651static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 9155static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
@@ -8669,7 +9173,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
8669{ 9173{
8670 return sched_group_rt_period(cgroup_tg(cgrp)); 9174 return sched_group_rt_period(cgroup_tg(cgrp));
8671} 9175}
8672#endif 9176#endif /* CONFIG_RT_GROUP_SCHED */
8673 9177
8674static struct cftype cpu_files[] = { 9178static struct cftype cpu_files[] = {
8675#ifdef CONFIG_FAIR_GROUP_SCHED 9179#ifdef CONFIG_FAIR_GROUP_SCHED