path: root/kernel/sched.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	1179
1 file changed, 709 insertions(+), 470 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 4e2f60335656..6acf749d3336 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -70,10 +70,13 @@
 #include <linux/bootmem.h>
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
+#include <linux/ftrace.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 
+#include "sched_cpupri.h"
+
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -289,15 +292,15 @@ struct task_group root_task_group;
 static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
 /* Default task group's cfs_rq on each cpu */
 static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
 static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
-#endif
-#else
+#endif /* CONFIG_RT_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED */
 #define root_task_group init_task_group
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 
 /* task_group_lock serializes add/remove of task groups and also changes to
  * a task group's cpu shares.
@@ -307,9 +310,9 @@ static DEFINE_SPINLOCK(task_group_lock);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_USER_SCHED
 # define INIT_TASK_GROUP_LOAD	(2*NICE_0_LOAD)
-#else
+#else /* !CONFIG_USER_SCHED */
 # define INIT_TASK_GROUP_LOAD	NICE_0_LOAD
-#endif
+#endif /* CONFIG_USER_SCHED */
 
 /*
  * A weight of 0 or 1 can cause arithmetics problems.
@@ -363,6 +366,10 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 #else
 
 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+static inline struct task_group *task_group(struct task_struct *p)
+{
+	return NULL;
+}
 
 #endif	/* CONFIG_GROUP_SCHED */
 
@@ -373,6 +380,7 @@ struct cfs_rq {
 
 	u64 exec_clock;
 	u64 min_vruntime;
+	u64 pair_start;
 
 	struct rb_root tasks_timeline;
 	struct rb_node *rb_leftmost;
@@ -401,6 +409,31 @@ struct cfs_rq {
 	 */
 	struct list_head leaf_cfs_rq_list;
 	struct task_group *tg;	/* group that "owns" this runqueue */
+
+#ifdef CONFIG_SMP
+	/*
+	 * the part of load.weight contributed by tasks
+	 */
+	unsigned long task_weight;
+
+	/*
+	 *   h_load = weight * f(tg)
+	 *
+	 * Where f(tg) is the recursive weight fraction assigned to
+	 * this group.
+	 */
+	unsigned long h_load;
+
+	/*
+	 * this cpu's part of tg->shares
+	 */
+	unsigned long shares;
+
+	/*
+	 * load.weight at the time we set shares
+	 */
+	unsigned long rq_weight;
+#endif
 #endif
 };
 
@@ -452,6 +485,9 @@ struct root_domain {
 	 */
 	cpumask_t rto_mask;
 	atomic_t rto_count;
+#ifdef CONFIG_SMP
+	struct cpupri cpupri;
+#endif
 };
 
 /*
@@ -526,14 +562,19 @@ struct rq {
 	int push_cpu;
 	/* cpu of this runqueue: */
 	int cpu;
+	int online;
+
+	unsigned long avg_load_per_task;
 
 	struct task_struct *migration_thread;
 	struct list_head migration_queue;
 #endif
 
 #ifdef CONFIG_SCHED_HRTICK
-	unsigned long hrtick_flags;
-	ktime_t hrtick_expire;
+#ifdef CONFIG_SMP
+	int hrtick_csd_pending;
+	struct call_single_data hrtick_csd;
+#endif
 	struct hrtimer hrtick_timer;
 #endif
 
@@ -607,6 +648,24 @@ static inline void update_rq_clock(struct rq *rq)
 # define const_debug static const
 #endif
 
+/**
+ * runqueue_is_locked
+ *
+ * Returns true if the current cpu runqueue is locked.
+ * This interface allows printk to be called with the runqueue lock
+ * held and know whether or not it is OK to wake up the klogd.
+ */
+int runqueue_is_locked(void)
+{
+	int cpu = get_cpu();
+	struct rq *rq = cpu_rq(cpu);
+	int ret;
+
+	ret = spin_is_locked(&rq->lock);
+	put_cpu();
+	return ret;
+}
+
 /*
  * Debugging: various feature bits
  */
@@ -749,6 +808,12 @@ late_initcall(sched_init_debug);
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
 /*
+ * ratelimit for updating the group shares.
+ * default: 0.5ms
+ */
+const_debug unsigned int sysctl_sched_shares_ratelimit = 500000;
+
+/*
  * period over which we measure -rt task cpu usage in us.
  * default: 1s
  */
@@ -775,82 +840,6 @@ static inline u64 global_rt_runtime(void)
 	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 }
 
-unsigned long long time_sync_thresh = 100000;
-
-static DEFINE_PER_CPU(unsigned long long, time_offset);
-static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
-
-/*
- * Global lock which we take every now and then to synchronize
- * the CPUs time. This method is not warp-safe, but it's good
- * enough to synchronize slowly diverging time sources and thus
- * it's good enough for tracing:
- */
-static DEFINE_SPINLOCK(time_sync_lock);
-static unsigned long long prev_global_time;
-
-static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
-{
-	/*
-	 * We want this inlined, to not get tracer function calls
-	 * in this critical section:
-	 */
-	spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
-	__raw_spin_lock(&time_sync_lock.raw_lock);
-
-	if (time < prev_global_time) {
-		per_cpu(time_offset, cpu) += prev_global_time - time;
-		time = prev_global_time;
-	} else {
-		prev_global_time = time;
-	}
-
-	__raw_spin_unlock(&time_sync_lock.raw_lock);
-	spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
-
-	return time;
-}
-
-static unsigned long long __cpu_clock(int cpu)
-{
-	unsigned long long now;
-
-	/*
-	 * Only call sched_clock() if the scheduler has already been
-	 * initialized (some code might call cpu_clock() very early):
-	 */
-	if (unlikely(!scheduler_running))
-		return 0;
-
-	now = sched_clock_cpu(cpu);
-
-	return now;
-}
-
-/*
- * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
- * clock constructed from sched_clock():
- */
-unsigned long long cpu_clock(int cpu)
-{
-	unsigned long long prev_cpu_time, time, delta_time;
-	unsigned long flags;
-
-	local_irq_save(flags);
-	prev_cpu_time = per_cpu(prev_cpu_time, cpu);
-	time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
-	delta_time = time-prev_cpu_time;
-
-	if (unlikely(delta_time > time_sync_thresh)) {
-		time = __sync_cpu_clock(time, cpu);
-		per_cpu(prev_cpu_time, cpu) = time;
-	}
-	local_irq_restore(flags);
-
-	return time;
-}
-EXPORT_SYMBOL_GPL(cpu_clock);
-
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)	do { } while (0)
 #endif
@@ -996,13 +985,6 @@ static struct rq *this_rq_lock(void)
 	return rq;
 }
 
-static void __resched_task(struct task_struct *p, int tif_bit);
-
-static inline void resched_task(struct task_struct *p)
-{
-	__resched_task(p, TIF_NEED_RESCHED);
-}
-
 #ifdef CONFIG_SCHED_HRTICK
 /*
  * Use HR-timers to deliver accurate preemption points.
@@ -1014,25 +996,6 @@ static inline void resched_task(struct task_struct *p)
  * When we get rescheduled we reprogram the hrtick_timer outside of the
  * rq->lock.
  */
-static inline void resched_hrt(struct task_struct *p)
-{
-	__resched_task(p, TIF_HRTICK_RESCHED);
-}
-
-static inline void resched_rq(struct rq *rq)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&rq->lock, flags);
-	resched_task(rq->curr);
-	spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-enum {
-	HRTICK_SET,		/* re-programm hrtick_timer */
-	HRTICK_RESET,		/* not a new slice */
-	HRTICK_BLOCK,		/* stop hrtick operations */
-};
 
 /*
  * Use hrtick when:
@@ -1043,40 +1006,11 @@ static inline int hrtick_enabled(struct rq *rq)
 {
 	if (!sched_feat(HRTICK))
 		return 0;
-	if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags)))
+	if (!cpu_active(cpu_of(rq)))
 		return 0;
 	return hrtimer_is_hres_active(&rq->hrtick_timer);
 }
 
-/*
- * Called to set the hrtick timer state.
- *
- * called with rq->lock held and irqs disabled
- */
-static void hrtick_start(struct rq *rq, u64 delay, int reset)
-{
-	assert_spin_locked(&rq->lock);
-
-	/*
-	 * preempt at: now + delay
-	 */
-	rq->hrtick_expire =
-		ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
-	/*
-	 * indicate we need to program the timer
-	 */
-	__set_bit(HRTICK_SET, &rq->hrtick_flags);
-	if (reset)
-		__set_bit(HRTICK_RESET, &rq->hrtick_flags);
-
-	/*
-	 * New slices are called from the schedule path and don't need a
-	 * forced reschedule.
-	 */
-	if (reset)
-		resched_hrt(rq->curr);
-}
-
 static void hrtick_clear(struct rq *rq)
 {
 	if (hrtimer_active(&rq->hrtick_timer))
@@ -1084,32 +1018,6 @@ static void hrtick_clear(struct rq *rq)
 }
 
 /*
- * Update the timer from the possible pending state.
- */
-static void hrtick_set(struct rq *rq)
-{
-	ktime_t time;
-	int set, reset;
-	unsigned long flags;
-
-	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
-
-	spin_lock_irqsave(&rq->lock, flags);
-	set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
-	reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
-	time = rq->hrtick_expire;
-	clear_thread_flag(TIF_HRTICK_RESCHED);
-	spin_unlock_irqrestore(&rq->lock, flags);
-
-	if (set) {
-		hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
-		if (reset && !hrtimer_active(&rq->hrtick_timer))
-			resched_rq(rq);
-	} else
-		hrtick_clear(rq);
-}
-
-/*
  * High-resolution timer tick.
  * Runs from hardirq context with interrupts disabled.
  */
@@ -1128,27 +1036,37 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
 }
 
 #ifdef CONFIG_SMP
-static void hotplug_hrtick_disable(int cpu)
+/*
+ * called from hardirq (IPI) context
+ */
+static void __hrtick_start(void *arg)
 {
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long flags;
-
-	spin_lock_irqsave(&rq->lock, flags);
-	rq->hrtick_flags = 0;
-	__set_bit(HRTICK_BLOCK, &rq->hrtick_flags);
-	spin_unlock_irqrestore(&rq->lock, flags);
+	struct rq *rq = arg;
 
-	hrtick_clear(rq);
+	spin_lock(&rq->lock);
+	hrtimer_restart(&rq->hrtick_timer);
+	rq->hrtick_csd_pending = 0;
+	spin_unlock(&rq->lock);
 }
 
-static void hotplug_hrtick_enable(int cpu)
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+static void hrtick_start(struct rq *rq, u64 delay)
 {
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long flags;
+	struct hrtimer *timer = &rq->hrtick_timer;
+	ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
 
-	spin_lock_irqsave(&rq->lock, flags);
-	__clear_bit(HRTICK_BLOCK, &rq->hrtick_flags);
-	spin_unlock_irqrestore(&rq->lock, flags);
+	timer->expires = time;
+
+	if (rq == this_rq()) {
+		hrtimer_restart(timer);
+	} else if (!rq->hrtick_csd_pending) {
+		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
+		rq->hrtick_csd_pending = 1;
+	}
 }
 
 static int
@@ -1163,16 +1081,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	case CPU_DOWN_PREPARE_FROZEN:
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
-		hotplug_hrtick_disable(cpu);
-		return NOTIFY_OK;
-
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-		hotplug_hrtick_enable(cpu);
+		hrtick_clear(cpu_rq(cpu));
 		return NOTIFY_OK;
 	}
 
@@ -1183,46 +1092,45 @@ static void init_hrtick(void)
 {
 	hotcpu_notifier(hotplug_hrtick, 0);
 }
-#endif /* CONFIG_SMP */
+#else
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+static void hrtick_start(struct rq *rq, u64 delay)
+{
+	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
+}
 
-static void init_rq_hrtick(struct rq *rq)
+static void init_hrtick(void)
 {
-	rq->hrtick_flags = 0;
-	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	rq->hrtick_timer.function = hrtick;
-	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 }
+#endif /* CONFIG_SMP */
 
-void hrtick_resched(void)
+static void init_rq_hrtick(struct rq *rq)
 {
-	struct rq *rq;
-	unsigned long flags;
+#ifdef CONFIG_SMP
+	rq->hrtick_csd_pending = 0;
 
-	if (!test_thread_flag(TIF_HRTICK_RESCHED))
-		return;
+	rq->hrtick_csd.flags = 0;
+	rq->hrtick_csd.func = __hrtick_start;
+	rq->hrtick_csd.info = rq;
+#endif
 
-	local_irq_save(flags);
-	rq = cpu_rq(smp_processor_id());
-	hrtick_set(rq);
-	local_irq_restore(flags);
+	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	rq->hrtick_timer.function = hrtick;
+	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 }
 #else
 static inline void hrtick_clear(struct rq *rq)
 {
 }
 
-static inline void hrtick_set(struct rq *rq)
-{
-}
-
 static inline void init_rq_hrtick(struct rq *rq)
 {
 }
 
-void hrtick_resched(void)
-{
-}
-
 static inline void init_hrtick(void)
 {
 }
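Note: the HRTICK_SET/HRTICK_RESET/HRTICK_BLOCK flag machinery is gone; remote reprogramming now rides the generic smp_call_function path. Each rq carries a call_single_data bound to __hrtick_start() at init time, and hrtick_start() either restarts the timer locally or fires the csd at the owning cpu, with hrtick_csd_pending preventing a second IPI while one is in flight. Condensed from the hunks above:

	/* init: bind the csd to this runqueue */
	rq->hrtick_csd.func = __hrtick_start;
	rq->hrtick_csd.info = rq;

	/* start: arm locally, or ask the owning cpu to arm its own timer */
	if (rq == this_rq())
		hrtimer_restart(timer);
	else if (!rq->hrtick_csd_pending) {
		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
		rq->hrtick_csd_pending = 1;
	}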
@@ -1241,16 +1149,16 @@ static inline void init_hrtick(void)
 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
 #endif
 
-static void __resched_task(struct task_struct *p, int tif_bit)
+static void resched_task(struct task_struct *p)
 {
 	int cpu;
 
 	assert_spin_locked(&task_rq(p)->lock);
 
-	if (unlikely(test_tsk_thread_flag(p, tif_bit)))
+	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
 		return;
 
-	set_tsk_thread_flag(p, tif_bit);
+	set_tsk_thread_flag(p, TIF_NEED_RESCHED);
 
 	cpu = task_cpu(p);
 	if (cpu == smp_processor_id())
@@ -1313,15 +1221,15 @@ void wake_up_idle_cpu(int cpu)
 	if (!tsk_is_polling(rq->idle))
 		smp_send_reschedule(cpu);
 }
-#endif
+#endif /* CONFIG_NO_HZ */
 
-#else
-static void __resched_task(struct task_struct *p, int tif_bit)
+#else /* !CONFIG_SMP */
+static void resched_task(struct task_struct *p)
 {
 	assert_spin_locked(&task_rq(p)->lock);
-	set_tsk_thread_flag(p, tif_bit);
+	set_tsk_need_resched(p);
 }
-#endif
+#endif /* CONFIG_SMP */
 
 #if BITS_PER_LONG == 32
 # define WMULT_CONST	(~0UL)
@@ -1336,6 +1244,9 @@ static void __resched_task(struct task_struct *p, int tif_bit)
  */
 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
 
+/*
+ * delta *= weight / lw
+ */
 static unsigned long
 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 		struct load_weight *lw)
@@ -1363,12 +1274,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
 }
 
-static inline unsigned long
-calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
-{
-	return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
-}
-
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
 	lw->weight += inc;
@@ -1479,17 +1384,211 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
 #ifdef CONFIG_SMP
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
-static unsigned long cpu_avg_load_per_task(int cpu);
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-#else /* CONFIG_SMP */
+
+static unsigned long cpu_avg_load_per_task(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	if (rq->nr_running)
+		rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+
+	return rq->avg_load_per_task;
+}
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+
+typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
+
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ */
+static void
+walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
+{
+	struct task_group *parent, *child;
+
+	rcu_read_lock();
+	parent = &root_task_group;
+down:
+	(*down)(parent, cpu, sd);
+	list_for_each_entry_rcu(child, &parent->children, siblings) {
+		parent = child;
+		goto down;
+
+up:
+		continue;
+	}
+	(*up)(parent, cpu, sd);
+
+	child = parent;
+	parent = parent->parent;
+	if (parent)
+		goto up;
+	rcu_read_unlock();
+}
+
+static void __set_se_shares(struct sched_entity *se, unsigned long shares);
+
+/*
+ * Calculate and set the cpu's group shares.
+ */
+static void
+__update_group_shares_cpu(struct task_group *tg, int cpu,
+			  unsigned long sd_shares, unsigned long sd_rq_weight)
+{
+	int boost = 0;
+	unsigned long shares;
+	unsigned long rq_weight;
+
+	if (!tg->se[cpu])
+		return;
+
+	rq_weight = tg->cfs_rq[cpu]->load.weight;
+
+	/*
+	 * If there are currently no tasks on the cpu pretend there is one of
+	 * average load so that when a new task gets to run here it will not
+	 * get delayed by group starvation.
+	 */
+	if (!rq_weight) {
+		boost = 1;
+		rq_weight = NICE_0_LOAD;
+	}
+
+	if (unlikely(rq_weight > sd_rq_weight))
+		rq_weight = sd_rq_weight;
+
+	/*
+	 *           \Sum shares * rq_weight
+	 * shares =  -----------------------
+	 *               \Sum rq_weight
+	 *
+	 */
+	shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+
+	/*
+	 * record the actual number of shares, not the boosted amount.
+	 */
+	tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+	tg->cfs_rq[cpu]->rq_weight = rq_weight;
+
+	if (shares < MIN_SHARES)
+		shares = MIN_SHARES;
+	else if (shares > MAX_SHARES)
+		shares = MAX_SHARES;
+
+	__set_se_shares(tg->se[cpu], shares);
+}
+
+/*
+ * Re-compute the task group their per cpu shares over the given domain.
+ * This needs to be done in a bottom-up fashion because the rq weight of a
+ * parent group depends on the shares of its child groups.
+ */
+static void
+tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
+{
+	unsigned long rq_weight = 0;
+	unsigned long shares = 0;
+	int i;
+
+	for_each_cpu_mask(i, sd->span) {
+		rq_weight += tg->cfs_rq[i]->load.weight;
+		shares += tg->cfs_rq[i]->shares;
+	}
+
+	if ((!shares && rq_weight) || shares > tg->shares)
+		shares = tg->shares;
+
+	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
+		shares = tg->shares;
+
+	if (!rq_weight)
+		rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
+
+	for_each_cpu_mask(i, sd->span) {
+		struct rq *rq = cpu_rq(i);
+		unsigned long flags;
+
+		spin_lock_irqsave(&rq->lock, flags);
+		__update_group_shares_cpu(tg, i, shares, rq_weight);
+		spin_unlock_irqrestore(&rq->lock, flags);
+	}
+}
+
+/*
+ * Compute the cpu's hierarchical load factor for each task group.
+ * This needs to be done in a top-down fashion because the load of a child
+ * group is a fraction of its parents load.
+ */
+static void
+tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+{
+	unsigned long load;
+
+	if (!tg->parent) {
+		load = cpu_rq(cpu)->load.weight;
+	} else {
+		load = tg->parent->cfs_rq[cpu]->h_load;
+		load *= tg->cfs_rq[cpu]->shares;
+		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
+	}
+
+	tg->cfs_rq[cpu]->h_load = load;
+}
+
+static void
+tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
+{
+}
+
+static void update_shares(struct sched_domain *sd)
+{
+	u64 now = cpu_clock(raw_smp_processor_id());
+	s64 elapsed = now - sd->last_update;
+
+	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
+		sd->last_update = now;
+		walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+	}
+}
+
+static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
+{
+	spin_unlock(&rq->lock);
+	update_shares(sd);
+	spin_lock(&rq->lock);
+}
+
+static void update_h_load(int cpu)
+{
+	walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
+}
+
+#else
+
+static inline void update_shares(struct sched_domain *sd)
+{
+}
+
+static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 {
 }
+
 #endif
 
-#endif /* CONFIG_SMP */
+#endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+{
+#ifdef CONFIG_SMP
+	cfs_rq->shares = shares;
+#endif
+}
+#endif
 
 #include "sched_stats.h"
 #include "sched_idletask.c"
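Note: two pieces of arithmetic carry this hunk. walk_tg_tree() is an RCU-protected depth-first walk that calls @down before descending and @up after all children, which is why tg_shares_up() can aggregate child weights bottom-up while tg_load_down() scales loads top-down. The per-cpu math, written out as a sketch in plain integers (the +1 terms are the same divide-by-zero guards as in the code above):

	/* bottom-up: a cpu's slice of the group's shares */
	shares_i = (tg_shares * rq_weight_i) / (sum_rq_weight + 1);

	/* top-down: effective load is the parent's h_load scaled by our share */
	h_load_i = (parent_h_load * shares_i) / (parent_rq_weight_i + 1);

A queue with no runnable tasks is boosted to NICE_0_LOAD for the computation but records 0 in cfs_rq->shares, so an incoming task is not delayed by group starvation while the accounting stays honest.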
@@ -1500,27 +1599,17 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 #endif
 
 #define sched_class_highest (&rt_sched_class)
+#define for_each_class(class) \
+   for (class = sched_class_highest; class; class = class->next)
 
-static inline void inc_load(struct rq *rq, const struct task_struct *p)
-{
-	update_load_add(&rq->load, p->se.load.weight);
-}
-
-static inline void dec_load(struct rq *rq, const struct task_struct *p)
-{
-	update_load_sub(&rq->load, p->se.load.weight);
-}
-
-static void inc_nr_running(struct task_struct *p, struct rq *rq)
+static void inc_nr_running(struct rq *rq)
 {
 	rq->nr_running++;
-	inc_load(rq, p);
 }
 
-static void dec_nr_running(struct task_struct *p, struct rq *rq)
+static void dec_nr_running(struct rq *rq)
 {
 	rq->nr_running--;
-	dec_load(rq, p);
 }
 
 static void set_load_weight(struct task_struct *p)
@@ -1544,6 +1633,12 @@ static void set_load_weight(struct task_struct *p)
 	p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
 }
 
+static void update_avg(u64 *avg, u64 sample)
+{
+	s64 diff = sample - *avg;
+	*avg += diff >> 3;
+}
+
 static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
 {
 	sched_info_queued(p);
@@ -1553,6 +1648,13 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
 {
+	if (sleep && p->se.last_wakeup) {
+		update_avg(&p->se.avg_overlap,
+			   p->se.sum_exec_runtime - p->se.last_wakeup);
+		p->se.last_wakeup = 0;
+	}
+
+	sched_info_dequeued(p);
 	p->sched_class->dequeue_task(rq, p, sleep);
 	p->se.on_rq = 0;
 }
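Note: update_avg() is an exponentially weighted moving average with weight 1/8 (the >> 3). dequeue_task() feeds it the runtime a task accumulated between waking somebody else (se.last_wakeup, stamped in try_to_wake_up() further down) and going to sleep itself, which becomes se.avg_overlap. A worked example of the convergence:

	u64 avg = 0;

	update_avg(&avg, 800);	/* avg += (800 - 0) >> 3   -> 100 */
	update_avg(&avg, 800);	/* avg += (800 - 100) >> 3 -> 187 */
	/* keeps converging toward 800 with a ~8-sample time constant */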
@@ -1612,7 +1714,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 		rq->nr_uninterruptible--;
 
 	enqueue_task(rq, p, wakeup);
-	inc_nr_running(p, rq);
+	inc_nr_running(rq);
 }
 
 /*
@@ -1624,7 +1726,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
 		rq->nr_uninterruptible++;
 
 	dequeue_task(rq, p, sleep);
-	dec_nr_running(p, rq);
+	dec_nr_running(rq);
 }
 
 /**
@@ -1636,12 +1738,6 @@ inline int task_curr(const struct task_struct *p)
 	return cpu_curr(task_cpu(p)) == p;
 }
 
-/* Used instead of source_load when we know the type == 0 */
-unsigned long weighted_cpuload(const int cpu)
-{
-	return cpu_rq(cpu)->load.weight;
-}
-
 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
 	set_task_rq(p, cpu);
@@ -1670,6 +1766,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 
 #ifdef CONFIG_SMP
 
+/* Used instead of source_load when we know the type == 0 */
+static unsigned long weighted_cpuload(const int cpu)
+{
+	return cpu_rq(cpu)->load.weight;
+}
+
 /*
  * Is this task likely cache-hot:
  */
@@ -1880,7 +1982,7 @@ static unsigned long source_load(int cpu, int type)
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long total = weighted_cpuload(cpu);
 
-	if (type == 0)
+	if (type == 0 || !sched_feat(LB_BIAS))
 		return total;
 
 	return min(rq->cpu_load[type-1], total);
@@ -1895,25 +1997,13 @@ static unsigned long target_load(int cpu, int type)
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long total = weighted_cpuload(cpu);
 
-	if (type == 0)
+	if (type == 0 || !sched_feat(LB_BIAS))
 		return total;
 
 	return max(rq->cpu_load[type-1], total);
 }
 
 /*
- * Return the average load per task on the cpu's run queue
- */
-static unsigned long cpu_avg_load_per_task(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long total = weighted_cpuload(cpu);
-	unsigned long n = rq->nr_running;
-
-	return n ? total / n : SCHED_LOAD_SCALE;
-}
-
-/*
  * find_idlest_group finds and returns the least busy CPU group within the
  * domain.
  */
@@ -1939,7 +2029,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 	/* Tally up the load of all CPUs in the group */
 	avg_load = 0;
 
-	for_each_cpu_mask(i, group->cpumask) {
+	for_each_cpu_mask_nr(i, group->cpumask) {
 		/* Bias balancing toward cpus of our domain */
 		if (local_group)
 			load = source_load(i, load_idx);
@@ -1981,7 +2071,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
 	/* Traverse only the allowed CPUs */
 	cpus_and(*tmp, group->cpumask, p->cpus_allowed);
 
-	for_each_cpu_mask(i, *tmp) {
+	for_each_cpu_mask_nr(i, *tmp) {
 		load = weighted_cpuload(i);
 
 		if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -2019,6 +2109,9 @@ static int sched_balance_self(int cpu, int flag)
 		sd = tmp;
 	}
 
+	if (sd)
+		update_shares(sd);
+
 	while (sd) {
 		cpumask_t span, tmpmask;
 		struct sched_group *group;
@@ -2085,6 +2178,22 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	if (!sched_feat(SYNC_WAKEUPS))
 		sync = 0;
 
+#ifdef CONFIG_SMP
+	if (sched_feat(LB_WAKEUP_UPDATE)) {
+		struct sched_domain *sd;
+
+		this_cpu = raw_smp_processor_id();
+		cpu = task_cpu(p);
+
+		for_each_domain(this_cpu, sd) {
+			if (cpu_isset(cpu, sd->span)) {
+				update_shares(sd);
+				break;
+			}
+		}
+	}
+#endif
+
 	smp_wmb();
 	rq = task_rq_lock(p, &flags);
 	old_state = p->state;
@@ -2131,7 +2240,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 			}
 		}
 	}
-#endif
+#endif /* CONFIG_SCHEDSTATS */
 
 out_activate:
 #endif /* CONFIG_SMP */
@@ -2149,6 +2258,9 @@ out_activate:
 	success = 1;
 
 out_running:
+	trace_mark(kernel_sched_wakeup,
+		"pid %d state %ld ## rq %p task %p rq->curr %p",
+		p->pid, p->state, rq, p, rq->curr);
 	check_preempt_curr(rq, p);
 
 	p->state = TASK_RUNNING;
@@ -2157,6 +2269,8 @@ out_running:
 		p->sched_class->task_wake_up(rq, p);
 #endif
 out:
+	current->se.last_wakeup = current->se.sum_exec_runtime;
+
 	task_rq_unlock(rq, &flags);
 
 	return success;
@@ -2277,8 +2391,11 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		 * management (if any):
 		 */
 		p->sched_class->task_new(rq, p);
-		inc_nr_running(p, rq);
+		inc_nr_running(rq);
 	}
+	trace_mark(kernel_sched_wakeup_new,
+		"pid %d state %ld ## rq %p task %p rq->curr %p",
+		p->pid, p->state, rq, p, rq->curr);
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
@@ -2331,7 +2448,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
 		notifier->ops->sched_out(notifier, next);
 }
 
-#else
+#else /* !CONFIG_PREEMPT_NOTIFIERS */
 
 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
 {
@@ -2343,7 +2460,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
 {
 }
 
-#endif
+#endif /* CONFIG_PREEMPT_NOTIFIERS */
 
 /**
  * prepare_task_switch - prepare to switch tasks
@@ -2451,6 +2568,11 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	struct mm_struct *mm, *oldmm;
 
 	prepare_task_switch(rq, prev, next);
+	trace_mark(kernel_sched_schedule,
+		"prev_pid %d next_pid %d prev_state %ld "
+		"## rq %p prev %p next %p",
+		prev->pid, next->pid, prev->state,
+		rq, prev, next);
 	mm = next->mm;
 	oldmm = prev->active_mm;
 	/*
@@ -2680,7 +2802,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
 
 	rq = task_rq_lock(p, &flags);
 	if (!cpu_isset(dest_cpu, p->cpus_allowed)
-	    || unlikely(cpu_is_offline(dest_cpu)))
+	    || unlikely(!cpu_active(dest_cpu)))
 		goto out;
 
 	/* force the process onto the specified CPU */
@@ -2785,7 +2907,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	      enum cpu_idle_type idle, int *all_pinned,
 	      int *this_best_prio, struct rq_iterator *iterator)
 {
-	int loops = 0, pulled = 0, pinned = 0, skip_for_load;
+	int loops = 0, pulled = 0, pinned = 0;
 	struct task_struct *p;
 	long rem_load_move = max_load_move;
 
@@ -2801,14 +2923,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 next:
 	if (!p || loops++ > sysctl_sched_nr_migrate)
 		goto out;
-	/*
-	 * To help distribute high priority tasks across CPUs we don't
-	 * skip a task if it will be the highest priority task (i.e. smallest
-	 * prio value) on its new queue regardless of its load weight
-	 */
-	skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
-							 SCHED_LOAD_SCALE_FUZZ;
-	if ((skip_for_load && p->prio >= *this_best_prio) ||
+
+	if ((p->se.load.weight >> 1) > rem_load_move ||
 	    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
 		p = iterator->next(iterator->arg);
 		goto next;
@@ -2863,6 +2979,10 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 				max_load_move - total_load_moved,
 				sd, idle, all_pinned, &this_best_prio);
 		class = class->next;
+
+		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
+			break;
+
 	} while (class && max_load_move > total_load_moved);
 
 	return total_load_moved > 0;
@@ -2939,6 +3059,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	max_load = this_load = total_load = total_pwr = 0;
 	busiest_load_per_task = busiest_nr_running = 0;
 	this_load_per_task = this_nr_running = 0;
+
 	if (idle == CPU_NOT_IDLE)
 		load_idx = sd->busy_idx;
 	else if (idle == CPU_NEWLY_IDLE)
@@ -2953,6 +3074,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		int __group_imb = 0;
 		unsigned int balance_cpu = -1, first_idle_cpu = 0;
 		unsigned long sum_nr_running, sum_weighted_load;
+		unsigned long sum_avg_load_per_task;
+		unsigned long avg_load_per_task;
 
 		local_group = cpu_isset(this_cpu, group->cpumask);
 
@@ -2961,10 +3084,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 
 		/* Tally up the load of all CPUs in the group */
 		sum_weighted_load = sum_nr_running = avg_load = 0;
+		sum_avg_load_per_task = avg_load_per_task = 0;
+
 		max_cpu_load = 0;
 		min_cpu_load = ~0UL;
 
-		for_each_cpu_mask(i, group->cpumask) {
+		for_each_cpu_mask_nr(i, group->cpumask) {
 			struct rq *rq;
 
 			if (!cpu_isset(i, *cpus))
@@ -2994,6 +3119,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 			avg_load += load;
 			sum_nr_running += rq->nr_running;
 			sum_weighted_load += weighted_cpuload(i);
+
+			sum_avg_load_per_task += cpu_avg_load_per_task(i);
 		}
 
 		/*
@@ -3015,7 +3142,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		avg_load = sg_div_cpu_power(group,
 				avg_load * SCHED_LOAD_SCALE);
 
-		if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE)
+
+		/*
+		 * Consider the group unbalanced when the imbalance is larger
+		 * than the average weight of two tasks.
+		 *
+		 * APZ: with cgroup the avg task weight can vary wildly and
+		 * might not be a suitable number - should we keep a
+		 * normalized nr_running number somewhere that negates
+		 * the hierarchy?
+		 */
+		avg_load_per_task = sg_div_cpu_power(group,
+				sum_avg_load_per_task * SCHED_LOAD_SCALE);
+
+		if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
 			__group_imb = 1;
 
 		group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
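Note: the fixed SCHED_LOAD_SCALE cutoff becomes twice the group's measured average task weight, so "unbalanced" now scales with the actual task mix. With plain nice-0 tasks (load weight 1024 each), a sketch of the new test:

	unsigned long avg_load_per_task = 1024;	/* one nice-0 task       */
	unsigned long max_cpu_load = 3 * 1024;	/* busiest cpu: 3 tasks  */
	unsigned long min_cpu_load = 0;		/* idlest cpu: none      */

	int group_imb = (max_cpu_load - min_cpu_load) > 2 * avg_load_per_task;
	/* 3072 > 2048: flagged as an in-group imbalance */

The APZ comment above records a known soft spot: with cgroups the average task weight can swing wildly.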
@@ -3156,9 +3296,9 @@ small_imbalance:
 		if (busiest_load_per_task > this_load_per_task)
 			imbn = 1;
 	} else
-		this_load_per_task = SCHED_LOAD_SCALE;
+		this_load_per_task = cpu_avg_load_per_task(this_cpu);
 
-	if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
+	if (max_load - this_load + 2*busiest_load_per_task >=
 			busiest_load_per_task * imbn) {
 		*imbalance = busiest_load_per_task;
 		return busiest;
@@ -3228,7 +3368,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
 	unsigned long max_load = 0;
 	int i;
 
-	for_each_cpu_mask(i, group->cpumask) {
+	for_each_cpu_mask_nr(i, group->cpumask) {
 		unsigned long wl;
 
 		if (!cpu_isset(i, *cpus))
@@ -3284,6 +3424,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
+	update_shares(sd);
 	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
 				   cpus, balance);
 
@@ -3386,8 +3527,9 @@ redo:
 
 	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		return -1;
-	return ld_moved;
+		ld_moved = -1;
+
+	goto out;
 
 out_balanced:
 	schedstat_inc(sd, lb_balanced[idle]);
@@ -3402,8 +3544,13 @@ out_one_pinned:
 
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		return -1;
-	return 0;
+		ld_moved = -1;
+	else
+		ld_moved = 0;
+out:
+	if (ld_moved)
+		update_shares(sd);
+	return ld_moved;
 }
 
 /*
@@ -3438,6 +3585,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
 
 	schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
 redo:
+	update_shares_locked(this_rq, sd);
 	group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
 				   &sd_idle, cpus, NULL);
 	if (!group) {
@@ -3481,6 +3629,7 @@ redo:
 	} else
 		sd->nr_balance_failed = 0;
 
+	update_shares_locked(this_rq, sd);
 	return ld_moved;
 
 out_balanced:
@@ -3621,7 +3770,7 @@ int select_nohz_load_balancer(int stop_tick)
 		/*
 		 * If we are going offline and still the leader, give up!
 		 */
-		if (cpu_is_offline(cpu) &&
+		if (!cpu_active(cpu) &&
 		    atomic_read(&nohz.load_balancer) == cpu) {
 			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
 				BUG();
@@ -3672,6 +3821,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 	/* Earliest time when we have to do rebalance again */
 	unsigned long next_balance = jiffies + 60*HZ;
 	int update_next_balance = 0;
+	int need_serialize;
 	cpumask_t tmp;
 
 	for_each_domain(cpu, sd) {
@@ -3689,8 +3839,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 		if (interval > HZ*NR_CPUS/10)
 			interval = HZ*NR_CPUS/10;
 
+		need_serialize = sd->flags & SD_SERIALIZE;
 
-		if (sd->flags & SD_SERIALIZE) {
+		if (need_serialize) {
 			if (!spin_trylock(&balancing))
 				goto out;
 		}
@@ -3706,7 +3857,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 			}
 			sd->last_balance = jiffies;
 		}
-		if (sd->flags & SD_SERIALIZE)
+		if (need_serialize)
 			spin_unlock(&balancing);
 out:
 		if (time_after(next_balance, sd->last_balance + interval)) {
@@ -3759,7 +3910,7 @@ static void run_rebalance_domains(struct softirq_action *h)
 		int balance_cpu;
 
 		cpu_clear(this_cpu, cpus);
-		for_each_cpu_mask(balance_cpu, cpus) {
+		for_each_cpu_mask_nr(balance_cpu, cpus) {
 			/*
 			 * If this cpu gets work to do, stop the load balancing
 			 * work being done for other cpus. Next load
@@ -4021,26 +4172,44 @@ void scheduler_tick(void)
 #endif
 }
 
-#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+				defined(CONFIG_PREEMPT_TRACER))
+
+static inline unsigned long get_parent_ip(unsigned long addr)
+{
+	if (in_lock_functions(addr)) {
+		addr = CALLER_ADDR2;
+		if (in_lock_functions(addr))
+			addr = CALLER_ADDR3;
+	}
+	return addr;
+}
 
 void __kprobes add_preempt_count(int val)
 {
+#ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Underflow?
 	 */
 	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
 		return;
+#endif
 	preempt_count() += val;
+#ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Spinlock count overflowing soon?
 	 */
 	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
 				PREEMPT_MASK - 10);
+#endif
+	if (preempt_count() == val)
+		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 }
 EXPORT_SYMBOL(add_preempt_count);
 
 void __kprobes sub_preempt_count(int val)
 {
+#ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Underflow?
 	 */
@@ -4052,7 +4221,10 @@ void __kprobes sub_preempt_count(int val)
 	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
 			!(preempt_count() & PREEMPT_MASK)))
 		return;
+#endif
 
+	if (preempt_count() == val)
+		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 	preempt_count() -= val;
 }
 EXPORT_SYMBOL(sub_preempt_count);
@@ -4070,6 +4242,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
 		prev->comm, prev->pid, preempt_count());
 
 	debug_show_held_locks(prev);
+	print_modules();
 	if (irqs_disabled())
 		print_irqtrace_events(prev);
 
@@ -4158,7 +4331,8 @@ need_resched_nonpreemptible:
 
 	schedule_debug(prev);
 
-	hrtick_clear(rq);
+	if (sched_feat(HRTICK))
+		hrtick_clear(rq);
 
 	/*
 	 * Do the rq-clock update outside the rq lock:
@@ -4204,8 +4378,6 @@ need_resched_nonpreemptible:
 	} else
 		spin_unlock_irq(&rq->lock);
 
-	hrtick_set(rq);
-
 	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
 
@@ -4586,10 +4758,8 @@ void set_user_nice(struct task_struct *p, long nice)
 		goto out_unlock;
 	}
 	on_rq = p->se.on_rq;
-	if (on_rq) {
+	if (on_rq)
 		dequeue_task(rq, p, 0);
-		dec_load(rq, p);
-	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -4599,7 +4769,6 @@ void set_user_nice(struct task_struct *p, long nice)
 
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
-		inc_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -4744,16 +4913,8 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
 	set_load_weight(p);
 }
 
-/**
- * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
- * @p: the task in question.
- * @policy: new policy.
- * @param: structure containing the new RT priority.
- *
- * NOTE that the task may be already dead.
- */
-int sched_setscheduler(struct task_struct *p, int policy,
-		       struct sched_param *param)
+static int __sched_setscheduler(struct task_struct *p, int policy,
+				struct sched_param *param, bool user)
 {
 	int retval, oldprio, oldpolicy = -1, on_rq, running;
 	unsigned long flags;
@@ -4785,7 +4946,7 @@ recheck:
 	/*
 	 * Allow unprivileged RT tasks to decrease priority:
 	 */
-	if (!capable(CAP_SYS_NICE)) {
+	if (user && !capable(CAP_SYS_NICE)) {
 		if (rt_policy(policy)) {
 			unsigned long rlim_rtprio;
 
@@ -4821,7 +4982,8 @@ recheck:
 	 * Do not allow realtime tasks into groups that have no runtime
 	 * assigned.
 	 */
-	if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+	if (user
+	    && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
 		return -EPERM;
 #endif
 
@@ -4870,8 +5032,39 @@ recheck:
 
 	return 0;
 }
+
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * NOTE that the task may be already dead.
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+		       struct sched_param *param)
+{
+	return __sched_setscheduler(p, policy, param, true);
+}
 EXPORT_SYMBOL_GPL(sched_setscheduler);
 
+/**
+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Just like sched_setscheduler, only don't bother checking if the
+ * current context has permission. For example, this is needed in
+ * stop_machine(): we create temporary high priority worker threads,
+ * but our caller might not have that capability.
+ */
+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+			       struct sched_param *param)
+{
+	return __sched_setscheduler(p, policy, param, false);
+}
+
 static int
 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 {
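Note: both entry points are thin wrappers over __sched_setscheduler(); the new user flag gates the CAP_SYS_NICE check and the rt_bandwidth group check above. A hypothetical in-kernel call site of the _nocheck variant, as stop_machine-style code would use it:

	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };

	/* kernel-internal promotion: skip permission and group checks */
	sched_setscheduler_nocheck(worker_task, SCHED_FIFO, &param);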
@@ -5070,24 +5263,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
 	return sched_setaffinity(pid, &new_mask);
 }
 
-/*
- * Represents all cpu's present in the system
- * In systems capable of hotplug, this map could dynamically grow
- * as new cpu's are detected in the system via any platform specific
- * method, such as ACPI for e.g.
- */
-
-cpumask_t cpu_present_map __read_mostly;
-EXPORT_SYMBOL(cpu_present_map);
-
-#ifndef CONFIG_SMP
-cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
-EXPORT_SYMBOL(cpu_online_map);
-
-cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
-EXPORT_SYMBOL(cpu_possible_map);
-#endif
-
 long sched_getaffinity(pid_t pid, cpumask_t *mask)
 {
 	struct task_struct *p;
@@ -5384,7 +5559,7 @@ out_unlock:
 	return retval;
 }
 
-static const char stat_nam[] = "RSDTtZX";
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
 
 void sched_show_task(struct task_struct *p)
 {
@@ -5571,6 +5746,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
 		goto out;
 	}
 
+	if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
+		     !cpus_equal(p->cpus_allowed, *new_mask))) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	if (p->sched_class->set_cpus_allowed)
 		p->sched_class->set_cpus_allowed(p, new_mask);
 	else {
@@ -5613,7 +5794,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5613 struct rq *rq_dest, *rq_src; 5794 struct rq *rq_dest, *rq_src;
5614 int ret = 0, on_rq; 5795 int ret = 0, on_rq;
5615 5796
5616 if (unlikely(cpu_is_offline(dest_cpu))) 5797 if (unlikely(!cpu_active(dest_cpu)))
5617 return ret; 5798 return ret;
5618 5799
5619 rq_src = cpu_rq(src_cpu); 5800 rq_src = cpu_rq(src_cpu);
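
cpu_active() is a stricter gate than the old cpu_is_offline() test: the
active mask is cleared early during hot-unplug, so __migrate_task() stops
targeting a CPU before it actually goes offline. A toy bitmask
illustration (values hypothetical):

    #include <stdio.h>

    int main(void)
    {
            unsigned online = 0xF;  /* CPUs 0-3 online          */
            unsigned active = 0x7;  /* CPU 3 is being unplugged */
            int dest = 3;

            if (!((active >> dest) & 1))
                    puts("reject migration: destination not active");
            (void)online;
            return 0;
    }
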
@@ -6060,6 +6241,36 @@ static void unregister_sched_domain_sysctl(void)
6060} 6241}
6061#endif 6242#endif
6062 6243
6244static void set_rq_online(struct rq *rq)
6245{
6246 if (!rq->online) {
6247 const struct sched_class *class;
6248
6249 cpu_set(rq->cpu, rq->rd->online);
6250 rq->online = 1;
6251
6252 for_each_class(class) {
6253 if (class->rq_online)
6254 class->rq_online(rq);
6255 }
6256 }
6257}
6258
6259static void set_rq_offline(struct rq *rq)
6260{
6261 if (rq->online) {
6262 const struct sched_class *class;
6263
6264 for_each_class(class) {
6265 if (class->rq_offline)
6266 class->rq_offline(rq);
6267 }
6268
6269 cpu_clear(rq->cpu, rq->rd->online);
6270 rq->online = 0;
6271 }
6272}
6273
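
set_rq_online()/set_rq_offline() replace the per-class join_domain and
leave_domain hooks (removed from rq_attach_root() further down) with a
guarded walk over the scheduling classes. A standalone sketch of the
optional-callback walk, with hypothetical names; the kernel's
for_each_class() walks sched_class->next the same way:

    #include <stdio.h>

    struct klass {
            const char *name;
            void (*on_online)(void);   /* may be NULL, like ->rq_online */
    };

    static void rt_online(void) { puts("rt: push/pull state rebuilt"); }

    static const struct klass classes[] = {
            { "rt",   rt_online },
            { "fair", NULL },       /* class without a hook is skipped */
    };

    int main(void)
    {
            for (unsigned i = 0; i < sizeof(classes)/sizeof(classes[0]); i++)
                    if (classes[i].on_online)
                            classes[i].on_online();
            return 0;
    }
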
6063/* 6274/*
6064 * migration_call - callback that gets triggered when a CPU is added. 6275 * migration_call - callback that gets triggered when a CPU is added.
6065 * Here we can start up the necessary migration thread for the new CPU. 6276 * Here we can start up the necessary migration thread for the new CPU.
@@ -6097,7 +6308,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6097 spin_lock_irqsave(&rq->lock, flags); 6308 spin_lock_irqsave(&rq->lock, flags);
6098 if (rq->rd) { 6309 if (rq->rd) {
6099 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6310 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6100 cpu_set(cpu, rq->rd->online); 6311
6312 set_rq_online(rq);
6101 } 6313 }
6102 spin_unlock_irqrestore(&rq->lock, flags); 6314 spin_unlock_irqrestore(&rq->lock, flags);
6103 break; 6315 break;
@@ -6158,7 +6370,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6158 spin_lock_irqsave(&rq->lock, flags); 6370 spin_lock_irqsave(&rq->lock, flags);
6159 if (rq->rd) { 6371 if (rq->rd) {
6160 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6372 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6161 cpu_clear(cpu, rq->rd->online); 6373 set_rq_offline(rq);
6162 } 6374 }
6163 spin_unlock_irqrestore(&rq->lock, flags); 6375 spin_unlock_irqrestore(&rq->lock, flags);
6164 break; 6376 break;
@@ -6192,6 +6404,28 @@ void __init migration_init(void)
6192 6404
6193#ifdef CONFIG_SCHED_DEBUG 6405#ifdef CONFIG_SCHED_DEBUG
6194 6406
6407static inline const char *sd_level_to_string(enum sched_domain_level lvl)
6408{
6409 switch (lvl) {
6410 case SD_LV_NONE:
6411 return "NONE";
6412 case SD_LV_SIBLING:
6413 return "SIBLING";
6414 case SD_LV_MC:
6415 return "MC";
6416 case SD_LV_CPU:
6417 return "CPU";
6418 case SD_LV_NODE:
6419 return "NODE";
6420 case SD_LV_ALLNODES:
6421 return "ALLNODES";
6422 case SD_LV_MAX:
6423 return "MAX";
6424
6425 }
6426 return "MAX";
6427}
6428
6195static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6429static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6196 cpumask_t *groupmask) 6430 cpumask_t *groupmask)
6197{ 6431{
@@ -6211,7 +6445,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6211 return -1; 6445 return -1;
6212 } 6446 }
6213 6447
6214 printk(KERN_CONT "span %s\n", str); 6448 printk(KERN_CONT "span %s level %s\n",
6449 str, sd_level_to_string(sd->level));
6215 6450
6216 if (!cpu_isset(cpu, sd->span)) { 6451 if (!cpu_isset(cpu, sd->span)) {
6217 printk(KERN_ERR "ERROR: domain->span does not contain " 6452 printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6295,9 +6530,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6295 } 6530 }
6296 kfree(groupmask); 6531 kfree(groupmask);
6297} 6532}
6298#else 6533#else /* !CONFIG_SCHED_DEBUG */
6299# define sched_domain_debug(sd, cpu) do { } while (0) 6534# define sched_domain_debug(sd, cpu) do { } while (0)
6300#endif 6535#endif /* CONFIG_SCHED_DEBUG */
6301 6536
6302static int sd_degenerate(struct sched_domain *sd) 6537static int sd_degenerate(struct sched_domain *sd)
6303{ 6538{
@@ -6357,20 +6592,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6357static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6592static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6358{ 6593{
6359 unsigned long flags; 6594 unsigned long flags;
6360 const struct sched_class *class;
6361 6595
6362 spin_lock_irqsave(&rq->lock, flags); 6596 spin_lock_irqsave(&rq->lock, flags);
6363 6597
6364 if (rq->rd) { 6598 if (rq->rd) {
6365 struct root_domain *old_rd = rq->rd; 6599 struct root_domain *old_rd = rq->rd;
6366 6600
6367 for (class = sched_class_highest; class; class = class->next) { 6601 if (cpu_isset(rq->cpu, old_rd->online))
6368 if (class->leave_domain) 6602 set_rq_offline(rq);
6369 class->leave_domain(rq);
6370 }
6371 6603
6372 cpu_clear(rq->cpu, old_rd->span); 6604 cpu_clear(rq->cpu, old_rd->span);
6373 cpu_clear(rq->cpu, old_rd->online);
6374 6605
6375 if (atomic_dec_and_test(&old_rd->refcount)) 6606 if (atomic_dec_and_test(&old_rd->refcount))
6376 kfree(old_rd); 6607 kfree(old_rd);
@@ -6381,12 +6612,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6381 6612
6382 cpu_set(rq->cpu, rd->span); 6613 cpu_set(rq->cpu, rd->span);
6383 if (cpu_isset(rq->cpu, cpu_online_map)) 6614 if (cpu_isset(rq->cpu, cpu_online_map))
6384 cpu_set(rq->cpu, rd->online); 6615 set_rq_online(rq);
6385
6386 for (class = sched_class_highest; class; class = class->next) {
6387 if (class->join_domain)
6388 class->join_domain(rq);
6389 }
6390 6616
6391 spin_unlock_irqrestore(&rq->lock, flags); 6617 spin_unlock_irqrestore(&rq->lock, flags);
6392} 6618}
@@ -6397,6 +6623,8 @@ static void init_rootdomain(struct root_domain *rd)
6397 6623
6398 cpus_clear(rd->span); 6624 cpus_clear(rd->span);
6399 cpus_clear(rd->online); 6625 cpus_clear(rd->online);
6626
6627 cpupri_init(&rd->cpupri);
6400} 6628}
6401 6629
6402static void init_defrootdomain(void) 6630static void init_defrootdomain(void)
@@ -6458,7 +6686,8 @@ static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
6458/* Setup the mask of cpus configured for isolated domains */ 6686/* Setup the mask of cpus configured for isolated domains */
6459static int __init isolated_cpu_setup(char *str) 6687static int __init isolated_cpu_setup(char *str)
6460{ 6688{
6461 int ints[NR_CPUS], i; 6689 static int __initdata ints[NR_CPUS];
6690 int i;
6462 6691
6463 str = get_options(str, ARRAY_SIZE(ints), ints); 6692 str = get_options(str, ARRAY_SIZE(ints), ints);
6464 cpus_clear(cpu_isolated_map); 6693 cpus_clear(cpu_isolated_map);
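
The hunk above moves ints[] off the kernel stack: with NR_CPUS in the
thousands on large configurations, an automatic int array in this __init
path could cost tens of kilobytes of stack. A userspace sketch of the same
trade-off:

    #include <stdio.h>

    #define NR_CPUS 4096

    static int parse_setup(void)
    {
            static int ints[NR_CPUS];   /* lives in .bss, not on the stack */
            ints[0] = 1;
            return ints[0];
    }

    int main(void)
    {
            printf("%d\n", parse_setup());
            return 0;
    }
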
@@ -6492,7 +6721,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
6492 6721
6493 cpus_clear(*covered); 6722 cpus_clear(*covered);
6494 6723
6495 for_each_cpu_mask(i, *span) { 6724 for_each_cpu_mask_nr(i, *span) {
6496 struct sched_group *sg; 6725 struct sched_group *sg;
6497 int group = group_fn(i, cpu_map, &sg, tmpmask); 6726 int group = group_fn(i, cpu_map, &sg, tmpmask);
6498 int j; 6727 int j;
@@ -6503,7 +6732,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
6503 cpus_clear(sg->cpumask); 6732 cpus_clear(sg->cpumask);
6504 sg->__cpu_power = 0; 6733 sg->__cpu_power = 0;
6505 6734
6506 for_each_cpu_mask(j, *span) { 6735 for_each_cpu_mask_nr(j, *span) {
6507 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 6736 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6508 continue; 6737 continue;
6509 6738
@@ -6539,9 +6768,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
6539 6768
6540 min_val = INT_MAX; 6769 min_val = INT_MAX;
6541 6770
6542 for (i = 0; i < MAX_NUMNODES; i++) { 6771 for (i = 0; i < nr_node_ids; i++) {
6543 /* Start at @node */ 6772 /* Start at @node */
6544 n = (node + i) % MAX_NUMNODES; 6773 n = (node + i) % nr_node_ids;
6545 6774
6546 if (!nr_cpus_node(n)) 6775 if (!nr_cpus_node(n))
6547 continue; 6776 continue;
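
The loop above visits nodes in ring order starting at @node, wrapping with
"% nr_node_ids" so nearby node IDs are tried first and the scan now stops
at the highest possible node instead of MAX_NUMNODES. Minimal standalone
illustration (hypothetical values):

    #include <stdio.h>

    int main(void)
    {
            const int nr_node_ids = 4, node = 2;

            for (int i = 0; i < nr_node_ids; i++) {
                    int n = (node + i) % nr_node_ids;
                    printf("visit node %d\n", n);   /* 2, 3, 0, 1 */
            }
            return 0;
    }
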
@@ -6591,7 +6820,7 @@ static void sched_domain_node_span(int node, cpumask_t *span)
6591 cpus_or(*span, *span, *nodemask); 6820 cpus_or(*span, *span, *nodemask);
6592 } 6821 }
6593} 6822}
6594#endif 6823#endif /* CONFIG_NUMA */
6595 6824
6596int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6825int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6597 6826
@@ -6610,7 +6839,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6610 *sg = &per_cpu(sched_group_cpus, cpu); 6839 *sg = &per_cpu(sched_group_cpus, cpu);
6611 return cpu; 6840 return cpu;
6612} 6841}
6613#endif 6842#endif /* CONFIG_SCHED_SMT */
6614 6843
6615/* 6844/*
6616 * multi-core sched-domains: 6845 * multi-core sched-domains:
@@ -6618,7 +6847,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6618#ifdef CONFIG_SCHED_MC 6847#ifdef CONFIG_SCHED_MC
6619static DEFINE_PER_CPU(struct sched_domain, core_domains); 6848static DEFINE_PER_CPU(struct sched_domain, core_domains);
6620static DEFINE_PER_CPU(struct sched_group, sched_group_core); 6849static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6621#endif 6850#endif /* CONFIG_SCHED_MC */
6622 6851
6623#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6852#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6624static int 6853static int
@@ -6703,7 +6932,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
6703 if (!sg) 6932 if (!sg)
6704 return; 6933 return;
6705 do { 6934 do {
6706 for_each_cpu_mask(j, sg->cpumask) { 6935 for_each_cpu_mask_nr(j, sg->cpumask) {
6707 struct sched_domain *sd; 6936 struct sched_domain *sd;
6708 6937
6709 sd = &per_cpu(phys_domains, j); 6938 sd = &per_cpu(phys_domains, j);
@@ -6720,7 +6949,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
6720 sg = sg->next; 6949 sg = sg->next;
6721 } while (sg != group_head); 6950 } while (sg != group_head);
6722} 6951}
6723#endif 6952#endif /* CONFIG_NUMA */
6724 6953
6725#ifdef CONFIG_NUMA 6954#ifdef CONFIG_NUMA
6726/* Free memory allocated for various sched_group structures */ 6955/* Free memory allocated for various sched_group structures */
@@ -6728,14 +6957,14 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6728{ 6957{
6729 int cpu, i; 6958 int cpu, i;
6730 6959
6731 for_each_cpu_mask(cpu, *cpu_map) { 6960 for_each_cpu_mask_nr(cpu, *cpu_map) {
6732 struct sched_group **sched_group_nodes 6961 struct sched_group **sched_group_nodes
6733 = sched_group_nodes_bycpu[cpu]; 6962 = sched_group_nodes_bycpu[cpu];
6734 6963
6735 if (!sched_group_nodes) 6964 if (!sched_group_nodes)
6736 continue; 6965 continue;
6737 6966
6738 for (i = 0; i < MAX_NUMNODES; i++) { 6967 for (i = 0; i < nr_node_ids; i++) {
6739 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 6968 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6740 6969
6741 *nodemask = node_to_cpumask(i); 6970 *nodemask = node_to_cpumask(i);
@@ -6757,11 +6986,11 @@ next_sg:
6757 sched_group_nodes_bycpu[cpu] = NULL; 6986 sched_group_nodes_bycpu[cpu] = NULL;
6758 } 6987 }
6759} 6988}
6760#else 6989#else /* !CONFIG_NUMA */
6761static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 6990static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6762{ 6991{
6763} 6992}
6764#endif 6993#endif /* CONFIG_NUMA */
6765 6994
6766/* 6995/*
6767 * Initialize sched groups cpu_power. 6996 * Initialize sched groups cpu_power.
@@ -6928,7 +7157,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
6928 /* 7157 /*
6929 * Allocate the per-node list of sched groups 7158 * Allocate the per-node list of sched groups
6930 */ 7159 */
6931 sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), 7160 sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
6932 GFP_KERNEL); 7161 GFP_KERNEL);
6933 if (!sched_group_nodes) { 7162 if (!sched_group_nodes) {
6934 printk(KERN_WARNING "Can not alloc sched group node list\n"); 7163 printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -6967,7 +7196,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
6967 /* 7196 /*
6968 * Set up domains for cpus specified by the cpu_map. 7197 * Set up domains for cpus specified by the cpu_map.
6969 */ 7198 */
6970 for_each_cpu_mask(i, *cpu_map) { 7199 for_each_cpu_mask_nr(i, *cpu_map) {
6971 struct sched_domain *sd = NULL, *p; 7200 struct sched_domain *sd = NULL, *p;
6972 SCHED_CPUMASK_VAR(nodemask, allmasks); 7201 SCHED_CPUMASK_VAR(nodemask, allmasks);
6973 7202
@@ -7034,7 +7263,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7034 7263
7035#ifdef CONFIG_SCHED_SMT 7264#ifdef CONFIG_SCHED_SMT
7036 /* Set up CPU (sibling) groups */ 7265 /* Set up CPU (sibling) groups */
7037 for_each_cpu_mask(i, *cpu_map) { 7266 for_each_cpu_mask_nr(i, *cpu_map) {
7038 SCHED_CPUMASK_VAR(this_sibling_map, allmasks); 7267 SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
7039 SCHED_CPUMASK_VAR(send_covered, allmasks); 7268 SCHED_CPUMASK_VAR(send_covered, allmasks);
7040 7269
@@ -7051,7 +7280,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7051 7280
7052#ifdef CONFIG_SCHED_MC 7281#ifdef CONFIG_SCHED_MC
7053 /* Set up multi-core groups */ 7282 /* Set up multi-core groups */
7054 for_each_cpu_mask(i, *cpu_map) { 7283 for_each_cpu_mask_nr(i, *cpu_map) {
7055 SCHED_CPUMASK_VAR(this_core_map, allmasks); 7284 SCHED_CPUMASK_VAR(this_core_map, allmasks);
7056 SCHED_CPUMASK_VAR(send_covered, allmasks); 7285 SCHED_CPUMASK_VAR(send_covered, allmasks);
7057 7286
@@ -7067,7 +7296,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7067#endif 7296#endif
7068 7297
7069 /* Set up physical groups */ 7298 /* Set up physical groups */
7070 for (i = 0; i < MAX_NUMNODES; i++) { 7299 for (i = 0; i < nr_node_ids; i++) {
7071 SCHED_CPUMASK_VAR(nodemask, allmasks); 7300 SCHED_CPUMASK_VAR(nodemask, allmasks);
7072 SCHED_CPUMASK_VAR(send_covered, allmasks); 7301 SCHED_CPUMASK_VAR(send_covered, allmasks);
7073 7302
@@ -7091,7 +7320,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7091 send_covered, tmpmask); 7320 send_covered, tmpmask);
7092 } 7321 }
7093 7322
7094 for (i = 0; i < MAX_NUMNODES; i++) { 7323 for (i = 0; i < nr_node_ids; i++) {
7095 /* Set up node groups */ 7324 /* Set up node groups */
7096 struct sched_group *sg, *prev; 7325 struct sched_group *sg, *prev;
7097 SCHED_CPUMASK_VAR(nodemask, allmasks); 7326 SCHED_CPUMASK_VAR(nodemask, allmasks);
@@ -7118,7 +7347,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7118 goto error; 7347 goto error;
7119 } 7348 }
7120 sched_group_nodes[i] = sg; 7349 sched_group_nodes[i] = sg;
7121 for_each_cpu_mask(j, *nodemask) { 7350 for_each_cpu_mask_nr(j, *nodemask) {
7122 struct sched_domain *sd; 7351 struct sched_domain *sd;
7123 7352
7124 sd = &per_cpu(node_domains, j); 7353 sd = &per_cpu(node_domains, j);
@@ -7130,9 +7359,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7130 cpus_or(*covered, *covered, *nodemask); 7359 cpus_or(*covered, *covered, *nodemask);
7131 prev = sg; 7360 prev = sg;
7132 7361
7133 for (j = 0; j < MAX_NUMNODES; j++) { 7362 for (j = 0; j < nr_node_ids; j++) {
7134 SCHED_CPUMASK_VAR(notcovered, allmasks); 7363 SCHED_CPUMASK_VAR(notcovered, allmasks);
7135 int n = (i + j) % MAX_NUMNODES; 7364 int n = (i + j) % nr_node_ids;
7136 node_to_cpumask_ptr(pnodemask, n); 7365 node_to_cpumask_ptr(pnodemask, n);
7137 7366
7138 cpus_complement(*notcovered, *covered); 7367 cpus_complement(*notcovered, *covered);
@@ -7164,28 +7393,28 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7164 7393
7165 /* Calculate CPU power for physical packages and nodes */ 7394 /* Calculate CPU power for physical packages and nodes */
7166#ifdef CONFIG_SCHED_SMT 7395#ifdef CONFIG_SCHED_SMT
7167 for_each_cpu_mask(i, *cpu_map) { 7396 for_each_cpu_mask_nr(i, *cpu_map) {
7168 struct sched_domain *sd = &per_cpu(cpu_domains, i); 7397 struct sched_domain *sd = &per_cpu(cpu_domains, i);
7169 7398
7170 init_sched_groups_power(i, sd); 7399 init_sched_groups_power(i, sd);
7171 } 7400 }
7172#endif 7401#endif
7173#ifdef CONFIG_SCHED_MC 7402#ifdef CONFIG_SCHED_MC
7174 for_each_cpu_mask(i, *cpu_map) { 7403 for_each_cpu_mask_nr(i, *cpu_map) {
7175 struct sched_domain *sd = &per_cpu(core_domains, i); 7404 struct sched_domain *sd = &per_cpu(core_domains, i);
7176 7405
7177 init_sched_groups_power(i, sd); 7406 init_sched_groups_power(i, sd);
7178 } 7407 }
7179#endif 7408#endif
7180 7409
7181 for_each_cpu_mask(i, *cpu_map) { 7410 for_each_cpu_mask_nr(i, *cpu_map) {
7182 struct sched_domain *sd = &per_cpu(phys_domains, i); 7411 struct sched_domain *sd = &per_cpu(phys_domains, i);
7183 7412
7184 init_sched_groups_power(i, sd); 7413 init_sched_groups_power(i, sd);
7185 } 7414 }
7186 7415
7187#ifdef CONFIG_NUMA 7416#ifdef CONFIG_NUMA
7188 for (i = 0; i < MAX_NUMNODES; i++) 7417 for (i = 0; i < nr_node_ids; i++)
7189 init_numa_sched_groups_power(sched_group_nodes[i]); 7418 init_numa_sched_groups_power(sched_group_nodes[i]);
7190 7419
7191 if (sd_allnodes) { 7420 if (sd_allnodes) {
@@ -7198,7 +7427,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7198#endif 7427#endif
7199 7428
7200 /* Attach the domains */ 7429 /* Attach the domains */
7201 for_each_cpu_mask(i, *cpu_map) { 7430 for_each_cpu_mask_nr(i, *cpu_map) {
7202 struct sched_domain *sd; 7431 struct sched_domain *sd;
7203#ifdef CONFIG_SCHED_SMT 7432#ifdef CONFIG_SCHED_SMT
7204 sd = &per_cpu(cpu_domains, i); 7433 sd = &per_cpu(cpu_domains, i);
@@ -7243,18 +7472,6 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
7243} 7472}
7244 7473
7245/* 7474/*
7246 * Free current domain masks.
7247 * Called after all cpus are attached to NULL domain.
7248 */
7249static void free_sched_domains(void)
7250{
7251 ndoms_cur = 0;
7252 if (doms_cur != &fallback_doms)
7253 kfree(doms_cur);
7254 doms_cur = &fallback_doms;
7255}
7256
7257/*
7258 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 7475 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
7259 * For now this just excludes isolated cpus, but could be used to 7476 * For now this just excludes isolated cpus, but could be used to
7260 * exclude other special cases in the future. 7477 * exclude other special cases in the future.
@@ -7293,7 +7510,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
7293 7510
7294 unregister_sched_domain_sysctl(); 7511 unregister_sched_domain_sysctl();
7295 7512
7296 for_each_cpu_mask(i, *cpu_map) 7513 for_each_cpu_mask_nr(i, *cpu_map)
7297 cpu_attach_domain(NULL, &def_root_domain, i); 7514 cpu_attach_domain(NULL, &def_root_domain, i);
7298 synchronize_sched(); 7515 synchronize_sched();
7299 arch_destroy_sched_domains(cpu_map, &tmpmask); 7516 arch_destroy_sched_domains(cpu_map, &tmpmask);
@@ -7332,7 +7549,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7332 * ownership of it and will kfree it when done with it. If the caller 7549 * ownership of it and will kfree it when done with it. If the caller
7333 * failed the kmalloc call, then it can pass in doms_new == NULL, 7550 * failed the kmalloc call, then it can pass in doms_new == NULL,
7334 * and partition_sched_domains() will fallback to the single partition 7551 * and partition_sched_domains() will fallback to the single partition
7335 * 'fallback_doms'. 7552 * 'fallback_doms'; this also forces the domains to be rebuilt.
7336 * 7553 *
7337 * Call with hotplug lock held 7554 * Call with hotplug lock held
7338 */ 7555 */
@@ -7346,12 +7563,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7346 /* always unregister in case we don't destroy any domains */ 7563 /* always unregister in case we don't destroy any domains */
7347 unregister_sched_domain_sysctl(); 7564 unregister_sched_domain_sysctl();
7348 7565
7349 if (doms_new == NULL) { 7566 if (doms_new == NULL)
7350 ndoms_new = 1; 7567 ndoms_new = 0;
7351 doms_new = &fallback_doms;
7352 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7353 dattr_new = NULL;
7354 }
7355 7568
7356 /* Destroy deleted domains */ 7569 /* Destroy deleted domains */
7357 for (i = 0; i < ndoms_cur; i++) { 7570 for (i = 0; i < ndoms_cur; i++) {
@@ -7366,6 +7579,14 @@ match1:
7366 ; 7579 ;
7367 } 7580 }
7368 7581
7582 if (doms_new == NULL) {
7583 ndoms_cur = 0;
7584 ndoms_new = 1;
7585 doms_new = &fallback_doms;
7586 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7587 dattr_new = NULL;
7588 }
7589
7369 /* Build new domains */ 7590 /* Build new domains */
7370 for (i = 0; i < ndoms_new; i++) { 7591 for (i = 0; i < ndoms_new; i++) {
7371 for (j = 0; j < ndoms_cur; j++) { 7592 for (j = 0; j < ndoms_cur; j++) {
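
Around this hunk, partition_sched_domains() compares each current domain
against the new set: unmatched ones are torn down, matches are reused, and
a NULL doms_new now falls back to a single rebuilt partition only after the
destroy pass. A toy version of the match-or-destroy scan (hypothetical
data):

    #include <stdbool.h>
    #include <stdio.h>

    static bool doms_equal(int a, int b) { return a == b; }

    int main(void)
    {
            int cur[] = { 1, 2, 3 }, next[] = { 2, 4 };
            int ncur = 3, nnext = 2;

            for (int i = 0; i < ncur; i++) {
                    bool keep = false;
                    for (int j = 0; j < nnext; j++)
                            if (doms_equal(cur[i], next[j]))
                                    keep = true;    /* "goto match1" in sched.c */
                    printf("domain %d: %s\n", cur[i], keep ? "keep" : "destroy");
            }
            return 0;
    }
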
@@ -7396,17 +7617,10 @@ match2:
7396#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7617#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7397int arch_reinit_sched_domains(void) 7618int arch_reinit_sched_domains(void)
7398{ 7619{
7399 int err;
7400
7401 get_online_cpus(); 7620 get_online_cpus();
7402 mutex_lock(&sched_domains_mutex); 7621 rebuild_sched_domains();
7403 detach_destroy_domains(&cpu_online_map);
7404 free_sched_domains();
7405 err = arch_init_sched_domains(&cpu_online_map);
7406 mutex_unlock(&sched_domains_mutex);
7407 put_online_cpus(); 7622 put_online_cpus();
7408 7623 return 0;
7409 return err;
7410} 7624}
7411 7625
7412static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 7626static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
@@ -7427,11 +7641,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7427} 7641}
7428 7642
7429#ifdef CONFIG_SCHED_MC 7643#ifdef CONFIG_SCHED_MC
7430static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) 7644static ssize_t sched_mc_power_savings_show(struct sys_device *dev,
7645 struct sysdev_attribute *attr, char *page)
7431{ 7646{
7432 return sprintf(page, "%u\n", sched_mc_power_savings); 7647 return sprintf(page, "%u\n", sched_mc_power_savings);
7433} 7648}
7434static ssize_t sched_mc_power_savings_store(struct sys_device *dev, 7649static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
7650 struct sysdev_attribute *attr,
7435 const char *buf, size_t count) 7651 const char *buf, size_t count)
7436{ 7652{
7437 return sched_power_savings_store(buf, count, 0); 7653 return sched_power_savings_store(buf, count, 0);
@@ -7441,11 +7657,13 @@ static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
7441#endif 7657#endif
7442 7658
7443#ifdef CONFIG_SCHED_SMT 7659#ifdef CONFIG_SCHED_SMT
7444static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) 7660static ssize_t sched_smt_power_savings_show(struct sys_device *dev,
7661 struct sysdev_attribute *attr, char *page)
7445{ 7662{
7446 return sprintf(page, "%u\n", sched_smt_power_savings); 7663 return sprintf(page, "%u\n", sched_smt_power_savings);
7447} 7664}
7448static ssize_t sched_smt_power_savings_store(struct sys_device *dev, 7665static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
7666 struct sysdev_attribute *attr,
7449 const char *buf, size_t count) 7667 const char *buf, size_t count)
7450{ 7668{
7451 return sched_power_savings_store(buf, count, 1); 7669 return sched_power_savings_store(buf, count, 1);
@@ -7470,54 +7688,51 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7470#endif 7688#endif
7471 return err; 7689 return err;
7472} 7690}
7473#endif 7691#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7474 7692
7693#ifndef CONFIG_CPUSETS
7475/* 7694/*
7476 * Force a reinitialization of the sched domains hierarchy. The domains 7695 * Add online and remove offline CPUs from the scheduler domains.
7477 * and groups cannot be updated in place without racing with the balancing 7696 * When cpusets are enabled they take over this function.
7478 * code, so we temporarily attach all running cpus to the NULL domain
7479 * which will prevent rebalancing while the sched domains are recalculated.
7480 */ 7697 */
7481static int update_sched_domains(struct notifier_block *nfb, 7698static int update_sched_domains(struct notifier_block *nfb,
7482 unsigned long action, void *hcpu) 7699 unsigned long action, void *hcpu)
7483{ 7700{
7484 switch (action) { 7701 switch (action) {
7485 case CPU_UP_PREPARE: 7702 case CPU_ONLINE:
7486 case CPU_UP_PREPARE_FROZEN: 7703 case CPU_ONLINE_FROZEN:
7704 case CPU_DEAD:
7705 case CPU_DEAD_FROZEN:
7706 partition_sched_domains(0, NULL, NULL);
7707 return NOTIFY_OK;
7708
7709 default:
7710 return NOTIFY_DONE;
7711 }
7712}
7713#endif
7714
7715static int update_runtime(struct notifier_block *nfb,
7716 unsigned long action, void *hcpu)
7717{
7718 int cpu = (int)(long)hcpu;
7719
7720 switch (action) {
7487 case CPU_DOWN_PREPARE: 7721 case CPU_DOWN_PREPARE:
7488 case CPU_DOWN_PREPARE_FROZEN: 7722 case CPU_DOWN_PREPARE_FROZEN:
7489 detach_destroy_domains(&cpu_online_map); 7723 disable_runtime(cpu_rq(cpu));
7490 free_sched_domains();
7491 return NOTIFY_OK; 7724 return NOTIFY_OK;
7492 7725
7493 case CPU_UP_CANCELED:
7494 case CPU_UP_CANCELED_FROZEN:
7495 case CPU_DOWN_FAILED: 7726 case CPU_DOWN_FAILED:
7496 case CPU_DOWN_FAILED_FROZEN: 7727 case CPU_DOWN_FAILED_FROZEN:
7497 case CPU_ONLINE: 7728 case CPU_ONLINE:
7498 case CPU_ONLINE_FROZEN: 7729 case CPU_ONLINE_FROZEN:
7499 case CPU_DEAD: 7730 enable_runtime(cpu_rq(cpu));
7500 case CPU_DEAD_FROZEN: 7731 return NOTIFY_OK;
7501 /* 7732
7502 * Fall through and re-initialise the domains.
7503 */
7504 break;
7505 default: 7733 default:
7506 return NOTIFY_DONE; 7734 return NOTIFY_DONE;
7507 } 7735 }
7508
7509#ifndef CONFIG_CPUSETS
7510 /*
7511 * Create default domain partitioning if cpusets are disabled.
7512 * Otherwise we let cpusets rebuild the domains based on the
7513 * current setup.
7514 */
7515
7516 /* The hotplug lock is already held by cpu_up/cpu_down */
7517 arch_init_sched_domains(&cpu_online_map);
7518#endif
7519
7520 return NOTIFY_OK;
7521} 7736}
7522 7737
7523void __init sched_init_smp(void) 7738void __init sched_init_smp(void)
@@ -7537,8 +7752,15 @@ void __init sched_init_smp(void)
7537 cpu_set(smp_processor_id(), non_isolated_cpus); 7752 cpu_set(smp_processor_id(), non_isolated_cpus);
7538 mutex_unlock(&sched_domains_mutex); 7753 mutex_unlock(&sched_domains_mutex);
7539 put_online_cpus(); 7754 put_online_cpus();
7755
7756#ifndef CONFIG_CPUSETS
7540 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7757 /* XXX: Theoretical race here - CPU may be hotplugged now */
7541 hotcpu_notifier(update_sched_domains, 0); 7758 hotcpu_notifier(update_sched_domains, 0);
7759#endif
7760
7761 /* RT runtime code needs to handle some hotplug events */
7762 hotcpu_notifier(update_runtime, 0);
7763
7542 init_hrtick(); 7764 init_hrtick();
7543 7765
7544 /* Move init over to a non-isolated CPU */ 7766 /* Move init over to a non-isolated CPU */
@@ -7695,8 +7917,8 @@ void __init sched_init(void)
7695 7917
7696 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 7918 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7697 ptr += nr_cpu_ids * sizeof(void **); 7919 ptr += nr_cpu_ids * sizeof(void **);
7698#endif 7920#endif /* CONFIG_USER_SCHED */
7699#endif 7921#endif /* CONFIG_FAIR_GROUP_SCHED */
7700#ifdef CONFIG_RT_GROUP_SCHED 7922#ifdef CONFIG_RT_GROUP_SCHED
7701 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7923 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
7702 ptr += nr_cpu_ids * sizeof(void **); 7924 ptr += nr_cpu_ids * sizeof(void **);
@@ -7710,8 +7932,8 @@ void __init sched_init(void)
7710 7932
7711 root_task_group.rt_rq = (struct rt_rq **)ptr; 7933 root_task_group.rt_rq = (struct rt_rq **)ptr;
7712 ptr += nr_cpu_ids * sizeof(void **); 7934 ptr += nr_cpu_ids * sizeof(void **);
7713#endif 7935#endif /* CONFIG_USER_SCHED */
7714#endif 7936#endif /* CONFIG_RT_GROUP_SCHED */
7715 } 7937 }
7716 7938
7717#ifdef CONFIG_SMP 7939#ifdef CONFIG_SMP
@@ -7727,8 +7949,8 @@ void __init sched_init(void)
7727#ifdef CONFIG_USER_SCHED 7949#ifdef CONFIG_USER_SCHED
7728 init_rt_bandwidth(&root_task_group.rt_bandwidth, 7950 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7729 global_rt_period(), RUNTIME_INF); 7951 global_rt_period(), RUNTIME_INF);
7730#endif 7952#endif /* CONFIG_USER_SCHED */
7731#endif 7953#endif /* CONFIG_RT_GROUP_SCHED */
7732 7954
7733#ifdef CONFIG_GROUP_SCHED 7955#ifdef CONFIG_GROUP_SCHED
7734 list_add(&init_task_group.list, &task_groups); 7956 list_add(&init_task_group.list, &task_groups);
@@ -7738,8 +7960,8 @@ void __init sched_init(void)
7738 INIT_LIST_HEAD(&root_task_group.children); 7960 INIT_LIST_HEAD(&root_task_group.children);
7739 init_task_group.parent = &root_task_group; 7961 init_task_group.parent = &root_task_group;
7740 list_add(&init_task_group.siblings, &root_task_group.children); 7962 list_add(&init_task_group.siblings, &root_task_group.children);
7741#endif 7963#endif /* CONFIG_USER_SCHED */
7742#endif 7964#endif /* CONFIG_GROUP_SCHED */
7743 7965
7744 for_each_possible_cpu(i) { 7966 for_each_possible_cpu(i) {
7745 struct rq *rq; 7967 struct rq *rq;
@@ -7819,6 +8041,7 @@ void __init sched_init(void)
7819 rq->next_balance = jiffies; 8041 rq->next_balance = jiffies;
7820 rq->push_cpu = 0; 8042 rq->push_cpu = 0;
7821 rq->cpu = i; 8043 rq->cpu = i;
8044 rq->online = 0;
7822 rq->migration_thread = NULL; 8045 rq->migration_thread = NULL;
7823 INIT_LIST_HEAD(&rq->migration_queue); 8046 INIT_LIST_HEAD(&rq->migration_queue);
7824 rq_attach_root(rq, &def_root_domain); 8047 rq_attach_root(rq, &def_root_domain);
@@ -7834,7 +8057,7 @@ void __init sched_init(void)
7834#endif 8057#endif
7835 8058
7836#ifdef CONFIG_SMP 8059#ifdef CONFIG_SMP
7837 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); 8060 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
7838#endif 8061#endif
7839 8062
7840#ifdef CONFIG_RT_MUTEXES 8063#ifdef CONFIG_RT_MUTEXES
@@ -8058,7 +8281,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8058{ 8281{
8059 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8282 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
8060} 8283}
8061#else 8284#else /* !CONFIG_FAIR_GROUP_SCHED */
8062static inline void free_fair_sched_group(struct task_group *tg) 8285static inline void free_fair_sched_group(struct task_group *tg)
8063{ 8286{
8064} 8287}
@@ -8076,7 +8299,7 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8076static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8299static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8077{ 8300{
8078} 8301}
8079#endif 8302#endif /* CONFIG_FAIR_GROUP_SCHED */
8080 8303
8081#ifdef CONFIG_RT_GROUP_SCHED 8304#ifdef CONFIG_RT_GROUP_SCHED
8082static void free_rt_sched_group(struct task_group *tg) 8305static void free_rt_sched_group(struct task_group *tg)
@@ -8147,7 +8370,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8147{ 8370{
8148 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); 8371 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8149} 8372}
8150#else 8373#else /* !CONFIG_RT_GROUP_SCHED */
8151static inline void free_rt_sched_group(struct task_group *tg) 8374static inline void free_rt_sched_group(struct task_group *tg)
8152{ 8375{
8153} 8376}
@@ -8165,7 +8388,7 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8165static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 8388static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8166{ 8389{
8167} 8390}
8168#endif 8391#endif /* CONFIG_RT_GROUP_SCHED */
8169 8392
8170#ifdef CONFIG_GROUP_SCHED 8393#ifdef CONFIG_GROUP_SCHED
8171static void free_sched_group(struct task_group *tg) 8394static void free_sched_group(struct task_group *tg)
@@ -8276,17 +8499,14 @@ void sched_move_task(struct task_struct *tsk)
8276 8499
8277 task_rq_unlock(rq, &flags); 8500 task_rq_unlock(rq, &flags);
8278} 8501}
8279#endif 8502#endif /* CONFIG_GROUP_SCHED */
8280 8503
8281#ifdef CONFIG_FAIR_GROUP_SCHED 8504#ifdef CONFIG_FAIR_GROUP_SCHED
8282static void set_se_shares(struct sched_entity *se, unsigned long shares) 8505static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8283{ 8506{
8284 struct cfs_rq *cfs_rq = se->cfs_rq; 8507 struct cfs_rq *cfs_rq = se->cfs_rq;
8285 struct rq *rq = cfs_rq->rq;
8286 int on_rq; 8508 int on_rq;
8287 8509
8288 spin_lock_irq(&rq->lock);
8289
8290 on_rq = se->on_rq; 8510 on_rq = se->on_rq;
8291 if (on_rq) 8511 if (on_rq)
8292 dequeue_entity(cfs_rq, se, 0); 8512 dequeue_entity(cfs_rq, se, 0);
@@ -8296,8 +8516,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
8296 8516
8297 if (on_rq) 8517 if (on_rq)
8298 enqueue_entity(cfs_rq, se, 0); 8518 enqueue_entity(cfs_rq, se, 0);
8519}
8299 8520
8300 spin_unlock_irq(&rq->lock); 8521static void set_se_shares(struct sched_entity *se, unsigned long shares)
8522{
8523 struct cfs_rq *cfs_rq = se->cfs_rq;
8524 struct rq *rq = cfs_rq->rq;
8525 unsigned long flags;
8526
8527 spin_lock_irqsave(&rq->lock, flags);
8528 __set_se_shares(se, shares);
8529 spin_unlock_irqrestore(&rq->lock, flags);
8301} 8530}
8302 8531
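
The refactor above is the common __locked/unlocked split: __set_se_shares()
assumes rq->lock is held, so callers already inside the lock (the group
shares update path) can reuse it, while set_se_shares() takes the lock
itself, now with irqsave. A pthread analogue with hypothetical names:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned long shares;

    static void __set_shares(unsigned long s)   /* caller must hold lock */
    {
            shares = s;
    }

    static void set_shares(unsigned long s)     /* takes the lock itself */
    {
            pthread_mutex_lock(&lock);
            __set_shares(s);
            pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
            set_shares(1024);
            printf("%lu\n", shares);
            return 0;
    }
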
8303static DEFINE_MUTEX(shares_mutex); 8532static DEFINE_MUTEX(shares_mutex);
@@ -8336,8 +8565,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8336 * w/o tripping rebalance_share or load_balance_fair. 8565 * w/o tripping rebalance_share or load_balance_fair.
8337 */ 8566 */
8338 tg->shares = shares; 8567 tg->shares = shares;
8339 for_each_possible_cpu(i) 8568 for_each_possible_cpu(i) {
8569 /*
8570 * force a rebalance
8571 */
8572 cfs_rq_set_shares(tg->cfs_rq[i], 0);
8340 set_se_shares(tg->se[i], shares); 8573 set_se_shares(tg->se[i], shares);
8574 }
8341 8575
8342 /* 8576 /*
8343 * Enable load balance activity on this group, by inserting it back on 8577 * Enable load balance activity on this group, by inserting it back on
@@ -8376,7 +8610,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
8376#ifdef CONFIG_CGROUP_SCHED 8610#ifdef CONFIG_CGROUP_SCHED
8377static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8611static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8378{ 8612{
8379 struct task_group *tgi, *parent = tg ? tg->parent : NULL; 8613 struct task_group *tgi, *parent = tg->parent;
8380 unsigned long total = 0; 8614 unsigned long total = 0;
8381 8615
8382 if (!parent) { 8616 if (!parent) {
@@ -8400,7 +8634,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8400 } 8634 }
8401 rcu_read_unlock(); 8635 rcu_read_unlock();
8402 8636
8403 return total + to_ratio(period, runtime) < 8637 return total + to_ratio(period, runtime) <=
8404 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), 8638 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
8405 parent->rt_bandwidth.rt_runtime); 8639 parent->rt_bandwidth.rt_runtime);
8406} 8640}
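
The relational change from "<" to "<=" above lets a group's children
consume exactly the parent's bandwidth. to_ratio() maps (period, runtime)
to a fixed-point fraction of the period; a sketch assuming a 16-bit scale
(the kernel's exact scaling may differ):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t to_ratio(uint64_t period, uint64_t runtime)
    {
            return (runtime << 16) / period;    /* fraction of the period */
    }

    int main(void)
    {
            uint64_t parent = to_ratio(1000000, 950000);    /* 95% */
            uint64_t child  = to_ratio(1000000, 950000);    /* exact fit */

            /* with "<" this exact fit was rejected; "<=" admits it */
            printf("%s\n", child <= parent ? "schedulable" : "rejected");
            return 0;
    }
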
@@ -8520,16 +8754,21 @@ long sched_group_rt_period(struct task_group *tg)
8520 8754
8521static int sched_rt_global_constraints(void) 8755static int sched_rt_global_constraints(void)
8522{ 8756{
8757 struct task_group *tg = &root_task_group;
8758 u64 rt_runtime, rt_period;
8523 int ret = 0; 8759 int ret = 0;
8524 8760
8761 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8762 rt_runtime = tg->rt_bandwidth.rt_runtime;
8763
8525 mutex_lock(&rt_constraints_mutex); 8764 mutex_lock(&rt_constraints_mutex);
8526 if (!__rt_schedulable(NULL, 1, 0)) 8765 if (!__rt_schedulable(tg, rt_period, rt_runtime))
8527 ret = -EINVAL; 8766 ret = -EINVAL;
8528 mutex_unlock(&rt_constraints_mutex); 8767 mutex_unlock(&rt_constraints_mutex);
8529 8768
8530 return ret; 8769 return ret;
8531} 8770}
8532#else 8771#else /* !CONFIG_RT_GROUP_SCHED */
8533static int sched_rt_global_constraints(void) 8772static int sched_rt_global_constraints(void)
8534{ 8773{
8535 unsigned long flags; 8774 unsigned long flags;
@@ -8547,7 +8786,7 @@ static int sched_rt_global_constraints(void)
8547 8786
8548 return 0; 8787 return 0;
8549} 8788}
8550#endif 8789#endif /* CONFIG_RT_GROUP_SCHED */
8551 8790
8552int sched_rt_handler(struct ctl_table *table, int write, 8791int sched_rt_handler(struct ctl_table *table, int write,
8553 struct file *filp, void __user *buffer, size_t *lenp, 8792 struct file *filp, void __user *buffer, size_t *lenp,
@@ -8655,7 +8894,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8655 8894
8656 return (u64) tg->shares; 8895 return (u64) tg->shares;
8657} 8896}
8658#endif 8897#endif /* CONFIG_FAIR_GROUP_SCHED */
8659 8898
8660#ifdef CONFIG_RT_GROUP_SCHED 8899#ifdef CONFIG_RT_GROUP_SCHED
8661static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 8900static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
@@ -8679,7 +8918,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
8679{ 8918{
8680 return sched_group_rt_period(cgroup_tg(cgrp)); 8919 return sched_group_rt_period(cgroup_tg(cgrp));
8681} 8920}
8682#endif 8921#endif /* CONFIG_RT_GROUP_SCHED */
8683 8922
8684static struct cftype cpu_files[] = { 8923static struct cftype cpu_files[] = {
8685#ifdef CONFIG_FAIR_GROUP_SCHED 8924#ifdef CONFIG_FAIR_GROUP_SCHED