path: root/kernel/sched.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 1215
1 file changed, 742 insertions, 473 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 4e2f60335656..0236958addcb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -70,10 +70,13 @@
70#include <linux/bootmem.h> 70#include <linux/bootmem.h>
71#include <linux/debugfs.h> 71#include <linux/debugfs.h>
72#include <linux/ctype.h> 72#include <linux/ctype.h>
73#include <linux/ftrace.h>
73 74
74#include <asm/tlb.h> 75#include <asm/tlb.h>
75#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
76 77
78#include "sched_cpupri.h"
79
77/* 80/*
78 * Convert user-nice values [ -20 ... 0 ... 19 ] 81 * Convert user-nice values [ -20 ... 0 ... 19 ]
79 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 82 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -289,15 +292,15 @@ struct task_group root_task_group;
289static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 292static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
290/* Default task group's cfs_rq on each cpu */ 293/* Default task group's cfs_rq on each cpu */
291static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 294static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
292#endif 295#endif /* CONFIG_FAIR_GROUP_SCHED */
293 296
294#ifdef CONFIG_RT_GROUP_SCHED 297#ifdef CONFIG_RT_GROUP_SCHED
295static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 298static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
296static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 299static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
297#endif 300#endif /* CONFIG_RT_GROUP_SCHED */
298#else 301#else /* !CONFIG_FAIR_GROUP_SCHED */
299#define root_task_group init_task_group 302#define root_task_group init_task_group
300#endif 303#endif /* CONFIG_FAIR_GROUP_SCHED */
301 304
302/* task_group_lock serializes add/remove of task groups and also changes to 305/* task_group_lock serializes add/remove of task groups and also changes to
303 * a task group's cpu shares. 306 * a task group's cpu shares.
@@ -307,9 +310,9 @@ static DEFINE_SPINLOCK(task_group_lock);
307#ifdef CONFIG_FAIR_GROUP_SCHED 310#ifdef CONFIG_FAIR_GROUP_SCHED
308#ifdef CONFIG_USER_SCHED 311#ifdef CONFIG_USER_SCHED
309# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 312# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
310#else 313#else /* !CONFIG_USER_SCHED */
311# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 314# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
312#endif 315#endif /* CONFIG_USER_SCHED */
313 316
314/* 317/*
315 * A weight of 0 or 1 can cause arithmetics problems. 318 * A weight of 0 or 1 can cause arithmetics problems.
@@ -363,6 +366,10 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
363#else 366#else
364 367
365static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 368static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
369static inline struct task_group *task_group(struct task_struct *p)
370{
371 return NULL;
372}
366 373
367#endif /* CONFIG_GROUP_SCHED */ 374#endif /* CONFIG_GROUP_SCHED */
368 375
@@ -373,6 +380,7 @@ struct cfs_rq {
373 380
374 u64 exec_clock; 381 u64 exec_clock;
375 u64 min_vruntime; 382 u64 min_vruntime;
383 u64 pair_start;
376 384
377 struct rb_root tasks_timeline; 385 struct rb_root tasks_timeline;
378 struct rb_node *rb_leftmost; 386 struct rb_node *rb_leftmost;
@@ -401,6 +409,31 @@ struct cfs_rq {
401 */ 409 */
402 struct list_head leaf_cfs_rq_list; 410 struct list_head leaf_cfs_rq_list;
403 struct task_group *tg; /* group that "owns" this runqueue */ 411 struct task_group *tg; /* group that "owns" this runqueue */
412
413#ifdef CONFIG_SMP
414 /*
415 * the part of load.weight contributed by tasks
416 */
417 unsigned long task_weight;
418
419 /*
420 * h_load = weight * f(tg)
421 *
422 * Where f(tg) is the recursive weight fraction assigned to
423 * this group.
424 */
425 unsigned long h_load;
426
427 /*
428 * this cpu's part of tg->shares
429 */
430 unsigned long shares;
431
432 /*
433 * load.weight at the time we set shares
434 */
435 unsigned long rq_weight;
436#endif
404#endif 437#endif
405}; 438};
406 439
@@ -452,6 +485,9 @@ struct root_domain {
452 */ 485 */
453 cpumask_t rto_mask; 486 cpumask_t rto_mask;
454 atomic_t rto_count; 487 atomic_t rto_count;
488#ifdef CONFIG_SMP
489 struct cpupri cpupri;
490#endif
455}; 491};
456 492
457/* 493/*
@@ -526,14 +562,19 @@ struct rq {
526 int push_cpu; 562 int push_cpu;
527 /* cpu of this runqueue: */ 563 /* cpu of this runqueue: */
528 int cpu; 564 int cpu;
565 int online;
566
567 unsigned long avg_load_per_task;
529 568
530 struct task_struct *migration_thread; 569 struct task_struct *migration_thread;
531 struct list_head migration_queue; 570 struct list_head migration_queue;
532#endif 571#endif
533 572
534#ifdef CONFIG_SCHED_HRTICK 573#ifdef CONFIG_SCHED_HRTICK
535 unsigned long hrtick_flags; 574#ifdef CONFIG_SMP
536 ktime_t hrtick_expire; 575 int hrtick_csd_pending;
576 struct call_single_data hrtick_csd;
577#endif
537 struct hrtimer hrtick_timer; 578 struct hrtimer hrtick_timer;
538#endif 579#endif
539 580
@@ -607,6 +648,24 @@ static inline void update_rq_clock(struct rq *rq)
607# define const_debug static const 648# define const_debug static const
608#endif 649#endif
609 650
651/**
652 * runqueue_is_locked
653 *
654 * Returns true if the current cpu runqueue is locked.
655 * This interface allows printk to be called with the runqueue lock
656 * held and know whether or not it is OK to wake up the klogd.
657 */
658int runqueue_is_locked(void)
659{
660 int cpu = get_cpu();
661 struct rq *rq = cpu_rq(cpu);
662 int ret;
663
664 ret = spin_is_locked(&rq->lock);
665 put_cpu();
666 return ret;
667}
668
610/* 669/*
611 * Debugging: various feature bits 670 * Debugging: various feature bits
612 */ 671 */
@@ -749,6 +808,12 @@ late_initcall(sched_init_debug);
749const_debug unsigned int sysctl_sched_nr_migrate = 32; 808const_debug unsigned int sysctl_sched_nr_migrate = 32;
750 809
751/* 810/*
811 * ratelimit for updating the group shares.
812 * default: 0.5ms
813 */
814const_debug unsigned int sysctl_sched_shares_ratelimit = 500000;
815
816/*
752 * period over which we measure -rt task cpu usage in us. 817 * period over which we measure -rt task cpu usage in us.
753 * default: 1s 818 * default: 1s
754 */ 819 */
@@ -775,82 +840,6 @@ static inline u64 global_rt_runtime(void)
775 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 840 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
776} 841}
777 842
778unsigned long long time_sync_thresh = 100000;
779
780static DEFINE_PER_CPU(unsigned long long, time_offset);
781static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
782
783/*
784 * Global lock which we take every now and then to synchronize
785 * the CPUs time. This method is not warp-safe, but it's good
786 * enough to synchronize slowly diverging time sources and thus
787 * it's good enough for tracing:
788 */
789static DEFINE_SPINLOCK(time_sync_lock);
790static unsigned long long prev_global_time;
791
792static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
793{
794 /*
795 * We want this inlined, to not get tracer function calls
796 * in this critical section:
797 */
798 spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
799 __raw_spin_lock(&time_sync_lock.raw_lock);
800
801 if (time < prev_global_time) {
802 per_cpu(time_offset, cpu) += prev_global_time - time;
803 time = prev_global_time;
804 } else {
805 prev_global_time = time;
806 }
807
808 __raw_spin_unlock(&time_sync_lock.raw_lock);
809 spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
810
811 return time;
812}
813
814static unsigned long long __cpu_clock(int cpu)
815{
816 unsigned long long now;
817
818 /*
819 * Only call sched_clock() if the scheduler has already been
820 * initialized (some code might call cpu_clock() very early):
821 */
822 if (unlikely(!scheduler_running))
823 return 0;
824
825 now = sched_clock_cpu(cpu);
826
827 return now;
828}
829
830/*
831 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
832 * clock constructed from sched_clock():
833 */
834unsigned long long cpu_clock(int cpu)
835{
836 unsigned long long prev_cpu_time, time, delta_time;
837 unsigned long flags;
838
839 local_irq_save(flags);
840 prev_cpu_time = per_cpu(prev_cpu_time, cpu);
841 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
842 delta_time = time-prev_cpu_time;
843
844 if (unlikely(delta_time > time_sync_thresh)) {
845 time = __sync_cpu_clock(time, cpu);
846 per_cpu(prev_cpu_time, cpu) = time;
847 }
848 local_irq_restore(flags);
849
850 return time;
851}
852EXPORT_SYMBOL_GPL(cpu_clock);
853
854#ifndef prepare_arch_switch 843#ifndef prepare_arch_switch
855# define prepare_arch_switch(next) do { } while (0) 844# define prepare_arch_switch(next) do { } while (0)
856#endif 845#endif
@@ -996,13 +985,6 @@ static struct rq *this_rq_lock(void)
996 return rq; 985 return rq;
997} 986}
998 987
999static void __resched_task(struct task_struct *p, int tif_bit);
1000
1001static inline void resched_task(struct task_struct *p)
1002{
1003 __resched_task(p, TIF_NEED_RESCHED);
1004}
1005
1006#ifdef CONFIG_SCHED_HRTICK 988#ifdef CONFIG_SCHED_HRTICK
1007/* 989/*
1008 * Use HR-timers to deliver accurate preemption points. 990 * Use HR-timers to deliver accurate preemption points.
@@ -1014,25 +996,6 @@ static inline void resched_task(struct task_struct *p)
1014 * When we get rescheduled we reprogram the hrtick_timer outside of the 996 * When we get rescheduled we reprogram the hrtick_timer outside of the
1015 * rq->lock. 997 * rq->lock.
1016 */ 998 */
1017static inline void resched_hrt(struct task_struct *p)
1018{
1019 __resched_task(p, TIF_HRTICK_RESCHED);
1020}
1021
1022static inline void resched_rq(struct rq *rq)
1023{
1024 unsigned long flags;
1025
1026 spin_lock_irqsave(&rq->lock, flags);
1027 resched_task(rq->curr);
1028 spin_unlock_irqrestore(&rq->lock, flags);
1029}
1030
1031enum {
1032 HRTICK_SET, /* re-programm hrtick_timer */
1033 HRTICK_RESET, /* not a new slice */
1034 HRTICK_BLOCK, /* stop hrtick operations */
1035};
1036 999
1037/* 1000/*
1038 * Use hrtick when: 1001 * Use hrtick when:
@@ -1043,40 +1006,11 @@ static inline int hrtick_enabled(struct rq *rq)
1043{ 1006{
1044 if (!sched_feat(HRTICK)) 1007 if (!sched_feat(HRTICK))
1045 return 0; 1008 return 0;
1046 if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags))) 1009 if (!cpu_active(cpu_of(rq)))
1047 return 0; 1010 return 0;
1048 return hrtimer_is_hres_active(&rq->hrtick_timer); 1011 return hrtimer_is_hres_active(&rq->hrtick_timer);
1049} 1012}
1050 1013
1051/*
1052 * Called to set the hrtick timer state.
1053 *
1054 * called with rq->lock held and irqs disabled
1055 */
1056static void hrtick_start(struct rq *rq, u64 delay, int reset)
1057{
1058 assert_spin_locked(&rq->lock);
1059
1060 /*
1061 * preempt at: now + delay
1062 */
1063 rq->hrtick_expire =
1064 ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
1065 /*
1066 * indicate we need to program the timer
1067 */
1068 __set_bit(HRTICK_SET, &rq->hrtick_flags);
1069 if (reset)
1070 __set_bit(HRTICK_RESET, &rq->hrtick_flags);
1071
1072 /*
1073 * New slices are called from the schedule path and don't need a
1074 * forced reschedule.
1075 */
1076 if (reset)
1077 resched_hrt(rq->curr);
1078}
1079
1080static void hrtick_clear(struct rq *rq) 1014static void hrtick_clear(struct rq *rq)
1081{ 1015{
1082 if (hrtimer_active(&rq->hrtick_timer)) 1016 if (hrtimer_active(&rq->hrtick_timer))
@@ -1084,32 +1018,6 @@ static void hrtick_clear(struct rq *rq)
1084} 1018}
1085 1019
1086/* 1020/*
1087 * Update the timer from the possible pending state.
1088 */
1089static void hrtick_set(struct rq *rq)
1090{
1091 ktime_t time;
1092 int set, reset;
1093 unsigned long flags;
1094
1095 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1096
1097 spin_lock_irqsave(&rq->lock, flags);
1098 set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
1099 reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
1100 time = rq->hrtick_expire;
1101 clear_thread_flag(TIF_HRTICK_RESCHED);
1102 spin_unlock_irqrestore(&rq->lock, flags);
1103
1104 if (set) {
1105 hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
1106 if (reset && !hrtimer_active(&rq->hrtick_timer))
1107 resched_rq(rq);
1108 } else
1109 hrtick_clear(rq);
1110}
1111
1112/*
1113 * High-resolution timer tick. 1021 * High-resolution timer tick.
1114 * Runs from hardirq context with interrupts disabled. 1022 * Runs from hardirq context with interrupts disabled.
1115 */ 1023 */
@@ -1128,27 +1036,37 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
1128} 1036}
1129 1037
1130#ifdef CONFIG_SMP 1038#ifdef CONFIG_SMP
1131static void hotplug_hrtick_disable(int cpu) 1039/*
1040 * called from hardirq (IPI) context
1041 */
1042static void __hrtick_start(void *arg)
1132{ 1043{
1133 struct rq *rq = cpu_rq(cpu); 1044 struct rq *rq = arg;
1134 unsigned long flags;
1135 1045
1136 spin_lock_irqsave(&rq->lock, flags); 1046 spin_lock(&rq->lock);
1137 rq->hrtick_flags = 0; 1047 hrtimer_restart(&rq->hrtick_timer);
1138 __set_bit(HRTICK_BLOCK, &rq->hrtick_flags); 1048 rq->hrtick_csd_pending = 0;
1139 spin_unlock_irqrestore(&rq->lock, flags); 1049 spin_unlock(&rq->lock);
1140
1141 hrtick_clear(rq);
1142} 1050}
1143 1051
1144static void hotplug_hrtick_enable(int cpu) 1052/*
1053 * Called to set the hrtick timer state.
1054 *
1055 * called with rq->lock held and irqs disabled
1056 */
1057static void hrtick_start(struct rq *rq, u64 delay)
1145{ 1058{
1146 struct rq *rq = cpu_rq(cpu); 1059 struct hrtimer *timer = &rq->hrtick_timer;
1147 unsigned long flags; 1060 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1148 1061
1149 spin_lock_irqsave(&rq->lock, flags); 1062 timer->expires = time;
1150 __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags); 1063
1151 spin_unlock_irqrestore(&rq->lock, flags); 1064 if (rq == this_rq()) {
1065 hrtimer_restart(timer);
1066 } else if (!rq->hrtick_csd_pending) {
1067 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
1068 rq->hrtick_csd_pending = 1;
1069 }
1152} 1070}
1153 1071
1154static int 1072static int
@@ -1163,16 +1081,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1163 case CPU_DOWN_PREPARE_FROZEN: 1081 case CPU_DOWN_PREPARE_FROZEN:
1164 case CPU_DEAD: 1082 case CPU_DEAD:
1165 case CPU_DEAD_FROZEN: 1083 case CPU_DEAD_FROZEN:
1166 hotplug_hrtick_disable(cpu); 1084 hrtick_clear(cpu_rq(cpu));
1167 return NOTIFY_OK;
1168
1169 case CPU_UP_PREPARE:
1170 case CPU_UP_PREPARE_FROZEN:
1171 case CPU_DOWN_FAILED:
1172 case CPU_DOWN_FAILED_FROZEN:
1173 case CPU_ONLINE:
1174 case CPU_ONLINE_FROZEN:
1175 hotplug_hrtick_enable(cpu);
1176 return NOTIFY_OK; 1085 return NOTIFY_OK;
1177 } 1086 }
1178 1087
@@ -1183,46 +1092,45 @@ static void init_hrtick(void)
1183{ 1092{
1184 hotcpu_notifier(hotplug_hrtick, 0); 1093 hotcpu_notifier(hotplug_hrtick, 0);
1185} 1094}
1186#endif /* CONFIG_SMP */ 1095#else
1096/*
1097 * Called to set the hrtick timer state.
1098 *
1099 * called with rq->lock held and irqs disabled
1100 */
1101static void hrtick_start(struct rq *rq, u64 delay)
1102{
1103 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
1104}
1187 1105
1188static void init_rq_hrtick(struct rq *rq) 1106static void init_hrtick(void)
1189{ 1107{
1190 rq->hrtick_flags = 0;
1191 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1192 rq->hrtick_timer.function = hrtick;
1193 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
1194} 1108}
1109#endif /* CONFIG_SMP */
1195 1110
1196void hrtick_resched(void) 1111static void init_rq_hrtick(struct rq *rq)
1197{ 1112{
1198 struct rq *rq; 1113#ifdef CONFIG_SMP
1199 unsigned long flags; 1114 rq->hrtick_csd_pending = 0;
1200 1115
1201 if (!test_thread_flag(TIF_HRTICK_RESCHED)) 1116 rq->hrtick_csd.flags = 0;
1202 return; 1117 rq->hrtick_csd.func = __hrtick_start;
1118 rq->hrtick_csd.info = rq;
1119#endif
1203 1120
1204 local_irq_save(flags); 1121 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1205 rq = cpu_rq(smp_processor_id()); 1122 rq->hrtick_timer.function = hrtick;
1206 hrtick_set(rq); 1123 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
1207 local_irq_restore(flags);
1208} 1124}
1209#else 1125#else
1210static inline void hrtick_clear(struct rq *rq) 1126static inline void hrtick_clear(struct rq *rq)
1211{ 1127{
1212} 1128}
1213 1129
1214static inline void hrtick_set(struct rq *rq)
1215{
1216}
1217
1218static inline void init_rq_hrtick(struct rq *rq) 1130static inline void init_rq_hrtick(struct rq *rq)
1219{ 1131{
1220} 1132}
1221 1133
1222void hrtick_resched(void)
1223{
1224}
1225
1226static inline void init_hrtick(void) 1134static inline void init_hrtick(void)
1227{ 1135{
1228} 1136}
@@ -1241,16 +1149,16 @@ static inline void init_hrtick(void)
1241#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 1149#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1242#endif 1150#endif
1243 1151
1244static void __resched_task(struct task_struct *p, int tif_bit) 1152static void resched_task(struct task_struct *p)
1245{ 1153{
1246 int cpu; 1154 int cpu;
1247 1155
1248 assert_spin_locked(&task_rq(p)->lock); 1156 assert_spin_locked(&task_rq(p)->lock);
1249 1157
1250 if (unlikely(test_tsk_thread_flag(p, tif_bit))) 1158 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
1251 return; 1159 return;
1252 1160
1253 set_tsk_thread_flag(p, tif_bit); 1161 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
1254 1162
1255 cpu = task_cpu(p); 1163 cpu = task_cpu(p);
1256 if (cpu == smp_processor_id()) 1164 if (cpu == smp_processor_id())
@@ -1313,15 +1221,15 @@ void wake_up_idle_cpu(int cpu)
1313 if (!tsk_is_polling(rq->idle)) 1221 if (!tsk_is_polling(rq->idle))
1314 smp_send_reschedule(cpu); 1222 smp_send_reschedule(cpu);
1315} 1223}
1316#endif 1224#endif /* CONFIG_NO_HZ */
1317 1225
1318#else 1226#else /* !CONFIG_SMP */
1319static void __resched_task(struct task_struct *p, int tif_bit) 1227static void resched_task(struct task_struct *p)
1320{ 1228{
1321 assert_spin_locked(&task_rq(p)->lock); 1229 assert_spin_locked(&task_rq(p)->lock);
1322 set_tsk_thread_flag(p, tif_bit); 1230 set_tsk_need_resched(p);
1323} 1231}
1324#endif 1232#endif /* CONFIG_SMP */
1325 1233
1326#if BITS_PER_LONG == 32 1234#if BITS_PER_LONG == 32
1327# define WMULT_CONST (~0UL) 1235# define WMULT_CONST (~0UL)
@@ -1336,6 +1244,9 @@ static void __resched_task(struct task_struct *p, int tif_bit)
1336 */ 1244 */
1337#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 1245#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1338 1246
1247/*
1248 * delta *= weight / lw
1249 */
1339static unsigned long 1250static unsigned long
1340calc_delta_mine(unsigned long delta_exec, unsigned long weight, 1251calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1341 struct load_weight *lw) 1252 struct load_weight *lw)
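
The new one-line comment states what calc_delta_mine() computes: the execution delta scaled by weight/lw. As a rough standalone model only (the kernel routine avoids the 64-bit divide by multiplying with a precomputed inverse weight and rounding via the SRR() macro above), the scaling amounts to:

#include <stdio.h>
#include <stdint.h>

/* illustrative model of "delta *= weight / lw" with round-to-nearest */
static uint64_t scale_delta(uint64_t delta_exec, unsigned long weight,
			    unsigned long lw_weight)
{
	return (delta_exec * weight + lw_weight / 2) / lw_weight;
}

int main(void)
{
	/* a nice-0 task (weight 1024) on a queue with total weight 3072 */
	printf("%llu\n", (unsigned long long)scale_delta(6000000, 1024, 3072));
	/* prints 2000000: the task is entitled to a third of the 6ms delta */
	return 0;
}
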
@@ -1363,12 +1274,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1363 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 1274 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1364} 1275}
1365 1276
1366static inline unsigned long
1367calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
1368{
1369 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
1370}
1371
1372static inline void update_load_add(struct load_weight *lw, unsigned long inc) 1277static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1373{ 1278{
1374 lw->weight += inc; 1279 lw->weight += inc;
@@ -1479,17 +1384,211 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1479#ifdef CONFIG_SMP 1384#ifdef CONFIG_SMP
1480static unsigned long source_load(int cpu, int type); 1385static unsigned long source_load(int cpu, int type);
1481static unsigned long target_load(int cpu, int type); 1386static unsigned long target_load(int cpu, int type);
1482static unsigned long cpu_avg_load_per_task(int cpu);
1483static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1387static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1484#else /* CONFIG_SMP */ 1388
1389static unsigned long cpu_avg_load_per_task(int cpu)
1390{
1391 struct rq *rq = cpu_rq(cpu);
1392
1393 if (rq->nr_running)
1394 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1395
1396 return rq->avg_load_per_task;
1397}
1485 1398
1486#ifdef CONFIG_FAIR_GROUP_SCHED 1399#ifdef CONFIG_FAIR_GROUP_SCHED
1487static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) 1400
1401typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
1402
1403/*
1404 * Iterate the full tree, calling @down when first entering a node and @up when
1405 * leaving it for the final time.
1406 */
1407static void
1408walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
1409{
1410 struct task_group *parent, *child;
1411
1412 rcu_read_lock();
1413 parent = &root_task_group;
1414down:
1415 (*down)(parent, cpu, sd);
1416 list_for_each_entry_rcu(child, &parent->children, siblings) {
1417 parent = child;
1418 goto down;
1419
1420up:
1421 continue;
1422 }
1423 (*up)(parent, cpu, sd);
1424
1425 child = parent;
1426 parent = parent->parent;
1427 if (parent)
1428 goto up;
1429 rcu_read_unlock();
1430}
1431
1432static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1433
1434/*
1435 * Calculate and set the cpu's group shares.
1436 */
1437static void
1438__update_group_shares_cpu(struct task_group *tg, int cpu,
1439 unsigned long sd_shares, unsigned long sd_rq_weight)
1488{ 1440{
1441 int boost = 0;
1442 unsigned long shares;
1443 unsigned long rq_weight;
1444
1445 if (!tg->se[cpu])
1446 return;
1447
1448 rq_weight = tg->cfs_rq[cpu]->load.weight;
1449
1450 /*
1451 * If there are currently no tasks on the cpu pretend there is one of
1452 * average load so that when a new task gets to run here it will not
1453 * get delayed by group starvation.
1454 */
1455 if (!rq_weight) {
1456 boost = 1;
1457 rq_weight = NICE_0_LOAD;
1458 }
1459
1460 if (unlikely(rq_weight > sd_rq_weight))
1461 rq_weight = sd_rq_weight;
1462
1463 /*
1464 * \Sum shares * rq_weight
1465 * shares = -----------------------
1466 * \Sum rq_weight
1467 *
1468 */
1469 shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
1470
1471 /*
1472 * record the actual number of shares, not the boosted amount.
1473 */
1474 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1475 tg->cfs_rq[cpu]->rq_weight = rq_weight;
1476
1477 if (shares < MIN_SHARES)
1478 shares = MIN_SHARES;
1479 else if (shares > MAX_SHARES)
1480 shares = MAX_SHARES;
1481
1482 __set_se_shares(tg->se[cpu], shares);
1483}
1484
1485/*
1486 * Re-compute the task group their per cpu shares over the given domain.
1487 * This needs to be done in a bottom-up fashion because the rq weight of a
1488 * parent group depends on the shares of its child groups.
1489 */
1490static void
1491tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1492{
1493 unsigned long rq_weight = 0;
1494 unsigned long shares = 0;
1495 int i;
1496
1497 for_each_cpu_mask(i, sd->span) {
1498 rq_weight += tg->cfs_rq[i]->load.weight;
1499 shares += tg->cfs_rq[i]->shares;
1500 }
1501
1502 if ((!shares && rq_weight) || shares > tg->shares)
1503 shares = tg->shares;
1504
1505 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1506 shares = tg->shares;
1507
1508 if (!rq_weight)
1509 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
1510
1511 for_each_cpu_mask(i, sd->span) {
1512 struct rq *rq = cpu_rq(i);
1513 unsigned long flags;
1514
1515 spin_lock_irqsave(&rq->lock, flags);
1516 __update_group_shares_cpu(tg, i, shares, rq_weight);
1517 spin_unlock_irqrestore(&rq->lock, flags);
1518 }
1489} 1519}
1520
1521/*
1522 * Compute the cpu's hierarchical load factor for each task group.
1523 * This needs to be done in a top-down fashion because the load of a child
1524 * group is a fraction of its parents load.
1525 */
1526static void
1527tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
1528{
1529 unsigned long load;
1530
1531 if (!tg->parent) {
1532 load = cpu_rq(cpu)->load.weight;
1533 } else {
1534 load = tg->parent->cfs_rq[cpu]->h_load;
1535 load *= tg->cfs_rq[cpu]->shares;
1536 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1537 }
1538
1539 tg->cfs_rq[cpu]->h_load = load;
1540}
1541
1542static void
1543tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
1544{
1545}
1546
1547static void update_shares(struct sched_domain *sd)
1548{
1549 u64 now = cpu_clock(raw_smp_processor_id());
1550 s64 elapsed = now - sd->last_update;
1551
1552 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1553 sd->last_update = now;
1554 walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
1555 }
1556}
1557
1558static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1559{
1560 spin_unlock(&rq->lock);
1561 update_shares(sd);
1562 spin_lock(&rq->lock);
1563}
1564
1565static void update_h_load(int cpu)
1566{
1567 walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
1568}
1569
1570#else
1571
1572static inline void update_shares(struct sched_domain *sd)
1573{
1574}
1575
1576static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1577{
1578}
1579
1490#endif 1580#endif
1491 1581
1492#endif /* CONFIG_SMP */ 1582#endif
1583
1584#ifdef CONFIG_FAIR_GROUP_SCHED
1585static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1586{
1587#ifdef CONFIG_SMP
1588 cfs_rq->shares = shares;
1589#endif
1590}
1591#endif
1493 1592
1494#include "sched_stats.h" 1593#include "sched_stats.h"
1495#include "sched_idletask.c" 1594#include "sched_idletask.c"
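
The share-distribution formula in the hunk above (shares_i = Σshares * rq_weight_i / Σrq_weight) and the top-down h_load recursion of tg_load_down() are easier to see with concrete numbers. The following is a standalone sketch with made-up weights; the kernel additionally boosts idle cpus to NICE_0_LOAD, clamps to MIN_SHARES/MAX_SHARES, and rate-limits updates via the new sysctl_sched_shares_ratelimit.

#include <stdio.h>

int main(void)
{
	unsigned long tg_shares = 1024;			/* tg->shares */
	unsigned long rq_weight[2] = { 3072, 1024 };	/* tg->cfs_rq[i]->load.weight */
	unsigned long sum = rq_weight[0] + rq_weight[1];
	unsigned long shares[2];
	int i;

	/*	      \Sum shares * rq_weight_i
	 * shares_i = -------------------------
	 *		   \Sum rq_weight
	 */
	for (i = 0; i < 2; i++) {
		shares[i] = tg_shares * rq_weight[i] / (sum + 1);
		printf("cpu%d shares = %lu\n", i, shares[i]);	/* 767, 255 */
	}

	/*
	 * h_load: the cpu's load attributable to this group, computed
	 * top-down as parent_h_load * shares / (parent_load + 1).  Here the
	 * parent is the root, so assume its h_load and load are simply the
	 * cpu's rq load (4096, made up).
	 */
	{
		unsigned long parent_h_load = 4096, parent_load = 4096;
		unsigned long h_load = parent_h_load * shares[0] / (parent_load + 1);

		printf("cpu0 h_load = %lu\n", h_load);	/* 766 */
	}
	return 0;
}
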
@@ -1500,27 +1599,17 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1500#endif 1599#endif
1501 1600
1502#define sched_class_highest (&rt_sched_class) 1601#define sched_class_highest (&rt_sched_class)
1602#define for_each_class(class) \
1603 for (class = sched_class_highest; class; class = class->next)
1503 1604
1504static inline void inc_load(struct rq *rq, const struct task_struct *p) 1605static void inc_nr_running(struct rq *rq)
1505{
1506 update_load_add(&rq->load, p->se.load.weight);
1507}
1508
1509static inline void dec_load(struct rq *rq, const struct task_struct *p)
1510{
1511 update_load_sub(&rq->load, p->se.load.weight);
1512}
1513
1514static void inc_nr_running(struct task_struct *p, struct rq *rq)
1515{ 1606{
1516 rq->nr_running++; 1607 rq->nr_running++;
1517 inc_load(rq, p);
1518} 1608}
1519 1609
1520static void dec_nr_running(struct task_struct *p, struct rq *rq) 1610static void dec_nr_running(struct rq *rq)
1521{ 1611{
1522 rq->nr_running--; 1612 rq->nr_running--;
1523 dec_load(rq, p);
1524} 1613}
1525 1614
1526static void set_load_weight(struct task_struct *p) 1615static void set_load_weight(struct task_struct *p)
@@ -1544,6 +1633,12 @@ static void set_load_weight(struct task_struct *p)
1544 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1633 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1545} 1634}
1546 1635
1636static void update_avg(u64 *avg, u64 sample)
1637{
1638 s64 diff = sample - *avg;
1639 *avg += diff >> 3;
1640}
1641
1547static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1642static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1548{ 1643{
1549 sched_info_queued(p); 1644 sched_info_queued(p);
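
The new update_avg() helper, used by the dequeue path in the following hunk to track se.avg_overlap, is an exponential moving average that moves 1/8 of the way toward each new sample. A minimal standalone illustration with made-up overlap samples (nanoseconds):

#include <stdio.h>

static void update_avg(unsigned long long *avg, unsigned long long sample)
{
	/* signed diff, then arithmetic shift by 3, as in the kernel helper */
	long long diff = (long long)(sample - *avg);

	*avg += diff >> 3;
}

int main(void)
{
	unsigned long long samples[] = { 80000, 80000, 80000, 8000 };
	unsigned long long avg = 0;
	int i;

	for (i = 0; i < 4; i++) {
		update_avg(&avg, samples[i]);
		printf("sample=%llu avg=%llu\n", samples[i], avg);
	}
	return 0;
}
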
@@ -1553,6 +1648,13 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1553 1648
1554static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1649static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1555{ 1650{
1651 if (sleep && p->se.last_wakeup) {
1652 update_avg(&p->se.avg_overlap,
1653 p->se.sum_exec_runtime - p->se.last_wakeup);
1654 p->se.last_wakeup = 0;
1655 }
1656
1657 sched_info_dequeued(p);
1556 p->sched_class->dequeue_task(rq, p, sleep); 1658 p->sched_class->dequeue_task(rq, p, sleep);
1557 p->se.on_rq = 0; 1659 p->se.on_rq = 0;
1558} 1660}
@@ -1612,7 +1714,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1612 rq->nr_uninterruptible--; 1714 rq->nr_uninterruptible--;
1613 1715
1614 enqueue_task(rq, p, wakeup); 1716 enqueue_task(rq, p, wakeup);
1615 inc_nr_running(p, rq); 1717 inc_nr_running(rq);
1616} 1718}
1617 1719
1618/* 1720/*
@@ -1624,7 +1726,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1624 rq->nr_uninterruptible++; 1726 rq->nr_uninterruptible++;
1625 1727
1626 dequeue_task(rq, p, sleep); 1728 dequeue_task(rq, p, sleep);
1627 dec_nr_running(p, rq); 1729 dec_nr_running(rq);
1628} 1730}
1629 1731
1630/** 1732/**
@@ -1636,12 +1738,6 @@ inline int task_curr(const struct task_struct *p)
1636 return cpu_curr(task_cpu(p)) == p; 1738 return cpu_curr(task_cpu(p)) == p;
1637} 1739}
1638 1740
1639/* Used instead of source_load when we know the type == 0 */
1640unsigned long weighted_cpuload(const int cpu)
1641{
1642 return cpu_rq(cpu)->load.weight;
1643}
1644
1645static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1741static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1646{ 1742{
1647 set_task_rq(p, cpu); 1743 set_task_rq(p, cpu);
@@ -1670,6 +1766,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1670 1766
1671#ifdef CONFIG_SMP 1767#ifdef CONFIG_SMP
1672 1768
1769/* Used instead of source_load when we know the type == 0 */
1770static unsigned long weighted_cpuload(const int cpu)
1771{
1772 return cpu_rq(cpu)->load.weight;
1773}
1774
1673/* 1775/*
1674 * Is this task likely cache-hot: 1776 * Is this task likely cache-hot:
1675 */ 1777 */
@@ -1765,16 +1867,24 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1765/* 1867/*
1766 * wait_task_inactive - wait for a thread to unschedule. 1868 * wait_task_inactive - wait for a thread to unschedule.
1767 * 1869 *
1870 * If @match_state is nonzero, it's the @p->state value just checked and
1871 * not expected to change. If it changes, i.e. @p might have woken up,
1872 * then return zero. When we succeed in waiting for @p to be off its CPU,
1873 * we return a positive number (its total switch count). If a second call
1874 * a short while later returns the same number, the caller can be sure that
1875 * @p has remained unscheduled the whole time.
1876 *
1768 * The caller must ensure that the task *will* unschedule sometime soon, 1877 * The caller must ensure that the task *will* unschedule sometime soon,
1769 * else this function might spin for a *long* time. This function can't 1878 * else this function might spin for a *long* time. This function can't
1770 * be called with interrupts off, or it may introduce deadlock with 1879 * be called with interrupts off, or it may introduce deadlock with
1771 * smp_call_function() if an IPI is sent by the same process we are 1880 * smp_call_function() if an IPI is sent by the same process we are
1772 * waiting to become inactive. 1881 * waiting to become inactive.
1773 */ 1882 */
1774void wait_task_inactive(struct task_struct *p) 1883unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1775{ 1884{
1776 unsigned long flags; 1885 unsigned long flags;
1777 int running, on_rq; 1886 int running, on_rq;
1887 unsigned long ncsw;
1778 struct rq *rq; 1888 struct rq *rq;
1779 1889
1780 for (;;) { 1890 for (;;) {
@@ -1797,8 +1907,11 @@ void wait_task_inactive(struct task_struct *p)
1797 * return false if the runqueue has changed and p 1907 * return false if the runqueue has changed and p
1798 * is actually now running somewhere else! 1908 * is actually now running somewhere else!
1799 */ 1909 */
1800 while (task_running(rq, p)) 1910 while (task_running(rq, p)) {
1911 if (match_state && unlikely(p->state != match_state))
1912 return 0;
1801 cpu_relax(); 1913 cpu_relax();
1914 }
1802 1915
1803 /* 1916 /*
1804 * Ok, time to look more closely! We need the rq 1917 * Ok, time to look more closely! We need the rq
@@ -1808,9 +1921,21 @@ void wait_task_inactive(struct task_struct *p)
1808 rq = task_rq_lock(p, &flags); 1921 rq = task_rq_lock(p, &flags);
1809 running = task_running(rq, p); 1922 running = task_running(rq, p);
1810 on_rq = p->se.on_rq; 1923 on_rq = p->se.on_rq;
1924 ncsw = 0;
1925 if (!match_state || p->state == match_state) {
1926 ncsw = p->nivcsw + p->nvcsw;
1927 if (unlikely(!ncsw))
1928 ncsw = 1;
1929 }
1811 task_rq_unlock(rq, &flags); 1930 task_rq_unlock(rq, &flags);
1812 1931
1813 /* 1932 /*
1933 * If it changed from the expected state, bail out now.
1934 */
1935 if (unlikely(!ncsw))
1936 break;
1937
1938 /*
1814 * Was it really running after all now that we 1939 * Was it really running after all now that we
1815 * checked with the proper locks actually held? 1940 * checked with the proper locks actually held?
1816 * 1941 *
@@ -1842,6 +1967,8 @@ void wait_task_inactive(struct task_struct *p)
1842 */ 1967 */
1843 break; 1968 break;
1844 } 1969 }
1970
1971 return ncsw;
1845} 1972}
1846 1973
1847/*** 1974/***
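
A hypothetical caller fragment (not part of this patch) showing how the new return value is meant to be used, per the comment above: a nonzero, unchanged context-switch count across two calls proves the task never ran in between.

	unsigned long ncsw;

	ncsw = wait_task_inactive(p, TASK_UNINTERRUPTIBLE);
	if (!ncsw)
		return -EAGAIN;	/* p left TASK_UNINTERRUPTIBLE, e.g. it woke up */

	/* ... operate on the parked task ... */

	if (wait_task_inactive(p, TASK_UNINTERRUPTIBLE) != ncsw)
		return -EAGAIN;	/* p was scheduled in the meantime */
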
@@ -1880,7 +2007,7 @@ static unsigned long source_load(int cpu, int type)
1880 struct rq *rq = cpu_rq(cpu); 2007 struct rq *rq = cpu_rq(cpu);
1881 unsigned long total = weighted_cpuload(cpu); 2008 unsigned long total = weighted_cpuload(cpu);
1882 2009
1883 if (type == 0) 2010 if (type == 0 || !sched_feat(LB_BIAS))
1884 return total; 2011 return total;
1885 2012
1886 return min(rq->cpu_load[type-1], total); 2013 return min(rq->cpu_load[type-1], total);
@@ -1895,25 +2022,13 @@ static unsigned long target_load(int cpu, int type)
1895 struct rq *rq = cpu_rq(cpu); 2022 struct rq *rq = cpu_rq(cpu);
1896 unsigned long total = weighted_cpuload(cpu); 2023 unsigned long total = weighted_cpuload(cpu);
1897 2024
1898 if (type == 0) 2025 if (type == 0 || !sched_feat(LB_BIAS))
1899 return total; 2026 return total;
1900 2027
1901 return max(rq->cpu_load[type-1], total); 2028 return max(rq->cpu_load[type-1], total);
1902} 2029}
1903 2030
1904/* 2031/*
1905 * Return the average load per task on the cpu's run queue
1906 */
1907static unsigned long cpu_avg_load_per_task(int cpu)
1908{
1909 struct rq *rq = cpu_rq(cpu);
1910 unsigned long total = weighted_cpuload(cpu);
1911 unsigned long n = rq->nr_running;
1912
1913 return n ? total / n : SCHED_LOAD_SCALE;
1914}
1915
1916/*
1917 * find_idlest_group finds and returns the least busy CPU group within the 2032 * find_idlest_group finds and returns the least busy CPU group within the
1918 * domain. 2033 * domain.
1919 */ 2034 */
@@ -1939,7 +2054,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1939 /* Tally up the load of all CPUs in the group */ 2054 /* Tally up the load of all CPUs in the group */
1940 avg_load = 0; 2055 avg_load = 0;
1941 2056
1942 for_each_cpu_mask(i, group->cpumask) { 2057 for_each_cpu_mask_nr(i, group->cpumask) {
1943 /* Bias balancing toward cpus of our domain */ 2058 /* Bias balancing toward cpus of our domain */
1944 if (local_group) 2059 if (local_group)
1945 load = source_load(i, load_idx); 2060 load = source_load(i, load_idx);
@@ -1981,7 +2096,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
1981 /* Traverse only the allowed CPUs */ 2096 /* Traverse only the allowed CPUs */
1982 cpus_and(*tmp, group->cpumask, p->cpus_allowed); 2097 cpus_and(*tmp, group->cpumask, p->cpus_allowed);
1983 2098
1984 for_each_cpu_mask(i, *tmp) { 2099 for_each_cpu_mask_nr(i, *tmp) {
1985 load = weighted_cpuload(i); 2100 load = weighted_cpuload(i);
1986 2101
1987 if (load < min_load || (load == min_load && i == this_cpu)) { 2102 if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -2019,6 +2134,9 @@ static int sched_balance_self(int cpu, int flag)
2019 sd = tmp; 2134 sd = tmp;
2020 } 2135 }
2021 2136
2137 if (sd)
2138 update_shares(sd);
2139
2022 while (sd) { 2140 while (sd) {
2023 cpumask_t span, tmpmask; 2141 cpumask_t span, tmpmask;
2024 struct sched_group *group; 2142 struct sched_group *group;
@@ -2085,6 +2203,22 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2085 if (!sched_feat(SYNC_WAKEUPS)) 2203 if (!sched_feat(SYNC_WAKEUPS))
2086 sync = 0; 2204 sync = 0;
2087 2205
2206#ifdef CONFIG_SMP
2207 if (sched_feat(LB_WAKEUP_UPDATE)) {
2208 struct sched_domain *sd;
2209
2210 this_cpu = raw_smp_processor_id();
2211 cpu = task_cpu(p);
2212
2213 for_each_domain(this_cpu, sd) {
2214 if (cpu_isset(cpu, sd->span)) {
2215 update_shares(sd);
2216 break;
2217 }
2218 }
2219 }
2220#endif
2221
2088 smp_wmb(); 2222 smp_wmb();
2089 rq = task_rq_lock(p, &flags); 2223 rq = task_rq_lock(p, &flags);
2090 old_state = p->state; 2224 old_state = p->state;
@@ -2131,7 +2265,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2131 } 2265 }
2132 } 2266 }
2133 } 2267 }
2134#endif 2268#endif /* CONFIG_SCHEDSTATS */
2135 2269
2136out_activate: 2270out_activate:
2137#endif /* CONFIG_SMP */ 2271#endif /* CONFIG_SMP */
@@ -2149,6 +2283,9 @@ out_activate:
2149 success = 1; 2283 success = 1;
2150 2284
2151out_running: 2285out_running:
2286 trace_mark(kernel_sched_wakeup,
2287 "pid %d state %ld ## rq %p task %p rq->curr %p",
2288 p->pid, p->state, rq, p, rq->curr);
2152 check_preempt_curr(rq, p); 2289 check_preempt_curr(rq, p);
2153 2290
2154 p->state = TASK_RUNNING; 2291 p->state = TASK_RUNNING;
@@ -2157,6 +2294,8 @@ out_running:
2157 p->sched_class->task_wake_up(rq, p); 2294 p->sched_class->task_wake_up(rq, p);
2158#endif 2295#endif
2159out: 2296out:
2297 current->se.last_wakeup = current->se.sum_exec_runtime;
2298
2160 task_rq_unlock(rq, &flags); 2299 task_rq_unlock(rq, &flags);
2161 2300
2162 return success; 2301 return success;
@@ -2277,8 +2416,11 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2277 * management (if any): 2416 * management (if any):
2278 */ 2417 */
2279 p->sched_class->task_new(rq, p); 2418 p->sched_class->task_new(rq, p);
2280 inc_nr_running(p, rq); 2419 inc_nr_running(rq);
2281 } 2420 }
2421 trace_mark(kernel_sched_wakeup_new,
2422 "pid %d state %ld ## rq %p task %p rq->curr %p",
2423 p->pid, p->state, rq, p, rq->curr);
2282 check_preempt_curr(rq, p); 2424 check_preempt_curr(rq, p);
2283#ifdef CONFIG_SMP 2425#ifdef CONFIG_SMP
2284 if (p->sched_class->task_wake_up) 2426 if (p->sched_class->task_wake_up)
@@ -2331,7 +2473,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
2331 notifier->ops->sched_out(notifier, next); 2473 notifier->ops->sched_out(notifier, next);
2332} 2474}
2333 2475
2334#else 2476#else /* !CONFIG_PREEMPT_NOTIFIERS */
2335 2477
2336static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2478static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2337{ 2479{
@@ -2343,7 +2485,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
2343{ 2485{
2344} 2486}
2345 2487
2346#endif 2488#endif /* CONFIG_PREEMPT_NOTIFIERS */
2347 2489
2348/** 2490/**
2349 * prepare_task_switch - prepare to switch tasks 2491 * prepare_task_switch - prepare to switch tasks
@@ -2451,6 +2593,11 @@ context_switch(struct rq *rq, struct task_struct *prev,
2451 struct mm_struct *mm, *oldmm; 2593 struct mm_struct *mm, *oldmm;
2452 2594
2453 prepare_task_switch(rq, prev, next); 2595 prepare_task_switch(rq, prev, next);
2596 trace_mark(kernel_sched_schedule,
2597 "prev_pid %d next_pid %d prev_state %ld "
2598 "## rq %p prev %p next %p",
2599 prev->pid, next->pid, prev->state,
2600 rq, prev, next);
2454 mm = next->mm; 2601 mm = next->mm;
2455 oldmm = prev->active_mm; 2602 oldmm = prev->active_mm;
2456 /* 2603 /*
@@ -2680,7 +2827,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2680 2827
2681 rq = task_rq_lock(p, &flags); 2828 rq = task_rq_lock(p, &flags);
2682 if (!cpu_isset(dest_cpu, p->cpus_allowed) 2829 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2683 || unlikely(cpu_is_offline(dest_cpu))) 2830 || unlikely(!cpu_active(dest_cpu)))
2684 goto out; 2831 goto out;
2685 2832
2686 /* force the process onto the specified CPU */ 2833 /* force the process onto the specified CPU */
@@ -2785,7 +2932,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2785 enum cpu_idle_type idle, int *all_pinned, 2932 enum cpu_idle_type idle, int *all_pinned,
2786 int *this_best_prio, struct rq_iterator *iterator) 2933 int *this_best_prio, struct rq_iterator *iterator)
2787{ 2934{
2788 int loops = 0, pulled = 0, pinned = 0, skip_for_load; 2935 int loops = 0, pulled = 0, pinned = 0;
2789 struct task_struct *p; 2936 struct task_struct *p;
2790 long rem_load_move = max_load_move; 2937 long rem_load_move = max_load_move;
2791 2938
@@ -2801,14 +2948,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2801next: 2948next:
2802 if (!p || loops++ > sysctl_sched_nr_migrate) 2949 if (!p || loops++ > sysctl_sched_nr_migrate)
2803 goto out; 2950 goto out;
2804 /* 2951
2805 * To help distribute high priority tasks across CPUs we don't 2952 if ((p->se.load.weight >> 1) > rem_load_move ||
2806 * skip a task if it will be the highest priority task (i.e. smallest
2807 * prio value) on its new queue regardless of its load weight
2808 */
2809 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2810 SCHED_LOAD_SCALE_FUZZ;
2811 if ((skip_for_load && p->prio >= *this_best_prio) ||
2812 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { 2953 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2813 p = iterator->next(iterator->arg); 2954 p = iterator->next(iterator->arg);
2814 goto next; 2955 goto next;
@@ -2863,6 +3004,10 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2863 max_load_move - total_load_moved, 3004 max_load_move - total_load_moved,
2864 sd, idle, all_pinned, &this_best_prio); 3005 sd, idle, all_pinned, &this_best_prio);
2865 class = class->next; 3006 class = class->next;
3007
3008 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3009 break;
3010
2866 } while (class && max_load_move > total_load_moved); 3011 } while (class && max_load_move > total_load_moved);
2867 3012
2868 return total_load_moved > 0; 3013 return total_load_moved > 0;
@@ -2939,6 +3084,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2939 max_load = this_load = total_load = total_pwr = 0; 3084 max_load = this_load = total_load = total_pwr = 0;
2940 busiest_load_per_task = busiest_nr_running = 0; 3085 busiest_load_per_task = busiest_nr_running = 0;
2941 this_load_per_task = this_nr_running = 0; 3086 this_load_per_task = this_nr_running = 0;
3087
2942 if (idle == CPU_NOT_IDLE) 3088 if (idle == CPU_NOT_IDLE)
2943 load_idx = sd->busy_idx; 3089 load_idx = sd->busy_idx;
2944 else if (idle == CPU_NEWLY_IDLE) 3090 else if (idle == CPU_NEWLY_IDLE)
@@ -2953,6 +3099,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2953 int __group_imb = 0; 3099 int __group_imb = 0;
2954 unsigned int balance_cpu = -1, first_idle_cpu = 0; 3100 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2955 unsigned long sum_nr_running, sum_weighted_load; 3101 unsigned long sum_nr_running, sum_weighted_load;
3102 unsigned long sum_avg_load_per_task;
3103 unsigned long avg_load_per_task;
2956 3104
2957 local_group = cpu_isset(this_cpu, group->cpumask); 3105 local_group = cpu_isset(this_cpu, group->cpumask);
2958 3106
@@ -2961,10 +3109,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2961 3109
2962 /* Tally up the load of all CPUs in the group */ 3110 /* Tally up the load of all CPUs in the group */
2963 sum_weighted_load = sum_nr_running = avg_load = 0; 3111 sum_weighted_load = sum_nr_running = avg_load = 0;
3112 sum_avg_load_per_task = avg_load_per_task = 0;
3113
2964 max_cpu_load = 0; 3114 max_cpu_load = 0;
2965 min_cpu_load = ~0UL; 3115 min_cpu_load = ~0UL;
2966 3116
2967 for_each_cpu_mask(i, group->cpumask) { 3117 for_each_cpu_mask_nr(i, group->cpumask) {
2968 struct rq *rq; 3118 struct rq *rq;
2969 3119
2970 if (!cpu_isset(i, *cpus)) 3120 if (!cpu_isset(i, *cpus))
@@ -2994,6 +3144,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2994 avg_load += load; 3144 avg_load += load;
2995 sum_nr_running += rq->nr_running; 3145 sum_nr_running += rq->nr_running;
2996 sum_weighted_load += weighted_cpuload(i); 3146 sum_weighted_load += weighted_cpuload(i);
3147
3148 sum_avg_load_per_task += cpu_avg_load_per_task(i);
2997 } 3149 }
2998 3150
2999 /* 3151 /*
@@ -3015,7 +3167,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3015 avg_load = sg_div_cpu_power(group, 3167 avg_load = sg_div_cpu_power(group,
3016 avg_load * SCHED_LOAD_SCALE); 3168 avg_load * SCHED_LOAD_SCALE);
3017 3169
3018 if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE) 3170
3171 /*
3172 * Consider the group unbalanced when the imbalance is larger
3173 * than the average weight of two tasks.
3174 *
3175 * APZ: with cgroup the avg task weight can vary wildly and
3176 * might not be a suitable number - should we keep a
3177 * normalized nr_running number somewhere that negates
3178 * the hierarchy?
3179 */
3180 avg_load_per_task = sg_div_cpu_power(group,
3181 sum_avg_load_per_task * SCHED_LOAD_SCALE);
3182
3183 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3019 __group_imb = 1; 3184 __group_imb = 1;
3020 3185
3021 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 3186 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
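
Simplified numeric illustration of the new __group_imb test above (ignoring the sg_div_cpu_power() scaling): a group is flagged imbalanced when the spread between its busiest and idlest cpu exceeds the weight of two average tasks. All numbers are made up.

#include <stdio.h>

int main(void)
{
	unsigned long max_cpu_load = 4096, min_cpu_load = 1024;
	unsigned long avg_load_per_task = 1024;	/* e.g. nice-0 tasks */

	/* prints 1: 3072 > 2048, so the group is considered imbalanced */
	printf("group_imb = %d\n",
	       (max_cpu_load - min_cpu_load) > 2 * avg_load_per_task);
	return 0;
}
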
@@ -3156,9 +3321,9 @@ small_imbalance:
3156 if (busiest_load_per_task > this_load_per_task) 3321 if (busiest_load_per_task > this_load_per_task)
3157 imbn = 1; 3322 imbn = 1;
3158 } else 3323 } else
3159 this_load_per_task = SCHED_LOAD_SCALE; 3324 this_load_per_task = cpu_avg_load_per_task(this_cpu);
3160 3325
3161 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= 3326 if (max_load - this_load + 2*busiest_load_per_task >=
3162 busiest_load_per_task * imbn) { 3327 busiest_load_per_task * imbn) {
3163 *imbalance = busiest_load_per_task; 3328 *imbalance = busiest_load_per_task;
3164 return busiest; 3329 return busiest;
@@ -3228,7 +3393,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3228 unsigned long max_load = 0; 3393 unsigned long max_load = 0;
3229 int i; 3394 int i;
3230 3395
3231 for_each_cpu_mask(i, group->cpumask) { 3396 for_each_cpu_mask_nr(i, group->cpumask) {
3232 unsigned long wl; 3397 unsigned long wl;
3233 3398
3234 if (!cpu_isset(i, *cpus)) 3399 if (!cpu_isset(i, *cpus))
@@ -3284,6 +3449,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3284 schedstat_inc(sd, lb_count[idle]); 3449 schedstat_inc(sd, lb_count[idle]);
3285 3450
3286redo: 3451redo:
3452 update_shares(sd);
3287 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3453 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3288 cpus, balance); 3454 cpus, balance);
3289 3455
@@ -3386,8 +3552,9 @@ redo:
3386 3552
3387 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3553 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3388 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3554 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3389 return -1; 3555 ld_moved = -1;
3390 return ld_moved; 3556
3557 goto out;
3391 3558
3392out_balanced: 3559out_balanced:
3393 schedstat_inc(sd, lb_balanced[idle]); 3560 schedstat_inc(sd, lb_balanced[idle]);
@@ -3402,8 +3569,13 @@ out_one_pinned:
3402 3569
3403 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3570 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3404 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3571 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3405 return -1; 3572 ld_moved = -1;
3406 return 0; 3573 else
3574 ld_moved = 0;
3575out:
3576 if (ld_moved)
3577 update_shares(sd);
3578 return ld_moved;
3407} 3579}
3408 3580
3409/* 3581/*
@@ -3438,6 +3610,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3438 3610
3439 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); 3611 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
3440redo: 3612redo:
3613 update_shares_locked(this_rq, sd);
3441 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, 3614 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
3442 &sd_idle, cpus, NULL); 3615 &sd_idle, cpus, NULL);
3443 if (!group) { 3616 if (!group) {
@@ -3481,6 +3654,7 @@ redo:
3481 } else 3654 } else
3482 sd->nr_balance_failed = 0; 3655 sd->nr_balance_failed = 0;
3483 3656
3657 update_shares_locked(this_rq, sd);
3484 return ld_moved; 3658 return ld_moved;
3485 3659
3486out_balanced: 3660out_balanced:
@@ -3621,7 +3795,7 @@ int select_nohz_load_balancer(int stop_tick)
3621 /* 3795 /*
3622 * If we are going offline and still the leader, give up! 3796 * If we are going offline and still the leader, give up!
3623 */ 3797 */
3624 if (cpu_is_offline(cpu) && 3798 if (!cpu_active(cpu) &&
3625 atomic_read(&nohz.load_balancer) == cpu) { 3799 atomic_read(&nohz.load_balancer) == cpu) {
3626 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3800 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3627 BUG(); 3801 BUG();
@@ -3672,6 +3846,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3672 /* Earliest time when we have to do rebalance again */ 3846 /* Earliest time when we have to do rebalance again */
3673 unsigned long next_balance = jiffies + 60*HZ; 3847 unsigned long next_balance = jiffies + 60*HZ;
3674 int update_next_balance = 0; 3848 int update_next_balance = 0;
3849 int need_serialize;
3675 cpumask_t tmp; 3850 cpumask_t tmp;
3676 3851
3677 for_each_domain(cpu, sd) { 3852 for_each_domain(cpu, sd) {
@@ -3689,8 +3864,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3689 if (interval > HZ*NR_CPUS/10) 3864 if (interval > HZ*NR_CPUS/10)
3690 interval = HZ*NR_CPUS/10; 3865 interval = HZ*NR_CPUS/10;
3691 3866
3867 need_serialize = sd->flags & SD_SERIALIZE;
3692 3868
3693 if (sd->flags & SD_SERIALIZE) { 3869 if (need_serialize) {
3694 if (!spin_trylock(&balancing)) 3870 if (!spin_trylock(&balancing))
3695 goto out; 3871 goto out;
3696 } 3872 }
@@ -3706,7 +3882,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3706 } 3882 }
3707 sd->last_balance = jiffies; 3883 sd->last_balance = jiffies;
3708 } 3884 }
3709 if (sd->flags & SD_SERIALIZE) 3885 if (need_serialize)
3710 spin_unlock(&balancing); 3886 spin_unlock(&balancing);
3711out: 3887out:
3712 if (time_after(next_balance, sd->last_balance + interval)) { 3888 if (time_after(next_balance, sd->last_balance + interval)) {
@@ -3759,7 +3935,7 @@ static void run_rebalance_domains(struct softirq_action *h)
3759 int balance_cpu; 3935 int balance_cpu;
3760 3936
3761 cpu_clear(this_cpu, cpus); 3937 cpu_clear(this_cpu, cpus);
3762 for_each_cpu_mask(balance_cpu, cpus) { 3938 for_each_cpu_mask_nr(balance_cpu, cpus) {
3763 /* 3939 /*
3764 * If this cpu gets work to do, stop the load balancing 3940 * If this cpu gets work to do, stop the load balancing
3765 * work being done for other cpus. Next load 3941 * work being done for other cpus. Next load
@@ -3895,6 +4071,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
3895 cpustat->nice = cputime64_add(cpustat->nice, tmp); 4071 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3896 else 4072 else
3897 cpustat->user = cputime64_add(cpustat->user, tmp); 4073 cpustat->user = cputime64_add(cpustat->user, tmp);
4074 /* Account for user time used */
4075 acct_update_integrals(p);
3898} 4076}
3899 4077
3900/* 4078/*
@@ -4021,26 +4199,44 @@ void scheduler_tick(void)
4021#endif 4199#endif
4022} 4200}
4023 4201
4024#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) 4202#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4203 defined(CONFIG_PREEMPT_TRACER))
4204
4205static inline unsigned long get_parent_ip(unsigned long addr)
4206{
4207 if (in_lock_functions(addr)) {
4208 addr = CALLER_ADDR2;
4209 if (in_lock_functions(addr))
4210 addr = CALLER_ADDR3;
4211 }
4212 return addr;
4213}
4025 4214
4026void __kprobes add_preempt_count(int val) 4215void __kprobes add_preempt_count(int val)
4027{ 4216{
4217#ifdef CONFIG_DEBUG_PREEMPT
4028 /* 4218 /*
4029 * Underflow? 4219 * Underflow?
4030 */ 4220 */
4031 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 4221 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4032 return; 4222 return;
4223#endif
4033 preempt_count() += val; 4224 preempt_count() += val;
4225#ifdef CONFIG_DEBUG_PREEMPT
4034 /* 4226 /*
4035 * Spinlock count overflowing soon? 4227 * Spinlock count overflowing soon?
4036 */ 4228 */
4037 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 4229 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4038 PREEMPT_MASK - 10); 4230 PREEMPT_MASK - 10);
4231#endif
4232 if (preempt_count() == val)
4233 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4039} 4234}
4040EXPORT_SYMBOL(add_preempt_count); 4235EXPORT_SYMBOL(add_preempt_count);
4041 4236
4042void __kprobes sub_preempt_count(int val) 4237void __kprobes sub_preempt_count(int val)
4043{ 4238{
4239#ifdef CONFIG_DEBUG_PREEMPT
4044 /* 4240 /*
4045 * Underflow? 4241 * Underflow?
4046 */ 4242 */
@@ -4052,7 +4248,10 @@ void __kprobes sub_preempt_count(int val)
4052 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 4248 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4053 !(preempt_count() & PREEMPT_MASK))) 4249 !(preempt_count() & PREEMPT_MASK)))
4054 return; 4250 return;
4251#endif
4055 4252
4253 if (preempt_count() == val)
4254 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4056 preempt_count() -= val; 4255 preempt_count() -= val;
4057} 4256}
4058EXPORT_SYMBOL(sub_preempt_count); 4257EXPORT_SYMBOL(sub_preempt_count);
@@ -4070,6 +4269,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
4070 prev->comm, prev->pid, preempt_count()); 4269 prev->comm, prev->pid, preempt_count());
4071 4270
4072 debug_show_held_locks(prev); 4271 debug_show_held_locks(prev);
4272 print_modules();
4073 if (irqs_disabled()) 4273 if (irqs_disabled())
4074 print_irqtrace_events(prev); 4274 print_irqtrace_events(prev);
4075 4275
@@ -4158,7 +4358,8 @@ need_resched_nonpreemptible:
4158 4358
4159 schedule_debug(prev); 4359 schedule_debug(prev);
4160 4360
4161 hrtick_clear(rq); 4361 if (sched_feat(HRTICK))
4362 hrtick_clear(rq);
4162 4363
4163 /* 4364 /*
4164 * Do the rq-clock update outside the rq lock: 4365 * Do the rq-clock update outside the rq lock:
@@ -4204,8 +4405,6 @@ need_resched_nonpreemptible:
4204 } else 4405 } else
4205 spin_unlock_irq(&rq->lock); 4406 spin_unlock_irq(&rq->lock);
4206 4407
4207 hrtick_set(rq);
4208
4209 if (unlikely(reacquire_kernel_lock(current) < 0)) 4408 if (unlikely(reacquire_kernel_lock(current) < 0))
4210 goto need_resched_nonpreemptible; 4409 goto need_resched_nonpreemptible;
4211 4410
@@ -4586,10 +4785,8 @@ void set_user_nice(struct task_struct *p, long nice)
4586 goto out_unlock; 4785 goto out_unlock;
4587 } 4786 }
4588 on_rq = p->se.on_rq; 4787 on_rq = p->se.on_rq;
4589 if (on_rq) { 4788 if (on_rq)
4590 dequeue_task(rq, p, 0); 4789 dequeue_task(rq, p, 0);
4591 dec_load(rq, p);
4592 }
4593 4790
4594 p->static_prio = NICE_TO_PRIO(nice); 4791 p->static_prio = NICE_TO_PRIO(nice);
4595 set_load_weight(p); 4792 set_load_weight(p);
@@ -4599,7 +4796,6 @@ void set_user_nice(struct task_struct *p, long nice)
4599 4796
4600 if (on_rq) { 4797 if (on_rq) {
4601 enqueue_task(rq, p, 0); 4798 enqueue_task(rq, p, 0);
4602 inc_load(rq, p);
4603 /* 4799 /*
4604 * If the task increased its priority or is running and 4800 * If the task increased its priority or is running and
4605 * lowered its priority, then reschedule its CPU: 4801 * lowered its priority, then reschedule its CPU:
@@ -4744,16 +4940,8 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4744 set_load_weight(p); 4940 set_load_weight(p);
4745} 4941}
4746 4942
4747/** 4943static int __sched_setscheduler(struct task_struct *p, int policy,
4748 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 4944 struct sched_param *param, bool user)
4749 * @p: the task in question.
4750 * @policy: new policy.
4751 * @param: structure containing the new RT priority.
4752 *
4753 * NOTE that the task may be already dead.
4754 */
4755int sched_setscheduler(struct task_struct *p, int policy,
4756 struct sched_param *param)
4757{ 4945{
4758 int retval, oldprio, oldpolicy = -1, on_rq, running; 4946 int retval, oldprio, oldpolicy = -1, on_rq, running;
4759 unsigned long flags; 4947 unsigned long flags;
@@ -4785,7 +4973,7 @@ recheck:
4785 /* 4973 /*
4786 * Allow unprivileged RT tasks to decrease priority: 4974 * Allow unprivileged RT tasks to decrease priority:
4787 */ 4975 */
4788 if (!capable(CAP_SYS_NICE)) { 4976 if (user && !capable(CAP_SYS_NICE)) {
4789 if (rt_policy(policy)) { 4977 if (rt_policy(policy)) {
4790 unsigned long rlim_rtprio; 4978 unsigned long rlim_rtprio;
4791 4979
@@ -4821,7 +5009,8 @@ recheck:
4821 * Do not allow realtime tasks into groups that have no runtime 5009 * Do not allow realtime tasks into groups that have no runtime
4822 * assigned. 5010 * assigned.
4823 */ 5011 */
4824 if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) 5012 if (user
5013 && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
4825 return -EPERM; 5014 return -EPERM;
4826#endif 5015#endif
4827 5016
@@ -4870,8 +5059,39 @@ recheck:
4870 5059
4871 return 0; 5060 return 0;
4872} 5061}
5062
5063/**
5064 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
5065 * @p: the task in question.
5066 * @policy: new policy.
5067 * @param: structure containing the new RT priority.
5068 *
5069 * NOTE that the task may be already dead.
5070 */
5071int sched_setscheduler(struct task_struct *p, int policy,
5072 struct sched_param *param)
5073{
5074 return __sched_setscheduler(p, policy, param, true);
5075}
4873EXPORT_SYMBOL_GPL(sched_setscheduler); 5076EXPORT_SYMBOL_GPL(sched_setscheduler);
4874 5077
5078/**
5079 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
5080 * @p: the task in question.
5081 * @policy: new policy.
5082 * @param: structure containing the new RT priority.
5083 *
5084 * Just like sched_setscheduler, only don't bother checking if the
5085 * current context has permission. For example, this is needed in
5086 * stop_machine(): we create temporary high priority worker threads,
5087 * but our caller might not have that capability.
5088 */
5089int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5090 struct sched_param *param)
5091{
5092 return __sched_setscheduler(p, policy, param, false);
5093}
5094
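/*
 * A minimal usage sketch for sched_setscheduler_nocheck(), in the spirit
 * of the stop_machine() example in the comment above: kernel code raises
 * one of its own worker threads to SCHED_FIFO without any CAP_SYS_NICE
 * check on the caller.  The helper name, thread name and priority below
 * are illustrative assumptions, not part of this patch.
 */
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int start_rt_worker(int (*worker_fn)(void *), void *data, int cpu)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	struct task_struct *p;

	p = kthread_create(worker_fn, data, "rt_worker/%d", cpu);
	if (IS_ERR(p))
		return PTR_ERR(p);

	kthread_bind(p, cpu);
	/* deliberately skip the permission checks done for user requests */
	sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
	wake_up_process(p);

	return 0;
}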
4875static int 5095static int
4876do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 5096do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4877{ 5097{
@@ -5070,24 +5290,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
5070 return sched_setaffinity(pid, &new_mask); 5290 return sched_setaffinity(pid, &new_mask);
5071} 5291}
5072 5292
5073/*
5074 * Represents all cpu's present in the system
5075 * In systems capable of hotplug, this map could dynamically grow
5076 * as new cpu's are detected in the system via any platform specific
5077 * method, such as ACPI for e.g.
5078 */
5079
5080cpumask_t cpu_present_map __read_mostly;
5081EXPORT_SYMBOL(cpu_present_map);
5082
5083#ifndef CONFIG_SMP
5084cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
5085EXPORT_SYMBOL(cpu_online_map);
5086
5087cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
5088EXPORT_SYMBOL(cpu_possible_map);
5089#endif
5090
5091long sched_getaffinity(pid_t pid, cpumask_t *mask) 5293long sched_getaffinity(pid_t pid, cpumask_t *mask)
5092{ 5294{
5093 struct task_struct *p; 5295 struct task_struct *p;
@@ -5384,7 +5586,7 @@ out_unlock:
5384 return retval; 5586 return retval;
5385} 5587}
5386 5588
5387static const char stat_nam[] = "RSDTtZX"; 5589static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5388 5590
5389void sched_show_task(struct task_struct *p) 5591void sched_show_task(struct task_struct *p)
5390{ 5592{
@@ -5571,6 +5773,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5571 goto out; 5773 goto out;
5572 } 5774 }
5573 5775
5776 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5777 !cpus_equal(p->cpus_allowed, *new_mask))) {
5778 ret = -EINVAL;
5779 goto out;
5780 }
5781
5574 if (p->sched_class->set_cpus_allowed) 5782 if (p->sched_class->set_cpus_allowed)
5575 p->sched_class->set_cpus_allowed(p, new_mask); 5783 p->sched_class->set_cpus_allowed(p, new_mask);
5576 else { 5784 else {
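/*
 * Sketch of what the new PF_THREAD_BOUND check above protects; the helper
 * below and the way the flag is set are illustrative assumptions.  A
 * kernel thread that has been bound to one CPU and marked PF_THREAD_BOUND
 * can no longer have its affinity widened by anyone but itself - such
 * requests now fail with -EINVAL.
 */
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static struct task_struct *create_bound_kthread(int (*fn)(void *),
						void *data, int cpu)
{
	struct task_struct *p = kthread_create(fn, data, "bound/%d", cpu);

	if (!IS_ERR(p)) {
		kthread_bind(p, cpu);		/* cpus_allowed = { cpu } */
		p->flags |= PF_THREAD_BOUND;	/* binding is permanent */
	}
	return p;
}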
@@ -5613,7 +5821,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5613 struct rq *rq_dest, *rq_src; 5821 struct rq *rq_dest, *rq_src;
5614 int ret = 0, on_rq; 5822 int ret = 0, on_rq;
5615 5823
5616 if (unlikely(cpu_is_offline(dest_cpu))) 5824 if (unlikely(!cpu_active(dest_cpu)))
5617 return ret; 5825 return ret;
5618 5826
5619 rq_src = cpu_rq(src_cpu); 5827 rq_src = cpu_rq(src_cpu);
@@ -6060,6 +6268,36 @@ static void unregister_sched_domain_sysctl(void)
6060} 6268}
6061#endif 6269#endif
6062 6270
6271static void set_rq_online(struct rq *rq)
6272{
6273 if (!rq->online) {
6274 const struct sched_class *class;
6275
6276 cpu_set(rq->cpu, rq->rd->online);
6277 rq->online = 1;
6278
6279 for_each_class(class) {
6280 if (class->rq_online)
6281 class->rq_online(rq);
6282 }
6283 }
6284}
6285
6286static void set_rq_offline(struct rq *rq)
6287{
6288 if (rq->online) {
6289 const struct sched_class *class;
6290
6291 for_each_class(class) {
6292 if (class->rq_offline)
6293 class->rq_offline(rq);
6294 }
6295
6296 cpu_clear(rq->cpu, rq->rd->online);
6297 rq->online = 0;
6298 }
6299}
6300
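/*
 * The two helpers above give every scheduling class a pair of per-runqueue
 * callbacks that replace the old ->join_domain()/->leave_domain() hooks
 * removed later in this patch.  Sketch, within this file, of how a class
 * would use them; the bodies and the class name are illustrative
 * assumptions, not the real rt/fair implementations.
 */
static void example_rq_online(struct rq *rq)
{
	/* e.g. advertise rq->cpu in root-domain wide bookkeeping */
}

static void example_rq_offline(struct rq *rq)
{
	/* e.g. withdraw rq->cpu from root-domain wide bookkeeping */
}

static const struct sched_class example_sched_class = {
	.rq_online	= example_rq_online,
	.rq_offline	= example_rq_offline,
	/* remaining methods omitted from this sketch */
};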
6063/* 6301/*
6064 * migration_call - callback that gets triggered when a CPU is added. 6302 * migration_call - callback that gets triggered when a CPU is added.
6065 * Here we can start up the necessary migration thread for the new CPU. 6303 * Here we can start up the necessary migration thread for the new CPU.
@@ -6097,7 +6335,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6097 spin_lock_irqsave(&rq->lock, flags); 6335 spin_lock_irqsave(&rq->lock, flags);
6098 if (rq->rd) { 6336 if (rq->rd) {
6099 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6337 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6100 cpu_set(cpu, rq->rd->online); 6338
6339 set_rq_online(rq);
6101 } 6340 }
6102 spin_unlock_irqrestore(&rq->lock, flags); 6341 spin_unlock_irqrestore(&rq->lock, flags);
6103 break; 6342 break;
@@ -6158,7 +6397,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6158 spin_lock_irqsave(&rq->lock, flags); 6397 spin_lock_irqsave(&rq->lock, flags);
6159 if (rq->rd) { 6398 if (rq->rd) {
6160 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6399 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6161 cpu_clear(cpu, rq->rd->online); 6400 set_rq_offline(rq);
6162 } 6401 }
6163 spin_unlock_irqrestore(&rq->lock, flags); 6402 spin_unlock_irqrestore(&rq->lock, flags);
6164 break; 6403 break;
@@ -6175,7 +6414,7 @@ static struct notifier_block __cpuinitdata migration_notifier = {
6175 .priority = 10 6414 .priority = 10
6176}; 6415};
6177 6416
6178void __init migration_init(void) 6417static int __init migration_init(void)
6179{ 6418{
6180 void *cpu = (void *)(long)smp_processor_id(); 6419 void *cpu = (void *)(long)smp_processor_id();
6181 int err; 6420 int err;
@@ -6185,13 +6424,38 @@ void __init migration_init(void)
6185 BUG_ON(err == NOTIFY_BAD); 6424 BUG_ON(err == NOTIFY_BAD);
6186 migration_call(&migration_notifier, CPU_ONLINE, cpu); 6425 migration_call(&migration_notifier, CPU_ONLINE, cpu);
6187 register_cpu_notifier(&migration_notifier); 6426 register_cpu_notifier(&migration_notifier);
6427
6428 return err;
6188} 6429}
6430early_initcall(migration_init);
6189#endif 6431#endif
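/*
 * migration_init() is now registered via early_initcall() (see the hunk
 * above) instead of being called explicitly by code outside this file,
 * which is why it becomes static and returns an int status.  Sketch of
 * the general pattern, with an illustrative initcall name:
 */
static int __init example_early_init(void)
{
	return 0;	/* nonzero is reported as an initcall failure */
}
early_initcall(example_early_init);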
6190 6432
6191#ifdef CONFIG_SMP 6433#ifdef CONFIG_SMP
6192 6434
6193#ifdef CONFIG_SCHED_DEBUG 6435#ifdef CONFIG_SCHED_DEBUG
6194 6436
6437static inline const char *sd_level_to_string(enum sched_domain_level lvl)
6438{
6439 switch (lvl) {
6440 case SD_LV_NONE:
6441 return "NONE";
6442 case SD_LV_SIBLING:
6443 return "SIBLING";
6444 case SD_LV_MC:
6445 return "MC";
6446 case SD_LV_CPU:
6447 return "CPU";
6448 case SD_LV_NODE:
6449 return "NODE";
6450 case SD_LV_ALLNODES:
6451 return "ALLNODES";
6452 case SD_LV_MAX:
6453 return "MAX";
6454
6455 }
6456 return "MAX";
6457}
6458
6195static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6459static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6196 cpumask_t *groupmask) 6460 cpumask_t *groupmask)
6197{ 6461{
@@ -6211,7 +6475,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6211 return -1; 6475 return -1;
6212 } 6476 }
6213 6477
6214 printk(KERN_CONT "span %s\n", str); 6478 printk(KERN_CONT "span %s level %s\n",
6479 str, sd_level_to_string(sd->level));
6215 6480
6216 if (!cpu_isset(cpu, sd->span)) { 6481 if (!cpu_isset(cpu, sd->span)) {
6217 printk(KERN_ERR "ERROR: domain->span does not contain " 6482 printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6295,9 +6560,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6295 } 6560 }
6296 kfree(groupmask); 6561 kfree(groupmask);
6297} 6562}
6298#else 6563#else /* !CONFIG_SCHED_DEBUG */
6299# define sched_domain_debug(sd, cpu) do { } while (0) 6564# define sched_domain_debug(sd, cpu) do { } while (0)
6300#endif 6565#endif /* CONFIG_SCHED_DEBUG */
6301 6566
6302static int sd_degenerate(struct sched_domain *sd) 6567static int sd_degenerate(struct sched_domain *sd)
6303{ 6568{
@@ -6357,20 +6622,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6357static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6622static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6358{ 6623{
6359 unsigned long flags; 6624 unsigned long flags;
6360 const struct sched_class *class;
6361 6625
6362 spin_lock_irqsave(&rq->lock, flags); 6626 spin_lock_irqsave(&rq->lock, flags);
6363 6627
6364 if (rq->rd) { 6628 if (rq->rd) {
6365 struct root_domain *old_rd = rq->rd; 6629 struct root_domain *old_rd = rq->rd;
6366 6630
6367 for (class = sched_class_highest; class; class = class->next) { 6631 if (cpu_isset(rq->cpu, old_rd->online))
6368 if (class->leave_domain) 6632 set_rq_offline(rq);
6369 class->leave_domain(rq);
6370 }
6371 6633
6372 cpu_clear(rq->cpu, old_rd->span); 6634 cpu_clear(rq->cpu, old_rd->span);
6373 cpu_clear(rq->cpu, old_rd->online);
6374 6635
6375 if (atomic_dec_and_test(&old_rd->refcount)) 6636 if (atomic_dec_and_test(&old_rd->refcount))
6376 kfree(old_rd); 6637 kfree(old_rd);
@@ -6381,12 +6642,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6381 6642
6382 cpu_set(rq->cpu, rd->span); 6643 cpu_set(rq->cpu, rd->span);
6383 if (cpu_isset(rq->cpu, cpu_online_map)) 6644 if (cpu_isset(rq->cpu, cpu_online_map))
6384 cpu_set(rq->cpu, rd->online); 6645 set_rq_online(rq);
6385
6386 for (class = sched_class_highest; class; class = class->next) {
6387 if (class->join_domain)
6388 class->join_domain(rq);
6389 }
6390 6646
6391 spin_unlock_irqrestore(&rq->lock, flags); 6647 spin_unlock_irqrestore(&rq->lock, flags);
6392} 6648}
@@ -6397,6 +6653,8 @@ static void init_rootdomain(struct root_domain *rd)
6397 6653
6398 cpus_clear(rd->span); 6654 cpus_clear(rd->span);
6399 cpus_clear(rd->online); 6655 cpus_clear(rd->online);
6656
6657 cpupri_init(&rd->cpupri);
6400} 6658}
6401 6659
6402static void init_defrootdomain(void) 6660static void init_defrootdomain(void)
@@ -6458,7 +6716,8 @@ static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
6458/* Setup the mask of cpus configured for isolated domains */ 6716/* Setup the mask of cpus configured for isolated domains */
6459static int __init isolated_cpu_setup(char *str) 6717static int __init isolated_cpu_setup(char *str)
6460{ 6718{
6461 int ints[NR_CPUS], i; 6719 static int __initdata ints[NR_CPUS];
6720 int i;
6462 6721
6463 str = get_options(str, ARRAY_SIZE(ints), ints); 6722 str = get_options(str, ARRAY_SIZE(ints), ints);
6464 cpus_clear(cpu_isolated_map); 6723 cpus_clear(cpu_isolated_map);
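/*
 * The change above keeps the NR_CPUS-sized option array off the kernel
 * stack: with large NR_CPUS an automatic "int ints[NR_CPUS]" can overflow
 * the few-kilobyte kernel stack, while a static __initdata array lives in
 * init memory that is discarded after boot.  Sketch of the same pattern,
 * within this file, for a made-up early parameter ("foo" is illustrative):
 */
static int __init foo_setup(char *str)
{
	static int __initdata opts[NR_CPUS];	/* not on the stack */

	str = get_options(str, ARRAY_SIZE(opts), opts);
	return 1;
}
__setup("foo=", foo_setup);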
@@ -6492,7 +6751,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
6492 6751
6493 cpus_clear(*covered); 6752 cpus_clear(*covered);
6494 6753
6495 for_each_cpu_mask(i, *span) { 6754 for_each_cpu_mask_nr(i, *span) {
6496 struct sched_group *sg; 6755 struct sched_group *sg;
6497 int group = group_fn(i, cpu_map, &sg, tmpmask); 6756 int group = group_fn(i, cpu_map, &sg, tmpmask);
6498 int j; 6757 int j;
@@ -6503,7 +6762,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
6503 cpus_clear(sg->cpumask); 6762 cpus_clear(sg->cpumask);
6504 sg->__cpu_power = 0; 6763 sg->__cpu_power = 0;
6505 6764
6506 for_each_cpu_mask(j, *span) { 6765 for_each_cpu_mask_nr(j, *span) {
6507 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 6766 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6508 continue; 6767 continue;
6509 6768
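/*
 * The for_each_cpu_mask() -> for_each_cpu_mask_nr() conversions in this
 * patch (and, analogously, MAX_NUMNODES -> nr_node_ids further down)
 * bound the iteration by the number of ids the running system can
 * actually have rather than the compile-time maximum.  Minimal sketch of
 * the pattern, within this file; the printing loop is only illustrative.
 */
static void print_span_cpus(const cpumask_t *span)
{
	int i;

	/* visits set bits below nr_cpu_ids only, not all NR_CPUS bits */
	for_each_cpu_mask_nr(i, *span)
		printk(KERN_CONT " %d", i);
	printk(KERN_CONT "\n");
}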
@@ -6539,9 +6798,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
6539 6798
6540 min_val = INT_MAX; 6799 min_val = INT_MAX;
6541 6800
6542 for (i = 0; i < MAX_NUMNODES; i++) { 6801 for (i = 0; i < nr_node_ids; i++) {
6543 /* Start at @node */ 6802 /* Start at @node */
6544 n = (node + i) % MAX_NUMNODES; 6803 n = (node + i) % nr_node_ids;
6545 6804
6546 if (!nr_cpus_node(n)) 6805 if (!nr_cpus_node(n))
6547 continue; 6806 continue;
@@ -6591,7 +6850,7 @@ static void sched_domain_node_span(int node, cpumask_t *span)
6591 cpus_or(*span, *span, *nodemask); 6850 cpus_or(*span, *span, *nodemask);
6592 } 6851 }
6593} 6852}
6594#endif 6853#endif /* CONFIG_NUMA */
6595 6854
6596int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6855int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6597 6856
@@ -6610,7 +6869,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6610 *sg = &per_cpu(sched_group_cpus, cpu); 6869 *sg = &per_cpu(sched_group_cpus, cpu);
6611 return cpu; 6870 return cpu;
6612} 6871}
6613#endif 6872#endif /* CONFIG_SCHED_SMT */
6614 6873
6615/* 6874/*
6616 * multi-core sched-domains: 6875 * multi-core sched-domains:
@@ -6618,7 +6877,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6618#ifdef CONFIG_SCHED_MC 6877#ifdef CONFIG_SCHED_MC
6619static DEFINE_PER_CPU(struct sched_domain, core_domains); 6878static DEFINE_PER_CPU(struct sched_domain, core_domains);
6620static DEFINE_PER_CPU(struct sched_group, sched_group_core); 6879static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6621#endif 6880#endif /* CONFIG_SCHED_MC */
6622 6881
6623#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6882#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6624static int 6883static int
@@ -6703,7 +6962,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
6703 if (!sg) 6962 if (!sg)
6704 return; 6963 return;
6705 do { 6964 do {
6706 for_each_cpu_mask(j, sg->cpumask) { 6965 for_each_cpu_mask_nr(j, sg->cpumask) {
6707 struct sched_domain *sd; 6966 struct sched_domain *sd;
6708 6967
6709 sd = &per_cpu(phys_domains, j); 6968 sd = &per_cpu(phys_domains, j);
@@ -6720,7 +6979,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
6720 sg = sg->next; 6979 sg = sg->next;
6721 } while (sg != group_head); 6980 } while (sg != group_head);
6722} 6981}
6723#endif 6982#endif /* CONFIG_NUMA */
6724 6983
6725#ifdef CONFIG_NUMA 6984#ifdef CONFIG_NUMA
6726/* Free memory allocated for various sched_group structures */ 6985/* Free memory allocated for various sched_group structures */
@@ -6728,14 +6987,14 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6728{ 6987{
6729 int cpu, i; 6988 int cpu, i;
6730 6989
6731 for_each_cpu_mask(cpu, *cpu_map) { 6990 for_each_cpu_mask_nr(cpu, *cpu_map) {
6732 struct sched_group **sched_group_nodes 6991 struct sched_group **sched_group_nodes
6733 = sched_group_nodes_bycpu[cpu]; 6992 = sched_group_nodes_bycpu[cpu];
6734 6993
6735 if (!sched_group_nodes) 6994 if (!sched_group_nodes)
6736 continue; 6995 continue;
6737 6996
6738 for (i = 0; i < MAX_NUMNODES; i++) { 6997 for (i = 0; i < nr_node_ids; i++) {
6739 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 6998 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6740 6999
6741 *nodemask = node_to_cpumask(i); 7000 *nodemask = node_to_cpumask(i);
@@ -6757,11 +7016,11 @@ next_sg:
6757 sched_group_nodes_bycpu[cpu] = NULL; 7016 sched_group_nodes_bycpu[cpu] = NULL;
6758 } 7017 }
6759} 7018}
6760#else 7019#else /* !CONFIG_NUMA */
6761static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 7020static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6762{ 7021{
6763} 7022}
6764#endif 7023#endif /* CONFIG_NUMA */
6765 7024
6766/* 7025/*
6767 * Initialize sched groups cpu_power. 7026 * Initialize sched groups cpu_power.
@@ -6928,7 +7187,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
6928 /* 7187 /*
6929 * Allocate the per-node list of sched groups 7188 * Allocate the per-node list of sched groups
6930 */ 7189 */
6931 sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), 7190 sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
6932 GFP_KERNEL); 7191 GFP_KERNEL);
6933 if (!sched_group_nodes) { 7192 if (!sched_group_nodes) {
6934 printk(KERN_WARNING "Can not alloc sched group node list\n"); 7193 printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -6967,7 +7226,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
6967 /* 7226 /*
6968 * Set up domains for cpus specified by the cpu_map. 7227 * Set up domains for cpus specified by the cpu_map.
6969 */ 7228 */
6970 for_each_cpu_mask(i, *cpu_map) { 7229 for_each_cpu_mask_nr(i, *cpu_map) {
6971 struct sched_domain *sd = NULL, *p; 7230 struct sched_domain *sd = NULL, *p;
6972 SCHED_CPUMASK_VAR(nodemask, allmasks); 7231 SCHED_CPUMASK_VAR(nodemask, allmasks);
6973 7232
@@ -7034,7 +7293,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7034 7293
7035#ifdef CONFIG_SCHED_SMT 7294#ifdef CONFIG_SCHED_SMT
7036 /* Set up CPU (sibling) groups */ 7295 /* Set up CPU (sibling) groups */
7037 for_each_cpu_mask(i, *cpu_map) { 7296 for_each_cpu_mask_nr(i, *cpu_map) {
7038 SCHED_CPUMASK_VAR(this_sibling_map, allmasks); 7297 SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
7039 SCHED_CPUMASK_VAR(send_covered, allmasks); 7298 SCHED_CPUMASK_VAR(send_covered, allmasks);
7040 7299
@@ -7051,7 +7310,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7051 7310
7052#ifdef CONFIG_SCHED_MC 7311#ifdef CONFIG_SCHED_MC
7053 /* Set up multi-core groups */ 7312 /* Set up multi-core groups */
7054 for_each_cpu_mask(i, *cpu_map) { 7313 for_each_cpu_mask_nr(i, *cpu_map) {
7055 SCHED_CPUMASK_VAR(this_core_map, allmasks); 7314 SCHED_CPUMASK_VAR(this_core_map, allmasks);
7056 SCHED_CPUMASK_VAR(send_covered, allmasks); 7315 SCHED_CPUMASK_VAR(send_covered, allmasks);
7057 7316
@@ -7067,7 +7326,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7067#endif 7326#endif
7068 7327
7069 /* Set up physical groups */ 7328 /* Set up physical groups */
7070 for (i = 0; i < MAX_NUMNODES; i++) { 7329 for (i = 0; i < nr_node_ids; i++) {
7071 SCHED_CPUMASK_VAR(nodemask, allmasks); 7330 SCHED_CPUMASK_VAR(nodemask, allmasks);
7072 SCHED_CPUMASK_VAR(send_covered, allmasks); 7331 SCHED_CPUMASK_VAR(send_covered, allmasks);
7073 7332
@@ -7091,7 +7350,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7091 send_covered, tmpmask); 7350 send_covered, tmpmask);
7092 } 7351 }
7093 7352
7094 for (i = 0; i < MAX_NUMNODES; i++) { 7353 for (i = 0; i < nr_node_ids; i++) {
7095 /* Set up node groups */ 7354 /* Set up node groups */
7096 struct sched_group *sg, *prev; 7355 struct sched_group *sg, *prev;
7097 SCHED_CPUMASK_VAR(nodemask, allmasks); 7356 SCHED_CPUMASK_VAR(nodemask, allmasks);
@@ -7118,7 +7377,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7118 goto error; 7377 goto error;
7119 } 7378 }
7120 sched_group_nodes[i] = sg; 7379 sched_group_nodes[i] = sg;
7121 for_each_cpu_mask(j, *nodemask) { 7380 for_each_cpu_mask_nr(j, *nodemask) {
7122 struct sched_domain *sd; 7381 struct sched_domain *sd;
7123 7382
7124 sd = &per_cpu(node_domains, j); 7383 sd = &per_cpu(node_domains, j);
@@ -7130,9 +7389,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7130 cpus_or(*covered, *covered, *nodemask); 7389 cpus_or(*covered, *covered, *nodemask);
7131 prev = sg; 7390 prev = sg;
7132 7391
7133 for (j = 0; j < MAX_NUMNODES; j++) { 7392 for (j = 0; j < nr_node_ids; j++) {
7134 SCHED_CPUMASK_VAR(notcovered, allmasks); 7393 SCHED_CPUMASK_VAR(notcovered, allmasks);
7135 int n = (i + j) % MAX_NUMNODES; 7394 int n = (i + j) % nr_node_ids;
7136 node_to_cpumask_ptr(pnodemask, n); 7395 node_to_cpumask_ptr(pnodemask, n);
7137 7396
7138 cpus_complement(*notcovered, *covered); 7397 cpus_complement(*notcovered, *covered);
@@ -7164,28 +7423,28 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7164 7423
7165 /* Calculate CPU power for physical packages and nodes */ 7424 /* Calculate CPU power for physical packages and nodes */
7166#ifdef CONFIG_SCHED_SMT 7425#ifdef CONFIG_SCHED_SMT
7167 for_each_cpu_mask(i, *cpu_map) { 7426 for_each_cpu_mask_nr(i, *cpu_map) {
7168 struct sched_domain *sd = &per_cpu(cpu_domains, i); 7427 struct sched_domain *sd = &per_cpu(cpu_domains, i);
7169 7428
7170 init_sched_groups_power(i, sd); 7429 init_sched_groups_power(i, sd);
7171 } 7430 }
7172#endif 7431#endif
7173#ifdef CONFIG_SCHED_MC 7432#ifdef CONFIG_SCHED_MC
7174 for_each_cpu_mask(i, *cpu_map) { 7433 for_each_cpu_mask_nr(i, *cpu_map) {
7175 struct sched_domain *sd = &per_cpu(core_domains, i); 7434 struct sched_domain *sd = &per_cpu(core_domains, i);
7176 7435
7177 init_sched_groups_power(i, sd); 7436 init_sched_groups_power(i, sd);
7178 } 7437 }
7179#endif 7438#endif
7180 7439
7181 for_each_cpu_mask(i, *cpu_map) { 7440 for_each_cpu_mask_nr(i, *cpu_map) {
7182 struct sched_domain *sd = &per_cpu(phys_domains, i); 7441 struct sched_domain *sd = &per_cpu(phys_domains, i);
7183 7442
7184 init_sched_groups_power(i, sd); 7443 init_sched_groups_power(i, sd);
7185 } 7444 }
7186 7445
7187#ifdef CONFIG_NUMA 7446#ifdef CONFIG_NUMA
7188 for (i = 0; i < MAX_NUMNODES; i++) 7447 for (i = 0; i < nr_node_ids; i++)
7189 init_numa_sched_groups_power(sched_group_nodes[i]); 7448 init_numa_sched_groups_power(sched_group_nodes[i]);
7190 7449
7191 if (sd_allnodes) { 7450 if (sd_allnodes) {
@@ -7198,7 +7457,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7198#endif 7457#endif
7199 7458
7200 /* Attach the domains */ 7459 /* Attach the domains */
7201 for_each_cpu_mask(i, *cpu_map) { 7460 for_each_cpu_mask_nr(i, *cpu_map) {
7202 struct sched_domain *sd; 7461 struct sched_domain *sd;
7203#ifdef CONFIG_SCHED_SMT 7462#ifdef CONFIG_SCHED_SMT
7204 sd = &per_cpu(cpu_domains, i); 7463 sd = &per_cpu(cpu_domains, i);
@@ -7243,18 +7502,6 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
7243} 7502}
7244 7503
7245/* 7504/*
7246 * Free current domain masks.
7247 * Called after all cpus are attached to NULL domain.
7248 */
7249static void free_sched_domains(void)
7250{
7251 ndoms_cur = 0;
7252 if (doms_cur != &fallback_doms)
7253 kfree(doms_cur);
7254 doms_cur = &fallback_doms;
7255}
7256
7257/*
7258 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 7505 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
7259 * For now this just excludes isolated cpus, but could be used to 7506 * For now this just excludes isolated cpus, but could be used to
7260 * exclude other special cases in the future. 7507 * exclude other special cases in the future.
@@ -7293,7 +7540,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
7293 7540
7294 unregister_sched_domain_sysctl(); 7541 unregister_sched_domain_sysctl();
7295 7542
7296 for_each_cpu_mask(i, *cpu_map) 7543 for_each_cpu_mask_nr(i, *cpu_map)
7297 cpu_attach_domain(NULL, &def_root_domain, i); 7544 cpu_attach_domain(NULL, &def_root_domain, i);
7298 synchronize_sched(); 7545 synchronize_sched();
7299 arch_destroy_sched_domains(cpu_map, &tmpmask); 7546 arch_destroy_sched_domains(cpu_map, &tmpmask);
@@ -7332,7 +7579,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7332 * ownership of it and will kfree it when done with it. If the caller 7579 * ownership of it and will kfree it when done with it. If the caller
7333 * failed the kmalloc call, then it can pass in doms_new == NULL, 7580 * failed the kmalloc call, then it can pass in doms_new == NULL,
7334 * and partition_sched_domains() will fallback to the single partition 7581 * and partition_sched_domains() will fallback to the single partition
7335 * 'fallback_doms'. 7582 * 'fallback_doms'; it also forces the domains to be rebuilt.
7336 * 7583 *
7337 * Call with hotplug lock held 7584 * Call with hotplug lock held
7338 */ 7585 */
@@ -7346,12 +7593,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7346 /* always unregister in case we don't destroy any domains */ 7593 /* always unregister in case we don't destroy any domains */
7347 unregister_sched_domain_sysctl(); 7594 unregister_sched_domain_sysctl();
7348 7595
7349 if (doms_new == NULL) { 7596 if (doms_new == NULL)
7350 ndoms_new = 1; 7597 ndoms_new = 0;
7351 doms_new = &fallback_doms;
7352 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7353 dattr_new = NULL;
7354 }
7355 7598
7356 /* Destroy deleted domains */ 7599 /* Destroy deleted domains */
7357 for (i = 0; i < ndoms_cur; i++) { 7600 for (i = 0; i < ndoms_cur; i++) {
@@ -7366,6 +7609,14 @@ match1:
7366 ; 7609 ;
7367 } 7610 }
7368 7611
7612 if (doms_new == NULL) {
7613 ndoms_cur = 0;
7614 ndoms_new = 1;
7615 doms_new = &fallback_doms;
7616 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7617 dattr_new = NULL;
7618 }
7619
7369 /* Build new domains */ 7620 /* Build new domains */
7370 for (i = 0; i < ndoms_new; i++) { 7621 for (i = 0; i < ndoms_new; i++) {
7371 for (j = 0; j < ndoms_cur; j++) { 7622 for (j = 0; j < ndoms_cur; j++) {
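/*
 * With the reordered NULL handling above, a doms_new == NULL call now
 * first destroys every current domain and then rebuilds the single
 * fallback partition, i.e. it forces a full rebuild.  The hotplug
 * notifier later in this patch relies on exactly that; the wrapper name
 * below is illustrative.
 */
static void force_sched_domain_rebuild(void)
{
	/* caller must hold the hotplug lock, per the comment above */
	partition_sched_domains(0, NULL, NULL);
}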
@@ -7396,17 +7647,10 @@ match2:
7396#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7647#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7397int arch_reinit_sched_domains(void) 7648int arch_reinit_sched_domains(void)
7398{ 7649{
7399 int err;
7400
7401 get_online_cpus(); 7650 get_online_cpus();
7402 mutex_lock(&sched_domains_mutex); 7651 rebuild_sched_domains();
7403 detach_destroy_domains(&cpu_online_map);
7404 free_sched_domains();
7405 err = arch_init_sched_domains(&cpu_online_map);
7406 mutex_unlock(&sched_domains_mutex);
7407 put_online_cpus(); 7652 put_online_cpus();
7408 7653 return 0;
7409 return err;
7410} 7654}
7411 7655
7412static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 7656static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
@@ -7427,11 +7671,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7427} 7671}
7428 7672
7429#ifdef CONFIG_SCHED_MC 7673#ifdef CONFIG_SCHED_MC
7430static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) 7674static ssize_t sched_mc_power_savings_show(struct sys_device *dev,
7675 struct sysdev_attribute *attr, char *page)
7431{ 7676{
7432 return sprintf(page, "%u\n", sched_mc_power_savings); 7677 return sprintf(page, "%u\n", sched_mc_power_savings);
7433} 7678}
7434static ssize_t sched_mc_power_savings_store(struct sys_device *dev, 7679static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
7680 struct sysdev_attribute *attr,
7435 const char *buf, size_t count) 7681 const char *buf, size_t count)
7436{ 7682{
7437 return sched_power_savings_store(buf, count, 0); 7683 return sched_power_savings_store(buf, count, 0);
@@ -7441,11 +7687,13 @@ static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
7441#endif 7687#endif
7442 7688
7443#ifdef CONFIG_SCHED_SMT 7689#ifdef CONFIG_SCHED_SMT
7444static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) 7690static ssize_t sched_smt_power_savings_show(struct sys_device *dev,
7691 struct sysdev_attribute *attr, char *page)
7445{ 7692{
7446 return sprintf(page, "%u\n", sched_smt_power_savings); 7693 return sprintf(page, "%u\n", sched_smt_power_savings);
7447} 7694}
7448static ssize_t sched_smt_power_savings_store(struct sys_device *dev, 7695static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
7696 struct sysdev_attribute *attr,
7449 const char *buf, size_t count) 7697 const char *buf, size_t count)
7450{ 7698{
7451 return sched_power_savings_store(buf, count, 1); 7699 return sched_power_savings_store(buf, count, 1);
@@ -7470,54 +7718,51 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7470#endif 7718#endif
7471 return err; 7719 return err;
7472} 7720}
7473#endif 7721#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7474 7722
7723#ifndef CONFIG_CPUSETS
7475/* 7724/*
7476 * Force a reinitialization of the sched domains hierarchy. The domains 7725 * Add online and remove offline CPUs from the scheduler domains.
7477 * and groups cannot be updated in place without racing with the balancing 7726 * When cpusets are enabled they take over this function.
7478 * code, so we temporarily attach all running cpus to the NULL domain
7479 * which will prevent rebalancing while the sched domains are recalculated.
7480 */ 7727 */
7481static int update_sched_domains(struct notifier_block *nfb, 7728static int update_sched_domains(struct notifier_block *nfb,
7482 unsigned long action, void *hcpu) 7729 unsigned long action, void *hcpu)
7483{ 7730{
7484 switch (action) { 7731 switch (action) {
7485 case CPU_UP_PREPARE: 7732 case CPU_ONLINE:
7486 case CPU_UP_PREPARE_FROZEN: 7733 case CPU_ONLINE_FROZEN:
7734 case CPU_DEAD:
7735 case CPU_DEAD_FROZEN:
7736 partition_sched_domains(0, NULL, NULL);
7737 return NOTIFY_OK;
7738
7739 default:
7740 return NOTIFY_DONE;
7741 }
7742}
7743#endif
7744
7745static int update_runtime(struct notifier_block *nfb,
7746 unsigned long action, void *hcpu)
7747{
7748 int cpu = (int)(long)hcpu;
7749
7750 switch (action) {
7487 case CPU_DOWN_PREPARE: 7751 case CPU_DOWN_PREPARE:
7488 case CPU_DOWN_PREPARE_FROZEN: 7752 case CPU_DOWN_PREPARE_FROZEN:
7489 detach_destroy_domains(&cpu_online_map); 7753 disable_runtime(cpu_rq(cpu));
7490 free_sched_domains();
7491 return NOTIFY_OK; 7754 return NOTIFY_OK;
7492 7755
7493 case CPU_UP_CANCELED:
7494 case CPU_UP_CANCELED_FROZEN:
7495 case CPU_DOWN_FAILED: 7756 case CPU_DOWN_FAILED:
7496 case CPU_DOWN_FAILED_FROZEN: 7757 case CPU_DOWN_FAILED_FROZEN:
7497 case CPU_ONLINE: 7758 case CPU_ONLINE:
7498 case CPU_ONLINE_FROZEN: 7759 case CPU_ONLINE_FROZEN:
7499 case CPU_DEAD: 7760 enable_runtime(cpu_rq(cpu));
7500 case CPU_DEAD_FROZEN: 7761 return NOTIFY_OK;
7501 /* 7762
7502 * Fall through and re-initialise the domains.
7503 */
7504 break;
7505 default: 7763 default:
7506 return NOTIFY_DONE; 7764 return NOTIFY_DONE;
7507 } 7765 }
7508
7509#ifndef CONFIG_CPUSETS
7510 /*
7511 * Create default domain partitioning if cpusets are disabled.
7512 * Otherwise we let cpusets rebuild the domains based on the
7513 * current setup.
7514 */
7515
7516 /* The hotplug lock is already held by cpu_up/cpu_down */
7517 arch_init_sched_domains(&cpu_online_map);
7518#endif
7519
7520 return NOTIFY_OK;
7521} 7766}
7522 7767
7523void __init sched_init_smp(void) 7768void __init sched_init_smp(void)
@@ -7537,8 +7782,15 @@ void __init sched_init_smp(void)
7537 cpu_set(smp_processor_id(), non_isolated_cpus); 7782 cpu_set(smp_processor_id(), non_isolated_cpus);
7538 mutex_unlock(&sched_domains_mutex); 7783 mutex_unlock(&sched_domains_mutex);
7539 put_online_cpus(); 7784 put_online_cpus();
7785
7786#ifndef CONFIG_CPUSETS
7540 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7787 /* XXX: Theoretical race here - CPU may be hotplugged now */
7541 hotcpu_notifier(update_sched_domains, 0); 7788 hotcpu_notifier(update_sched_domains, 0);
7789#endif
7790
7791 /* RT runtime code needs to handle some hotplug events */
7792 hotcpu_notifier(update_runtime, 0);
7793
7542 init_hrtick(); 7794 init_hrtick();
7543 7795
7544 /* Move init over to a non-isolated CPU */ 7796 /* Move init over to a non-isolated CPU */
@@ -7695,8 +7947,8 @@ void __init sched_init(void)
7695 7947
7696 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 7948 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7697 ptr += nr_cpu_ids * sizeof(void **); 7949 ptr += nr_cpu_ids * sizeof(void **);
7698#endif 7950#endif /* CONFIG_USER_SCHED */
7699#endif 7951#endif /* CONFIG_FAIR_GROUP_SCHED */
7700#ifdef CONFIG_RT_GROUP_SCHED 7952#ifdef CONFIG_RT_GROUP_SCHED
7701 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7953 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
7702 ptr += nr_cpu_ids * sizeof(void **); 7954 ptr += nr_cpu_ids * sizeof(void **);
@@ -7710,8 +7962,8 @@ void __init sched_init(void)
7710 7962
7711 root_task_group.rt_rq = (struct rt_rq **)ptr; 7963 root_task_group.rt_rq = (struct rt_rq **)ptr;
7712 ptr += nr_cpu_ids * sizeof(void **); 7964 ptr += nr_cpu_ids * sizeof(void **);
7713#endif 7965#endif /* CONFIG_USER_SCHED */
7714#endif 7966#endif /* CONFIG_RT_GROUP_SCHED */
7715 } 7967 }
7716 7968
7717#ifdef CONFIG_SMP 7969#ifdef CONFIG_SMP
@@ -7727,8 +7979,8 @@ void __init sched_init(void)
7727#ifdef CONFIG_USER_SCHED 7979#ifdef CONFIG_USER_SCHED
7728 init_rt_bandwidth(&root_task_group.rt_bandwidth, 7980 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7729 global_rt_period(), RUNTIME_INF); 7981 global_rt_period(), RUNTIME_INF);
7730#endif 7982#endif /* CONFIG_USER_SCHED */
7731#endif 7983#endif /* CONFIG_RT_GROUP_SCHED */
7732 7984
7733#ifdef CONFIG_GROUP_SCHED 7985#ifdef CONFIG_GROUP_SCHED
7734 list_add(&init_task_group.list, &task_groups); 7986 list_add(&init_task_group.list, &task_groups);
@@ -7738,8 +7990,8 @@ void __init sched_init(void)
7738 INIT_LIST_HEAD(&root_task_group.children); 7990 INIT_LIST_HEAD(&root_task_group.children);
7739 init_task_group.parent = &root_task_group; 7991 init_task_group.parent = &root_task_group;
7740 list_add(&init_task_group.siblings, &root_task_group.children); 7992 list_add(&init_task_group.siblings, &root_task_group.children);
7741#endif 7993#endif /* CONFIG_USER_SCHED */
7742#endif 7994#endif /* CONFIG_GROUP_SCHED */
7743 7995
7744 for_each_possible_cpu(i) { 7996 for_each_possible_cpu(i) {
7745 struct rq *rq; 7997 struct rq *rq;
@@ -7819,6 +8071,7 @@ void __init sched_init(void)
7819 rq->next_balance = jiffies; 8071 rq->next_balance = jiffies;
7820 rq->push_cpu = 0; 8072 rq->push_cpu = 0;
7821 rq->cpu = i; 8073 rq->cpu = i;
8074 rq->online = 0;
7822 rq->migration_thread = NULL; 8075 rq->migration_thread = NULL;
7823 INIT_LIST_HEAD(&rq->migration_queue); 8076 INIT_LIST_HEAD(&rq->migration_queue);
7824 rq_attach_root(rq, &def_root_domain); 8077 rq_attach_root(rq, &def_root_domain);
@@ -7834,7 +8087,7 @@ void __init sched_init(void)
7834#endif 8087#endif
7835 8088
7836#ifdef CONFIG_SMP 8089#ifdef CONFIG_SMP
7837 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); 8090 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
7838#endif 8091#endif
7839 8092
7840#ifdef CONFIG_RT_MUTEXES 8093#ifdef CONFIG_RT_MUTEXES
@@ -8058,7 +8311,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8058{ 8311{
8059 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8312 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
8060} 8313}
8061#else 8314#else /* !CONFIG_FAIR_GROUP_SCHED */
8062static inline void free_fair_sched_group(struct task_group *tg) 8315static inline void free_fair_sched_group(struct task_group *tg)
8063{ 8316{
8064} 8317}
@@ -8076,7 +8329,7 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8076static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8329static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8077{ 8330{
8078} 8331}
8079#endif 8332#endif /* CONFIG_FAIR_GROUP_SCHED */
8080 8333
8081#ifdef CONFIG_RT_GROUP_SCHED 8334#ifdef CONFIG_RT_GROUP_SCHED
8082static void free_rt_sched_group(struct task_group *tg) 8335static void free_rt_sched_group(struct task_group *tg)
@@ -8147,7 +8400,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8147{ 8400{
8148 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); 8401 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8149} 8402}
8150#else 8403#else /* !CONFIG_RT_GROUP_SCHED */
8151static inline void free_rt_sched_group(struct task_group *tg) 8404static inline void free_rt_sched_group(struct task_group *tg)
8152{ 8405{
8153} 8406}
@@ -8165,7 +8418,7 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8165static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 8418static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8166{ 8419{
8167} 8420}
8168#endif 8421#endif /* CONFIG_RT_GROUP_SCHED */
8169 8422
8170#ifdef CONFIG_GROUP_SCHED 8423#ifdef CONFIG_GROUP_SCHED
8171static void free_sched_group(struct task_group *tg) 8424static void free_sched_group(struct task_group *tg)
@@ -8276,17 +8529,14 @@ void sched_move_task(struct task_struct *tsk)
8276 8529
8277 task_rq_unlock(rq, &flags); 8530 task_rq_unlock(rq, &flags);
8278} 8531}
8279#endif 8532#endif /* CONFIG_GROUP_SCHED */
8280 8533
8281#ifdef CONFIG_FAIR_GROUP_SCHED 8534#ifdef CONFIG_FAIR_GROUP_SCHED
8282static void set_se_shares(struct sched_entity *se, unsigned long shares) 8535static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8283{ 8536{
8284 struct cfs_rq *cfs_rq = se->cfs_rq; 8537 struct cfs_rq *cfs_rq = se->cfs_rq;
8285 struct rq *rq = cfs_rq->rq;
8286 int on_rq; 8538 int on_rq;
8287 8539
8288 spin_lock_irq(&rq->lock);
8289
8290 on_rq = se->on_rq; 8540 on_rq = se->on_rq;
8291 if (on_rq) 8541 if (on_rq)
8292 dequeue_entity(cfs_rq, se, 0); 8542 dequeue_entity(cfs_rq, se, 0);
@@ -8296,8 +8546,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
8296 8546
8297 if (on_rq) 8547 if (on_rq)
8298 enqueue_entity(cfs_rq, se, 0); 8548 enqueue_entity(cfs_rq, se, 0);
8549}
8299 8550
8300 spin_unlock_irq(&rq->lock); 8551static void set_se_shares(struct sched_entity *se, unsigned long shares)
8552{
8553 struct cfs_rq *cfs_rq = se->cfs_rq;
8554 struct rq *rq = cfs_rq->rq;
8555 unsigned long flags;
8556
8557 spin_lock_irqsave(&rq->lock, flags);
8558 __set_se_shares(se, shares);
8559 spin_unlock_irqrestore(&rq->lock, flags);
8301} 8560}
8302 8561
8303static DEFINE_MUTEX(shares_mutex); 8562static DEFINE_MUTEX(shares_mutex);
@@ -8336,8 +8595,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8336 * w/o tripping rebalance_share or load_balance_fair. 8595 * w/o tripping rebalance_share or load_balance_fair.
8337 */ 8596 */
8338 tg->shares = shares; 8597 tg->shares = shares;
8339 for_each_possible_cpu(i) 8598 for_each_possible_cpu(i) {
8599 /*
8600 * force a rebalance
8601 */
8602 cfs_rq_set_shares(tg->cfs_rq[i], 0);
8340 set_se_shares(tg->se[i], shares); 8603 set_se_shares(tg->se[i], shares);
8604 }
8341 8605
8342 /* 8606 /*
8343 * Enable load balance activity on this group, by inserting it back on 8607 * Enable load balance activity on this group, by inserting it back on
@@ -8376,7 +8640,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
8376#ifdef CONFIG_CGROUP_SCHED 8640#ifdef CONFIG_CGROUP_SCHED
8377static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8641static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8378{ 8642{
8379 struct task_group *tgi, *parent = tg ? tg->parent : NULL; 8643 struct task_group *tgi, *parent = tg->parent;
8380 unsigned long total = 0; 8644 unsigned long total = 0;
8381 8645
8382 if (!parent) { 8646 if (!parent) {
@@ -8400,7 +8664,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8400 } 8664 }
8401 rcu_read_unlock(); 8665 rcu_read_unlock();
8402 8666
8403 return total + to_ratio(period, runtime) < 8667 return total + to_ratio(period, runtime) <=
8404 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), 8668 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
8405 parent->rt_bandwidth.rt_runtime); 8669 parent->rt_bandwidth.rt_runtime);
8406} 8670}
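/*
 * Worked example for the '<' -> '<=' change above, assuming to_ratio()
 * expresses rt_runtime/rt_period as a fixed-point fraction:
 *
 *   parent: rt_period = 1000000 us, rt_runtime = 950000 us -> ratio 0.95
 *   child:  rt_period = 1000000 us, rt_runtime = 950000 us -> ratio 0.95
 *
 * With '<' the child was rejected even though it asks for exactly the
 * parent's budget (0.95 < 0.95 is false); with '<=' a single group may
 * now consume the whole bandwidth assigned to its parent.
 */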
@@ -8520,16 +8784,21 @@ long sched_group_rt_period(struct task_group *tg)
8520 8784
8521static int sched_rt_global_constraints(void) 8785static int sched_rt_global_constraints(void)
8522{ 8786{
8787 struct task_group *tg = &root_task_group;
8788 u64 rt_runtime, rt_period;
8523 int ret = 0; 8789 int ret = 0;
8524 8790
8791 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8792 rt_runtime = tg->rt_bandwidth.rt_runtime;
8793
8525 mutex_lock(&rt_constraints_mutex); 8794 mutex_lock(&rt_constraints_mutex);
8526 if (!__rt_schedulable(NULL, 1, 0)) 8795 if (!__rt_schedulable(tg, rt_period, rt_runtime))
8527 ret = -EINVAL; 8796 ret = -EINVAL;
8528 mutex_unlock(&rt_constraints_mutex); 8797 mutex_unlock(&rt_constraints_mutex);
8529 8798
8530 return ret; 8799 return ret;
8531} 8800}
8532#else 8801#else /* !CONFIG_RT_GROUP_SCHED */
8533static int sched_rt_global_constraints(void) 8802static int sched_rt_global_constraints(void)
8534{ 8803{
8535 unsigned long flags; 8804 unsigned long flags;
@@ -8547,7 +8816,7 @@ static int sched_rt_global_constraints(void)
8547 8816
8548 return 0; 8817 return 0;
8549} 8818}
8550#endif 8819#endif /* CONFIG_RT_GROUP_SCHED */
8551 8820
8552int sched_rt_handler(struct ctl_table *table, int write, 8821int sched_rt_handler(struct ctl_table *table, int write,
8553 struct file *filp, void __user *buffer, size_t *lenp, 8822 struct file *filp, void __user *buffer, size_t *lenp,
@@ -8655,7 +8924,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8655 8924
8656 return (u64) tg->shares; 8925 return (u64) tg->shares;
8657} 8926}
8658#endif 8927#endif /* CONFIG_FAIR_GROUP_SCHED */
8659 8928
8660#ifdef CONFIG_RT_GROUP_SCHED 8929#ifdef CONFIG_RT_GROUP_SCHED
8661static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 8930static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
@@ -8679,7 +8948,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
8679{ 8948{
8680 return sched_group_rt_period(cgroup_tg(cgrp)); 8949 return sched_group_rt_period(cgroup_tg(cgrp));
8681} 8950}
8682#endif 8951#endif /* CONFIG_RT_GROUP_SCHED */
8683 8952
8684static struct cftype cpu_files[] = { 8953static struct cftype cpu_files[] = {
8685#ifdef CONFIG_FAIR_GROUP_SCHED 8954#ifdef CONFIG_FAIR_GROUP_SCHED