path: root/kernel/sched.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	| 1063
1 file changed, 651 insertions(+), 412 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 66b2ed784822..a07cff90d849 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -166,7 +166,7 @@
 #define SCALE_PRIO(x, prio) \
 	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
 
-static inline unsigned int task_timeslice(task_t *p)
+static unsigned int task_timeslice(task_t *p)
 {
 	if (p->static_prio < NICE_TO_PRIO(0))
 		return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
@@ -206,7 +206,7 @@ struct runqueue {
  */
 	unsigned long nr_running;
 #ifdef CONFIG_SMP
-	unsigned long cpu_load;
+	unsigned long cpu_load[3];
 #endif
 	unsigned long long nr_switches;
 
@@ -260,23 +260,87 @@ struct runqueue {
 
 static DEFINE_PER_CPU(struct runqueue, runqueues);
 
+/*
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * See detach_destroy_domains: synchronize_sched for details.
+ *
+ * The domain tree of any CPU may only be accessed from within
+ * preempt-disabled sections.
+ */
 #define for_each_domain(cpu, domain) \
-	for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
+	for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)
 
 #define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
 #define this_rq()		(&__get_cpu_var(runqueues))
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
-/*
- * Default context-switch locking:
- */
 #ifndef prepare_arch_switch
-# define prepare_arch_switch(rq, next)	do { } while (0)
-# define finish_arch_switch(rq, next)	spin_unlock_irq(&(rq)->lock)
-# define task_running(rq, p)		((rq)->curr == (p))
+# define prepare_arch_switch(next)	do { } while (0)
+#endif
+#ifndef finish_arch_switch
+# define finish_arch_switch(prev)	do { } while (0)
 #endif
 
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
+static inline int task_running(runqueue_t *rq, task_t *p)
+{
+	return rq->curr == p;
+}
+
+static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+{
+}
+
+static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+{
+	spin_unlock_irq(&rq->lock);
+}
+
+#else /* __ARCH_WANT_UNLOCKED_CTXSW */
+static inline int task_running(runqueue_t *rq, task_t *p)
+{
+#ifdef CONFIG_SMP
+	return p->oncpu;
+#else
+	return rq->curr == p;
+#endif
+}
+
+static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+{
+#ifdef CONFIG_SMP
+	/*
+	 * We can optimise this out completely for !SMP, because the
+	 * SMP rebalancing from interrupt is the only thing that cares
+	 * here.
+	 */
+	next->oncpu = 1;
+#endif
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	spin_unlock_irq(&rq->lock);
+#else
+	spin_unlock(&rq->lock);
+#endif
+}
+
+static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+{
+#ifdef CONFIG_SMP
+	/*
+	 * After ->oncpu is cleared, the task can be moved to a different CPU.
+	 * We must ensure this doesn't happen until the switch is completely
+	 * finished.
+	 */
+	smp_wmb();
+	prev->oncpu = 0;
+#endif
+#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	local_irq_enable();
+#endif
+}
+#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
+
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
  * interrupts. Note the ordering: we can safely lookup the task_rq without
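The new comment above changes the locking contract for every reader of rq->sd. A minimal sketch of a conforming reader (hypothetical helper, not part of this diff; show_schedstat() below is the real in-tree example):

	/* Hypothetical caller, shown only to illustrate the preempt-disabled rule. */
	static unsigned long count_balance_domains(int cpu)
	{
		struct sched_domain *sd;
		unsigned long n = 0;

		preempt_disable();	/* read side of the synchronize_sched scheme */
		for_each_domain(cpu, sd)
			if (sd->flags & SD_LOAD_BALANCE)
				n++;
		preempt_enable();

		return n;
	}

Disabling preemption is what holds off synchronize_sched() in detach_destroy_domains, so the domain tree cannot be freed mid-walk.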
@@ -309,7 +373,7 @@ static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
  * bump this up when changing the output format or the meaning of an existing
  * format, so that tools can adapt (or abort)
  */
-#define SCHEDSTAT_VERSION 11
+#define SCHEDSTAT_VERSION 12
 
 static int show_schedstat(struct seq_file *seq, void *v)
 {
@@ -338,6 +402,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
 
 #ifdef CONFIG_SMP
 	/* domain-specific stats */
+	preempt_disable();
 	for_each_domain(cpu, sd) {
 		enum idle_type itype;
 		char mask_str[NR_CPUS];
@@ -356,11 +421,13 @@ static int show_schedstat(struct seq_file *seq, void *v)
 			    sd->lb_nobusyq[itype],
 			    sd->lb_nobusyg[itype]);
 		}
-		seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n",
+		seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
 		    sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
-		    sd->sbe_pushed, sd->sbe_attempts,
+		    sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
+		    sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
 		    sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
 	}
+	preempt_enable();
 #endif
 	}
 	return 0;
@@ -414,22 +481,6 @@ static inline runqueue_t *this_rq_lock(void)
 	return rq;
 }
 
-#ifdef CONFIG_SCHED_SMT
-static int cpu_and_siblings_are_idle(int cpu)
-{
-	int sib;
-	for_each_cpu_mask(sib, cpu_sibling_map[cpu]) {
-		if (idle_cpu(sib))
-			continue;
-		return 0;
-	}
-
-	return 1;
-}
-#else
-#define cpu_and_siblings_are_idle(A) idle_cpu(A)
-#endif
-
 #ifdef CONFIG_SCHEDSTATS
 /*
  * Called when a process is dequeued from the active array and given
@@ -622,7 +673,7 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 	rq->nr_running++;
 }
 
-static void recalc_task_prio(task_t *p, unsigned long long now)
+static int recalc_task_prio(task_t *p, unsigned long long now)
 {
 	/* Caller must always ensure 'now >= p->timestamp' */
 	unsigned long long __sleep_time = now - p->timestamp;
@@ -681,7 +732,7 @@ static void recalc_task_prio(task_t *p, unsigned long long now)
 		}
 	}
 
-	p->prio = effective_prio(p);
+	return effective_prio(p);
 }
 
 /*
@@ -704,7 +755,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
 	}
 #endif
 
-	recalc_task_prio(p, now);
+	p->prio = recalc_task_prio(p, now);
 
 	/*
 	 * This checks to make sure it's not an uninterruptible task
@@ -782,22 +833,12 @@ inline int task_curr(const task_t *p)
 }
 
 #ifdef CONFIG_SMP
-enum request_type {
-	REQ_MOVE_TASK,
-	REQ_SET_DOMAIN,
-};
-
 typedef struct {
 	struct list_head list;
-	enum request_type type;
 
-	/* For REQ_MOVE_TASK */
 	task_t *task;
 	int dest_cpu;
 
-	/* For REQ_SET_DOMAIN */
-	struct sched_domain *sd;
-
 	struct completion done;
 } migration_req_t;
 
@@ -819,7 +860,6 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
 	}
 
 	init_completion(&req->done);
-	req->type = REQ_MOVE_TASK;
 	req->task = p;
 	req->dest_cpu = dest_cpu;
 	list_add(&req->list, &rq->migration_queue);
@@ -886,26 +926,154 @@ void kick_process(task_t *p)
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static inline unsigned long source_load(int cpu)
+static inline unsigned long source_load(int cpu, int type)
 {
 	runqueue_t *rq = cpu_rq(cpu);
 	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+	if (type == 0)
+		return load_now;
 
-	return min(rq->cpu_load, load_now);
+	return min(rq->cpu_load[type-1], load_now);
 }
 
 /*
  * Return a high guess at the load of a migration-target cpu
  */
-static inline unsigned long target_load(int cpu)
+static inline unsigned long target_load(int cpu, int type)
 {
 	runqueue_t *rq = cpu_rq(cpu);
 	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+	if (type == 0)
+		return load_now;
 
-	return max(rq->cpu_load, load_now);
+	return max(rq->cpu_load[type-1], load_now);
 }
 
-#endif
+/*
+ * find_idlest_group finds and returns the least busy CPU group within the
+ * domain.
+ */
+static struct sched_group *
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+{
+	struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+	unsigned long min_load = ULONG_MAX, this_load = 0;
+	int load_idx = sd->forkexec_idx;
+	int imbalance = 100 + (sd->imbalance_pct-100)/2;
+
+	do {
+		unsigned long load, avg_load;
+		int local_group;
+		int i;
+
+		local_group = cpu_isset(this_cpu, group->cpumask);
+		/* XXX: put a cpus allowed check */
+
+		/* Tally up the load of all CPUs in the group */
+		avg_load = 0;
+
+		for_each_cpu_mask(i, group->cpumask) {
+			/* Bias balancing toward cpus of our domain */
+			if (local_group)
+				load = source_load(i, load_idx);
+			else
+				load = target_load(i, load_idx);
+
+			avg_load += load;
+		}
+
+		/* Adjust by relative CPU power of the group */
+		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+
+		if (local_group) {
+			this_load = avg_load;
+			this = group;
+		} else if (avg_load < min_load) {
+			min_load = avg_load;
+			idlest = group;
+		}
+		group = group->next;
+	} while (group != sd->groups);
+
+	if (!idlest || 100*this_load < imbalance*min_load)
+		return NULL;
+	return idlest;
+}
+
+/*
+ * find_idlest_queue - find the idlest runqueue among the cpus in group.
+ */
+static int find_idlest_cpu(struct sched_group *group, int this_cpu)
+{
+	unsigned long load, min_load = ULONG_MAX;
+	int idlest = -1;
+	int i;
+
+	for_each_cpu_mask(i, group->cpumask) {
+		load = source_load(i, 0);
+
+		if (load < min_load || (load == min_load && i == this_cpu)) {
+			min_load = load;
+			idlest = i;
+		}
+	}
+
+	return idlest;
+}
+
+/*
+ * sched_balance_self: balance the current task (running on cpu) in domains
+ * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
+ * SD_BALANCE_EXEC.
+ *
+ * Balance, ie. select the least loaded group.
+ *
+ * Returns the target CPU number, or the same CPU if no balancing is needed.
+ *
+ * preempt must be disabled.
+ */
+static int sched_balance_self(int cpu, int flag)
+{
+	struct task_struct *t = current;
+	struct sched_domain *tmp, *sd = NULL;
+
+	for_each_domain(cpu, tmp)
+		if (tmp->flags & flag)
+			sd = tmp;
+
+	while (sd) {
+		cpumask_t span;
+		struct sched_group *group;
+		int new_cpu;
+		int weight;
+
+		span = sd->span;
+		group = find_idlest_group(sd, t, cpu);
+		if (!group)
+			goto nextlevel;
+
+		new_cpu = find_idlest_cpu(group, cpu);
+		if (new_cpu == -1 || new_cpu == cpu)
+			goto nextlevel;
+
+		/* Now try balancing at a lower domain level */
+		cpu = new_cpu;
+nextlevel:
+		sd = NULL;
+		weight = cpus_weight(span);
+		for_each_domain(cpu, tmp) {
+			if (weight <= cpus_weight(tmp->span))
+				break;
+			if (tmp->flags & flag)
+				sd = tmp;
+		}
+		/* while loop will break here if sd == NULL */
+	}
+
+	return cpu;
+}
+
+#endif /* CONFIG_SMP */
 
 /*
  * wake_idle() will wake a task on an idle cpu if task->cpu is
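An aside on the new 'type' argument (an illustration; the helper below is an assumption, not code from this patch): type 0 requests the instantaneous nr_running-based figure, while types 1..3 select the increasingly damped cpu_load[type-1] averages, clamped low for sources via min() and high for targets via max(), so both sides of a migration are estimated conservatively:

	/* Hypothetical debug helper; name and printk format are assumptions. */
	static void dump_load_views(int cpu)
	{
		int type;

		for (type = 0; type <= 3; type++)
			printk(KERN_DEBUG "cpu%d type=%d source=%lu target=%lu\n",
				cpu, type,
				source_load(cpu, type), target_load(cpu, type));
	}

find_idlest_group() picks its index from sd->forkexec_idx, while the wakeup and rebalance paths below use wake_idx and the busy/newidle/idle indices respectively.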
@@ -927,14 +1095,14 @@ static int wake_idle(int cpu, task_t *p)
 
 	for_each_domain(cpu, sd) {
 		if (sd->flags & SD_WAKE_IDLE) {
-			cpus_and(tmp, sd->span, cpu_online_map);
-			cpus_and(tmp, tmp, p->cpus_allowed);
+			cpus_and(tmp, sd->span, p->cpus_allowed);
 			for_each_cpu_mask(i, tmp) {
 				if (idle_cpu(i))
 					return i;
 			}
 		}
-		else break;
+		else
+			break;
 	}
 	return cpu;
 }
@@ -967,7 +1135,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
 	runqueue_t *rq;
 #ifdef CONFIG_SMP
 	unsigned long load, this_load;
-	struct sched_domain *sd;
+	struct sched_domain *sd, *this_sd = NULL;
 	int new_cpu;
 #endif
 
@@ -986,70 +1154,69 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
 	if (unlikely(task_running(rq, p)))
 		goto out_activate;
 
-#ifdef CONFIG_SCHEDSTATS
+	new_cpu = cpu;
+
 	schedstat_inc(rq, ttwu_cnt);
 	if (cpu == this_cpu) {
 		schedstat_inc(rq, ttwu_local);
-	} else {
-		for_each_domain(this_cpu, sd) {
-			if (cpu_isset(cpu, sd->span)) {
-				schedstat_inc(sd, ttwu_wake_remote);
-				break;
-			}
+		goto out_set_cpu;
+	}
+
+	for_each_domain(this_cpu, sd) {
+		if (cpu_isset(cpu, sd->span)) {
+			schedstat_inc(sd, ttwu_wake_remote);
+			this_sd = sd;
+			break;
 		}
 	}
-#endif
 
-	new_cpu = cpu;
-	if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
 		goto out_set_cpu;
 
-	load = source_load(cpu);
-	this_load = target_load(this_cpu);
-
 	/*
-	 * If sync wakeup then subtract the (maximum possible) effect of
-	 * the currently running task from the load of the current CPU:
+	 * Check for affine wakeup and passive balancing possibilities.
 	 */
-	if (sync)
-		this_load -= SCHED_LOAD_SCALE;
-
-	/* Don't pull the task off an idle CPU to a busy one */
-	if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
-		goto out_set_cpu;
+	if (this_sd) {
+		int idx = this_sd->wake_idx;
+		unsigned int imbalance;
 
-	new_cpu = this_cpu; /* Wake to this CPU if we can */
+		imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
 
-	/*
-	 * Scan domains for affine wakeup and passive balancing
-	 * possibilities.
-	 */
-	for_each_domain(this_cpu, sd) {
-		unsigned int imbalance;
-		/*
-		 * Start passive balancing when half the imbalance_pct
-		 * limit is reached.
-		 */
-		imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
+		load = source_load(cpu, idx);
+		this_load = target_load(this_cpu, idx);
 
-		if ((sd->flags & SD_WAKE_AFFINE) &&
-				!task_hot(p, rq->timestamp_last_tick, sd)) {
+		new_cpu = this_cpu; /* Wake to this CPU if we can */
+
+		if (this_sd->flags & SD_WAKE_AFFINE) {
+			unsigned long tl = this_load;
 			/*
-			 * This domain has SD_WAKE_AFFINE and p is cache cold
-			 * in this domain.
+			 * If sync wakeup then subtract the (maximum possible)
+			 * effect of the currently running task from the load
+			 * of the current CPU:
 			 */
-			if (cpu_isset(cpu, sd->span)) {
-				schedstat_inc(sd, ttwu_move_affine);
+			if (sync)
+				tl -= SCHED_LOAD_SCALE;
+
+			if ((tl <= load &&
+				tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
+				100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
+				/*
+				 * This domain has SD_WAKE_AFFINE and
+				 * p is cache cold in this domain, and
+				 * there is no bad imbalance.
+				 */
+				schedstat_inc(this_sd, ttwu_move_affine);
 				goto out_set_cpu;
 			}
-		} else if ((sd->flags & SD_WAKE_BALANCE) &&
-				imbalance*this_load <= 100*load) {
-			/*
-			 * This domain has SD_WAKE_BALANCE and there is
-			 * an imbalance.
-			 */
-			if (cpu_isset(cpu, sd->span)) {
-				schedstat_inc(sd, ttwu_move_balance);
+		}
+
+		/*
+		 * Start passive balancing when half the imbalance_pct
+		 * limit is reached.
+		 */
+		if (this_sd->flags & SD_WAKE_BALANCE) {
+			if (imbalance*this_load <= 100*load) {
+				schedstat_inc(this_sd, ttwu_move_balance);
 				goto out_set_cpu;
 			}
 		}
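A worked reading of the new wakeup test, with assumed defaults (SCHED_LOAD_SCALE == 128 and imbalance_pct == 125 are typical values for this era, not numbers stated in the diff): imbalance = 100 + (125 - 100)/2 = 112, so the passive-balance clause "imbalance*this_load <= 100*load" fires once this CPU carries at most 100/112, roughly 89%, of the waking CPU's load. For a sync wakeup with one runnable task here, tl = 128 - 128 = 0, and the affine clause reduces to target_load(cpu, idx) <= 128: pull the task over whenever the previous CPU would be left with at most one task's worth of load.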
@@ -1120,17 +1287,19 @@ int fastcall wake_up_state(task_t *p, unsigned int state)
 	return try_to_wake_up(p, state, 0);
 }
 
-#ifdef CONFIG_SMP
-static int find_idlest_cpu(struct task_struct *p, int this_cpu,
-			   struct sched_domain *sd);
-#endif
-
 /*
  * Perform scheduler related setup for a newly forked process p.
  * p is forked by current.
  */
-void fastcall sched_fork(task_t *p)
+void fastcall sched_fork(task_t *p, int clone_flags)
 {
+	int cpu = get_cpu();
+
+#ifdef CONFIG_SMP
+	cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
+#endif
+	set_task_cpu(p, cpu);
+
 	/*
 	 * We mark the process as running here, but have not actually
 	 * inserted it onto the runqueue yet. This guarantees that
@@ -1140,17 +1309,14 @@ void fastcall sched_fork(task_t *p)
 	p->state = TASK_RUNNING;
 	INIT_LIST_HEAD(&p->run_list);
 	p->array = NULL;
-	spin_lock_init(&p->switch_lock);
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+	p->oncpu = 0;
+#endif
 #ifdef CONFIG_PREEMPT
-	/*
-	 * During context-switch we hold precisely one spinlock, which
-	 * schedule_tail drops. (in the common case it's this_rq()->lock,
-	 * but it also can be p->switch_lock.) So we compensate with a count
-	 * of 1. Also, we want to start with kernel preemption disabled.
-	 */
+	/* Want to start with kernel preemption disabled. */
 	p->thread_info->preempt_count = 1;
 #endif
 	/*
@@ -1174,12 +1340,10 @@ void fastcall sched_fork(task_t *p)
 	 * runqueue lock is not a problem.
 	 */
 	current->time_slice = 1;
-	preempt_disable();
 	scheduler_tick();
-	local_irq_enable();
-	preempt_enable();
-	} else
-		local_irq_enable();
+	}
+	local_irq_enable();
+	put_cpu();
 }
 
 /*
@@ -1196,10 +1360,9 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
 	runqueue_t *rq, *this_rq;
 
 	rq = task_rq_lock(p, &flags);
-	cpu = task_cpu(p);
-	this_cpu = smp_processor_id();
-
 	BUG_ON(p->state != TASK_RUNNING);
+	this_cpu = smp_processor_id();
+	cpu = task_cpu(p);
 
 	/*
 	 * We decrease the sleep average of forking parents
@@ -1296,22 +1459,40 @@ void fastcall sched_exit(task_t * p)
 }
 
 /**
+ * prepare_task_switch - prepare to switch tasks
+ * @rq: the runqueue preparing to switch
+ * @next: the task we are going to switch to.
+ *
+ * This is called with the rq lock held and interrupts off. It must
+ * be paired with a subsequent finish_task_switch after the context
+ * switch.
+ *
+ * prepare_task_switch sets up locking and calls architecture specific
+ * hooks.
+ */
+static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
+{
+	prepare_lock_switch(rq, next);
+	prepare_arch_switch(next);
+}
+
+/**
  * finish_task_switch - clean up after a task-switch
  * @prev: the thread we just switched away from.
  *
- * We enter this with the runqueue still locked, and finish_arch_switch()
- * will unlock it along with doing any other architecture-specific cleanup
- * actions.
+ * finish_task_switch must be called after the context switch, paired
+ * with a prepare_task_switch call before the context switch.
+ * finish_task_switch will reconcile locking set up by prepare_task_switch,
+ * and do any other architecture-specific cleanup actions.
  *
  * Note that we may have delayed dropping an mm in context_switch(). If
  * so, we finish that here outside of the runqueue lock. (Doing it
  * with the lock held can cause deadlocks; see schedule() for
  * details.)
  */
-static inline void finish_task_switch(task_t *prev)
+static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
 	__releases(rq->lock)
 {
-	runqueue_t *rq = this_rq();
 	struct mm_struct *mm = rq->prev_mm;
 	unsigned long prev_task_flags;
 
1317 1498
@@ -1329,7 +1510,8 @@ static inline void finish_task_switch(task_t *prev)
1329 * Manfred Spraul <manfred@colorfullife.com> 1510 * Manfred Spraul <manfred@colorfullife.com>
1330 */ 1511 */
1331 prev_task_flags = prev->flags; 1512 prev_task_flags = prev->flags;
1332 finish_arch_switch(rq, prev); 1513 finish_arch_switch(prev);
1514 finish_lock_switch(rq, prev);
1333 if (mm) 1515 if (mm)
1334 mmdrop(mm); 1516 mmdrop(mm);
1335 if (unlikely(prev_task_flags & PF_DEAD)) 1517 if (unlikely(prev_task_flags & PF_DEAD))
@@ -1343,8 +1525,12 @@ static inline void finish_task_switch(task_t *prev)
 asmlinkage void schedule_tail(task_t *prev)
 	__releases(rq->lock)
 {
-	finish_task_switch(prev);
-
+	runqueue_t *rq = this_rq();
+	finish_task_switch(rq, prev);
+#ifdef __ARCH_WANT_UNLOCKED_CTXSW
+	/* In this case, finish_task_switch does not reenable preemption */
+	preempt_enable();
+#endif
 	if (current->set_child_tid)
 		put_user(current->pid, current->set_child_tid);
 }
@@ -1494,51 +1680,6 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
 }
 
 /*
- * find_idlest_cpu - find the least busy runqueue.
- */
-static int find_idlest_cpu(struct task_struct *p, int this_cpu,
-			   struct sched_domain *sd)
-{
-	unsigned long load, min_load, this_load;
-	int i, min_cpu;
-	cpumask_t mask;
-
-	min_cpu = UINT_MAX;
-	min_load = ULONG_MAX;
-
-	cpus_and(mask, sd->span, p->cpus_allowed);
-
-	for_each_cpu_mask(i, mask) {
-		load = target_load(i);
-
-		if (load < min_load) {
-			min_cpu = i;
-			min_load = load;
-
-			/* break out early on an idle CPU: */
-			if (!min_load)
-				break;
-		}
-	}
-
-	/* add +1 to account for the new task */
-	this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
-
-	/*
-	 * Would with the addition of the new task to the
-	 * current CPU there be an imbalance between this
-	 * CPU and the idlest CPU?
-	 *
-	 * Use half of the balancing threshold - new-context is
-	 * a good opportunity to balance.
-	 */
-	if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
-		return min_cpu;
-
-	return this_cpu;
-}
-
-/*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
  * allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@ -1571,37 +1712,16 @@ out:
 }
 
 /*
- * sched_exec(): find the highest-level, exec-balance-capable
- * domain and try to migrate the task to the least loaded CPU.
- *
- * execve() is a valuable balancing opportunity, because at this point
- * the task has the smallest effective memory and cache footprint.
+ * sched_exec - execve() is a valuable balancing opportunity, because at
+ * this point the task has the smallest effective memory and cache footprint.
  */
 void sched_exec(void)
 {
-	struct sched_domain *tmp, *sd = NULL;
 	int new_cpu, this_cpu = get_cpu();
-
-	/* Prefer the current CPU if there's only this task running */
-	if (this_rq()->nr_running <= 1)
-		goto out;
-
-	for_each_domain(this_cpu, tmp)
-		if (tmp->flags & SD_BALANCE_EXEC)
-			sd = tmp;
-
-	if (sd) {
-		schedstat_inc(sd, sbe_attempts);
-		new_cpu = find_idlest_cpu(current, this_cpu, sd);
-		if (new_cpu != this_cpu) {
-			schedstat_inc(sd, sbe_pushed);
-			put_cpu();
-			sched_migrate_task(current, new_cpu);
-			return;
-		}
-	}
-out:
+	new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
 	put_cpu();
+	if (new_cpu != this_cpu)
+		sched_migrate_task(current, new_cpu);
 }
 
 /*
@@ -1632,7 +1752,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
  */
 static inline
 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
-		     struct sched_domain *sd, enum idle_type idle)
+		     struct sched_domain *sd, enum idle_type idle, int *all_pinned)
 {
 	/*
 	 * We do not migrate tasks that are:
@@ -1640,23 +1760,24 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
 	 * 3) are cache-hot on their current CPU.
 	 */
-	if (task_running(rq, p))
-		return 0;
 	if (!cpu_isset(this_cpu, p->cpus_allowed))
 		return 0;
+	*all_pinned = 0;
+
+	if (task_running(rq, p))
+		return 0;
 
 	/*
 	 * Aggressive migration if:
-	 * 1) the [whole] cpu is idle, or
+	 * 1) task is cache cold, or
 	 * 2) too many balance attempts have failed.
 	 */
 
-	if (cpu_and_siblings_are_idle(this_cpu) || \
-			sd->nr_balance_failed > sd->cache_nice_tries)
+	if (sd->nr_balance_failed > sd->cache_nice_tries)
 		return 1;
 
 	if (task_hot(p, rq->timestamp_last_tick, sd))
 		return 0;
 	return 1;
 }
 
@@ -1669,16 +1790,18 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
  */
 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
 		      unsigned long max_nr_move, struct sched_domain *sd,
-		      enum idle_type idle)
+		      enum idle_type idle, int *all_pinned)
 {
 	prio_array_t *array, *dst_array;
 	struct list_head *head, *curr;
-	int idx, pulled = 0;
+	int idx, pulled = 0, pinned = 0;
 	task_t *tmp;
 
-	if (max_nr_move <= 0 || busiest->nr_running <= 1)
+	if (max_nr_move == 0)
 		goto out;
 
+	pinned = 1;
+
 	/*
 	 * We first consider expired tasks. Those will likely not be
 	 * executed in the near future, and they are most likely to
@@ -1717,7 +1840,7 @@ skip_queue:
 
 	curr = curr->prev;
 
-	if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
+	if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
 		if (curr != head)
 			goto skip_queue;
 		idx++;
@@ -1746,6 +1869,9 @@ out:
 	 * inside pull_task().
 	 */
 	schedstat_add(sd, lb_gained[idle], pulled);
+
+	if (all_pinned)
+		*all_pinned = pinned;
 	return pulled;
 }
 
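A note on the pinned-task protocol introduced here (a reading of the code above, not text from the patch): 'pinned' starts at 1 once there is anything to move, and can_migrate_task() clears it the first time any candidate passes the cpus_allowed check, so the flag survives only when every task on the busiest queue was pinned away from this_cpu. Callers that cannot usefully back off pass NULL, as load_balance_newidle() and active_load_balance() do below; load_balance() opts in and uses the flag to jump straight to out_balanced and stretch balance_interval up to MAX_PINNED_INTERVAL, rather than re-attempting a migration that can never succeed.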
@@ -1760,8 +1886,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 {
 	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
 	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+	int load_idx;
 
 	max_load = this_load = total_load = total_pwr = 0;
+	if (idle == NOT_IDLE)
+		load_idx = sd->busy_idx;
+	else if (idle == NEWLY_IDLE)
+		load_idx = sd->newidle_idx;
+	else
+		load_idx = sd->idle_idx;
 
 	do {
 		unsigned long load;
@@ -1776,9 +1909,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		for_each_cpu_mask(i, group->cpumask) {
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
-				load = target_load(i);
+				load = target_load(i, load_idx);
 			else
-				load = source_load(i);
+				load = source_load(i, load_idx);
 
 			avg_load += load;
 		}
@@ -1792,12 +1925,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		if (local_group) {
 			this_load = avg_load;
 			this = group;
-			goto nextgroup;
 		} else if (avg_load > max_load) {
 			max_load = avg_load;
 			busiest = group;
 		}
-nextgroup:
 		group = group->next;
 	} while (group != sd->groups);
 
@@ -1870,15 +2001,9 @@ nextgroup:
 
 	/* Get rid of the scaling factor, rounding down as we divide */
 	*imbalance = *imbalance / SCHED_LOAD_SCALE;
-
 	return busiest;
 
 out_balanced:
-	if (busiest && (idle == NEWLY_IDLE ||
-			(idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
-		*imbalance = 1;
-		return busiest;
-	}
 
 	*imbalance = 0;
 	return NULL;
@@ -1894,7 +2019,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
 	int i;
 
 	for_each_cpu_mask(i, group->cpumask) {
-		load = source_load(i);
+		load = source_load(i, 0);
 
 		if (load > max_load) {
 			max_load = load;
@@ -1906,6 +2031,12 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
 }
 
 /*
+ * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
+ * so long as it is large enough.
+ */
+#define MAX_PINNED_INTERVAL	512
+
+/*
  * Check this_cpu to ensure it is balanced within domain. Attempt to move
  * tasks if there is an imbalance.
  *
@@ -1917,7 +2048,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 	struct sched_group *group;
 	runqueue_t *busiest;
 	unsigned long imbalance;
-	int nr_moved;
+	int nr_moved, all_pinned = 0;
+	int active_balance = 0;
 
 	spin_lock(&this_rq->lock);
 	schedstat_inc(sd, lb_cnt[idle]);
@@ -1934,15 +2066,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 		goto out_balanced;
 	}
 
-	/*
-	 * This should be "impossible", but since load
-	 * balancing is inherently racy and statistical,
-	 * it could happen in theory.
-	 */
-	if (unlikely(busiest == this_rq)) {
-		WARN_ON(1);
-		goto out_balanced;
-	}
+	BUG_ON(busiest == this_rq);
 
 	schedstat_add(sd, lb_imbalance[idle], imbalance);
 
@@ -1956,9 +2080,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 		 */
 		double_lock_balance(this_rq, busiest);
 		nr_moved = move_tasks(this_rq, this_cpu, busiest,
-						imbalance, sd, idle);
+						imbalance, sd, idle,
+						&all_pinned);
 		spin_unlock(&busiest->lock);
+
+		/* All tasks on this runqueue were pinned by CPU affinity */
+		if (unlikely(all_pinned))
+			goto out_balanced;
 	}
+
 	spin_unlock(&this_rq->lock);
 
 	if (!nr_moved) {
@@ -1966,36 +2096,38 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 		sd->nr_balance_failed++;
 
 		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
-			int wake = 0;
 
 			spin_lock(&busiest->lock);
 			if (!busiest->active_balance) {
 				busiest->active_balance = 1;
 				busiest->push_cpu = this_cpu;
-				wake = 1;
+				active_balance = 1;
 			}
 			spin_unlock(&busiest->lock);
-			if (wake)
+			if (active_balance)
 				wake_up_process(busiest->migration_thread);
 
 			/*
 			 * We've kicked active balancing, reset the failure
 			 * counter.
 			 */
-			sd->nr_balance_failed = sd->cache_nice_tries;
+			sd->nr_balance_failed = sd->cache_nice_tries+1;
 		}
-
-		/*
-		 * We were unbalanced, but unsuccessful in move_tasks(),
-		 * so bump the balance_interval to lessen the lock contention.
-		 */
-		if (sd->balance_interval < sd->max_interval)
-			sd->balance_interval++;
-	} else {
+	} else
 		sd->nr_balance_failed = 0;
 
+	if (likely(!active_balance)) {
 		/* We were unbalanced, so reset the balancing interval */
 		sd->balance_interval = sd->min_interval;
+	} else {
+		/*
+		 * If we've begun active balancing, start to back off. This
+		 * case may not be covered by the all_pinned logic if there
+		 * is only 1 task on the busy runqueue (because we don't call
+		 * move_tasks).
+		 */
+		if (sd->balance_interval < sd->max_interval)
+			sd->balance_interval *= 2;
 	}
 
 	return nr_moved;
@@ -2005,8 +2137,10 @@ out_balanced:
 
 	schedstat_inc(sd, lb_balanced[idle]);
 
+	sd->nr_balance_failed = 0;
 	/* tune up the balancing interval */
-	if (sd->balance_interval < sd->max_interval)
+	if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
+			(sd->balance_interval < sd->max_interval))
 		sd->balance_interval *= 2;
 
 	return 0;
@@ -2030,31 +2164,36 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 	schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
 	group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
 	if (!group) {
-		schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
 		schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
-		goto out;
+		goto out_balanced;
 	}
 
 	busiest = find_busiest_queue(group);
-	if (!busiest || busiest == this_rq) {
-		schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
-		goto out;
+		goto out_balanced;
 	}
 
+	BUG_ON(busiest == this_rq);
+
 	/* Attempt to move tasks */
 	double_lock_balance(this_rq, busiest);
 
 	schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
 	nr_moved = move_tasks(this_rq, this_cpu, busiest,
-					imbalance, sd, NEWLY_IDLE);
+					imbalance, sd, NEWLY_IDLE, NULL);
 	if (!nr_moved)
 		schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
+	else
+		sd->nr_balance_failed = 0;
 
 	spin_unlock(&busiest->lock);
-
-out:
 	return nr_moved;
+
+out_balanced:
+	schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+	sd->nr_balance_failed = 0;
+	return 0;
 }
 
 /*
@@ -2086,56 +2225,42 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
 static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
 {
 	struct sched_domain *sd;
-	struct sched_group *cpu_group;
 	runqueue_t *target_rq;
-	cpumask_t visited_cpus;
-	int cpu;
+	int target_cpu = busiest_rq->push_cpu;
+
+	if (busiest_rq->nr_running <= 1)
+		/* no task to move */
+		return;
+
+	target_rq = cpu_rq(target_cpu);
 
 	/*
-	 * Search for suitable CPUs to push tasks to in successively higher
-	 * domains with SD_LOAD_BALANCE set.
+	 * This condition is "impossible", if it occurs
+	 * we need to fix it. Originally reported by
+	 * Bjorn Helgaas on a 128-cpu setup.
 	 */
-	visited_cpus = CPU_MASK_NONE;
-	for_each_domain(busiest_cpu, sd) {
-		if (!(sd->flags & SD_LOAD_BALANCE))
-			/* no more domains to search */
-			break;
+	BUG_ON(busiest_rq == target_rq);
 
-		schedstat_inc(sd, alb_cnt);
+	/* move a task from busiest_rq to target_rq */
+	double_lock_balance(busiest_rq, target_rq);
 
-		cpu_group = sd->groups;
-		do {
-			for_each_cpu_mask(cpu, cpu_group->cpumask) {
-				if (busiest_rq->nr_running <= 1)
-					/* no more tasks left to move */
-					return;
-				if (cpu_isset(cpu, visited_cpus))
-					continue;
-				cpu_set(cpu, visited_cpus);
-				if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu)
-					continue;
-
-				target_rq = cpu_rq(cpu);
-				/*
-				 * This condition is "impossible", if it occurs
-				 * we need to fix it. Originally reported by
-				 * Bjorn Helgaas on a 128-cpu setup.
-				 */
-				BUG_ON(busiest_rq == target_rq);
-
-				/* move a task from busiest_rq to target_rq */
-				double_lock_balance(busiest_rq, target_rq);
-				if (move_tasks(target_rq, cpu, busiest_rq,
-						1, sd, SCHED_IDLE)) {
-					schedstat_inc(sd, alb_pushed);
-				} else {
-					schedstat_inc(sd, alb_failed);
-				}
-				spin_unlock(&target_rq->lock);
-			}
-			cpu_group = cpu_group->next;
-		} while (cpu_group != sd->groups);
-	}
+	/* Search for an sd spanning us and the target CPU. */
+	for_each_domain(target_cpu, sd)
+		if ((sd->flags & SD_LOAD_BALANCE) &&
+			cpu_isset(busiest_cpu, sd->span))
+				break;
+
+	if (unlikely(sd == NULL))
+		goto out;
+
+	schedstat_inc(sd, alb_cnt);
+
+	if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
+		schedstat_inc(sd, alb_pushed);
+	else
+		schedstat_inc(sd, alb_failed);
+out:
 	spin_unlock(&target_rq->lock);
 }
 
 /*
@@ -2156,18 +2281,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
 	unsigned long old_load, this_load;
 	unsigned long j = jiffies + CPU_OFFSET(this_cpu);
 	struct sched_domain *sd;
+	int i;
 
-	/* Update our load */
-	old_load = this_rq->cpu_load;
 	this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
-	/*
-	 * Round up the averaging division if load is increasing. This
-	 * prevents us from getting stuck on 9 if the load is 10, for
-	 * example.
-	 */
-	if (this_load > old_load)
-		old_load++;
-	this_rq->cpu_load = (old_load + this_load) / 2;
+	/* Update our load */
+	for (i = 0; i < 3; i++) {
+		unsigned long new_load = this_load;
+		int scale = 1 << i;
+		old_load = this_rq->cpu_load[i];
+		/*
+		 * Round up the averaging division if load is increasing. This
+		 * prevents us from getting stuck on 9 if the load is 10, for
+		 * example.
+		 */
+		if (new_load > old_load)
+			new_load += scale-1;
+		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
+	}
 
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
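To make the new averaging concrete (arithmetic only, values illustrative): for i == 2 the scale is 4, so each tick computes cpu_load[2] = (old*3 + new)/4, closing a quarter of the gap to the instantaneous load. Starting from old == 0 with a steady new == 10*SCHED_LOAD_SCALE == 1280, the rounded-up sequence runs 320, 560, 740, ... and settles at 1280; the "new_load += scale-1" round-up is what lets it actually reach the target instead of stalling one unit short. Index 0 keeps scale == 1 and therefore tracks the instantaneous load exactly, which is why source_load()/target_load() treat type 0 specially.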
@@ -2447,11 +2577,15 @@ out:
 #ifdef CONFIG_SCHED_SMT
 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 {
-	struct sched_domain *sd = this_rq->sd;
+	struct sched_domain *tmp, *sd = NULL;
 	cpumask_t sibling_map;
 	int i;
 
-	if (!(sd->flags & SD_SHARE_CPUPOWER))
+	for_each_domain(this_cpu, tmp)
+		if (tmp->flags & SD_SHARE_CPUPOWER)
+			sd = tmp;
+
+	if (!sd)
 		return;
 
 	/*
@@ -2492,13 +2626,17 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 
 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
 {
-	struct sched_domain *sd = this_rq->sd;
+	struct sched_domain *tmp, *sd = NULL;
 	cpumask_t sibling_map;
 	prio_array_t *array;
 	int ret = 0, i;
 	task_t *p;
 
-	if (!(sd->flags & SD_SHARE_CPUPOWER))
+	for_each_domain(this_cpu, tmp)
+		if (tmp->flags & SD_SHARE_CPUPOWER)
+			sd = tmp;
+
+	if (!sd)
 		return 0;
 
 	/*
@@ -2576,7 +2714,7 @@ void fastcall add_preempt_count(int val)
 	/*
 	 * Underflow?
 	 */
-	BUG_ON(((int)preempt_count() < 0));
+	BUG_ON((preempt_count() < 0));
 	preempt_count() += val;
 	/*
 	 * Spinlock count overflowing soon?
@@ -2613,7 +2751,7 @@ asmlinkage void __sched schedule(void)
 	struct list_head *queue;
 	unsigned long long now;
 	unsigned long run_time;
-	int cpu, idx;
+	int cpu, idx, new_prio;
 
 	/*
 	 * Test if we are atomic. Since do_exit() needs to call into
@@ -2735,9 +2873,14 @@ go_idle:
 		delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
 
 		array = next->array;
-		dequeue_task(next, array);
-		recalc_task_prio(next, next->timestamp + delta);
-		enqueue_task(next, array);
+		new_prio = recalc_task_prio(next, next->timestamp + delta);
+
+		if (unlikely(next->prio != new_prio)) {
+			dequeue_task(next, array);
+			next->prio = new_prio;
+			enqueue_task(next, array);
+		} else
+			requeue_task(next, array);
 	}
 	next->activated = 0;
 switch_tasks:
@@ -2761,11 +2904,15 @@ switch_tasks:
 		rq->curr = next;
 		++*switch_count;
 
-		prepare_arch_switch(rq, next);
+		prepare_task_switch(rq, next);
 		prev = context_switch(rq, prev, next);
 		barrier();
-
-		finish_task_switch(prev);
+		/*
+		 * this_rq must be evaluated again because prev may have moved
+		 * CPUs since it called schedule(), thus the 'rq' on its stack
+		 * frame will be invalid.
+		 */
+		finish_task_switch(this_rq(), prev);
 	} else
 		spin_unlock_irq(&rq->lock);
 
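One scenario that motivates re-evaluating this_rq() here (a reading of the comment above, not new material from the patch): prev blocks on CPU0, and the local rq variable on its stack points at CPU0's runqueue; while it sleeps, the load balancer migrates it to CPU1. When prev is eventually picked there, execution resumes right after context_switch() on CPU1 with the stale CPU0 pointer still in scope, so finishing the switch against that pointer would unlock and account against the wrong runqueue.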
@@ -2869,7 +3016,7 @@ need_resched:
 
 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
 {
-	task_t *p = curr->task;
+	task_t *p = curr->private;
 	return try_to_wake_up(p, mode, sync);
 }
 
@@ -3384,13 +3531,24 @@ recheck:
 	if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
 		return -EINVAL;
 
-	if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
-	    param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur &&
-	    !capable(CAP_SYS_NICE))
-		return -EPERM;
-	if ((current->euid != p->euid) && (current->euid != p->uid) &&
-	    !capable(CAP_SYS_NICE))
-		return -EPERM;
+	/*
+	 * Allow unprivileged RT tasks to decrease priority:
+	 */
+	if (!capable(CAP_SYS_NICE)) {
+		/* can't change policy */
+		if (policy != p->policy)
+			return -EPERM;
+		/* can't increase priority */
+		if (policy != SCHED_NORMAL &&
+		    param->sched_priority > p->rt_priority &&
+		    param->sched_priority >
+				p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
+			return -EPERM;
+		/* can't change other user's priorities */
+		if ((current->euid != p->euid) &&
+		    (current->euid != p->uid))
+			return -EPERM;
+	}
 
 	retval = security_task_setscheduler(p, policy, param);
 	if (retval)
@@ -3755,19 +3913,22 @@ EXPORT_SYMBOL(cond_resched);
  */
 int cond_resched_lock(spinlock_t * lock)
 {
+	int ret = 0;
+
 	if (need_lockbreak(lock)) {
 		spin_unlock(lock);
 		cpu_relax();
+		ret = 1;
 		spin_lock(lock);
 	}
 	if (need_resched()) {
 		_raw_spin_unlock(lock);
 		preempt_enable_no_resched();
 		__cond_resched();
+		ret = 1;
 		spin_lock(lock);
-		return 1;
 	}
-	return 0;
+	return ret;
 }
 
 EXPORT_SYMBOL(cond_resched_lock);
@@ -3811,7 +3972,7 @@ EXPORT_SYMBOL(yield);
  */
 void __sched io_schedule(void)
 {
-	struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());
+	struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
 
 	atomic_inc(&rq->nr_iowait);
 	schedule();
@@ -3822,7 +3983,7 @@ EXPORT_SYMBOL(io_schedule);
 
 long __sched io_schedule_timeout(long timeout)
 {
-	struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());
+	struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
 	long ret;
 
 	atomic_inc(&rq->nr_iowait);
@@ -4027,6 +4188,9 @@ void __devinit init_idle(task_t *idle, int cpu)
 
 	spin_lock_irqsave(&rq->lock, flags);
 	rq->curr = rq->idle = idle;
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+	idle->oncpu = 1;
+#endif
 	set_tsk_need_resched(idle);
 	spin_unlock_irqrestore(&rq->lock, flags);
 
@@ -4171,8 +4335,7 @@ static int migration_thread(void * data)
 		struct list_head *head;
 		migration_req_t *req;
 
-		if (current->flags & PF_FREEZE)
-			refrigerator(PF_FREEZE);
+		try_to_freeze();
 
 		spin_lock_irq(&rq->lock);
 
@@ -4197,17 +4360,9 @@ static int migration_thread(void * data)
 		req = list_entry(head->next, migration_req_t, list);
 		list_del_init(head->next);
 
-		if (req->type == REQ_MOVE_TASK) {
-			spin_unlock(&rq->lock);
-			__migrate_task(req->task, cpu, req->dest_cpu);
-			local_irq_enable();
-		} else if (req->type == REQ_SET_DOMAIN) {
-			rq->sd = req->sd;
-			spin_unlock_irq(&rq->lock);
-		} else {
-			spin_unlock_irq(&rq->lock);
-			WARN_ON(1);
-		}
+		spin_unlock(&rq->lock);
+		__migrate_task(req->task, cpu, req->dest_cpu);
+		local_irq_enable();
 
 		complete(&req->done);
 	}
@@ -4438,7 +4593,6 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
 			migration_req_t *req;
 			req = list_entry(rq->migration_queue.next,
 					 migration_req_t, list);
-			BUG_ON(req->type != REQ_MOVE_TASK);
 			list_del_init(&req->list);
 			complete(&req->done);
 		}
@@ -4469,12 +4623,17 @@ int __init migration_init(void)
 #endif
 
 #ifdef CONFIG_SMP
-#define SCHED_DOMAIN_DEBUG
+#undef SCHED_DOMAIN_DEBUG
 #ifdef SCHED_DOMAIN_DEBUG
 static void sched_domain_debug(struct sched_domain *sd, int cpu)
 {
 	int level = 0;
 
+	if (!sd) {
+		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+		return;
+	}
+
 	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
 
 	do {
@@ -4557,37 +4716,81 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
4557#define sched_domain_debug(sd, cpu) {} 4716#define sched_domain_debug(sd, cpu) {}
4558#endif 4717#endif
4559 4718
4719static int sd_degenerate(struct sched_domain *sd)
4720{
4721 if (cpus_weight(sd->span) == 1)
4722 return 1;
4723
4724 /* Following flags need at least 2 groups */
4725 if (sd->flags & (SD_LOAD_BALANCE |
4726 SD_BALANCE_NEWIDLE |
4727 SD_BALANCE_FORK |
4728 SD_BALANCE_EXEC)) {
4729 if (sd->groups != sd->groups->next)
4730 return 0;
4731 }
4732
4733 /* Following flags don't use groups */
4734 if (sd->flags & (SD_WAKE_IDLE |
4735 SD_WAKE_AFFINE |
4736 SD_WAKE_BALANCE))
4737 return 0;
4738
4739 return 1;
4740}
4741
4742static int sd_parent_degenerate(struct sched_domain *sd,
4743 struct sched_domain *parent)
4744{
4745 unsigned long cflags = sd->flags, pflags = parent->flags;
4746
4747 if (sd_degenerate(parent))
4748 return 1;
4749
4750 if (!cpus_equal(sd->span, parent->span))
4751 return 0;
4752
4753 /* Does parent contain flags not in child? */
4754 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
4755 if (cflags & SD_WAKE_AFFINE)
4756 pflags &= ~SD_WAKE_BALANCE;
4757 /* Flags needing groups don't count if only 1 group in parent */
4758 if (parent->groups == parent->groups->next) {
4759 pflags &= ~(SD_LOAD_BALANCE |
4760 SD_BALANCE_NEWIDLE |
4761 SD_BALANCE_FORK |
4762 SD_BALANCE_EXEC);
4763 }
4764 if (~cflags & pflags)
4765 return 0;
4766
4767 return 1;
4768}
4769
4560/* 4770/*
4561 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 4771 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
4562 * hold the hotplug lock. 4772 * hold the hotplug lock.
4563 */ 4773 */
4564void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) 4774void cpu_attach_domain(struct sched_domain *sd, int cpu)
4565{ 4775{
4566 migration_req_t req;
4567 unsigned long flags;
4568 runqueue_t *rq = cpu_rq(cpu); 4776 runqueue_t *rq = cpu_rq(cpu);
4569 int local = 1; 4777 struct sched_domain *tmp;
4570
4571 sched_domain_debug(sd, cpu);
4572 4778
4573 spin_lock_irqsave(&rq->lock, flags); 4779 /* Remove the sched domains which do not contribute to scheduling. */
4574 4780 for (tmp = sd; tmp; tmp = tmp->parent) {
4575 if (cpu == smp_processor_id() || !cpu_online(cpu)) { 4781 struct sched_domain *parent = tmp->parent;
4576 rq->sd = sd; 4782 if (!parent)
4577 } else { 4783 break;
4578 init_completion(&req.done); 4784 if (sd_parent_degenerate(tmp, parent))
4579 req.type = REQ_SET_DOMAIN; 4785 tmp->parent = parent->parent;
4580 req.sd = sd;
4581 list_add(&req.list, &rq->migration_queue);
4582 local = 0;
4583 } 4786 }
4584 4787
4585 spin_unlock_irqrestore(&rq->lock, flags); 4788 if (sd && sd_degenerate(sd))
4789 sd = sd->parent;
4586 4790
4587 if (!local) { 4791 sched_domain_debug(sd, cpu);
4588 wake_up_process(rq->migration_thread); 4792
4589 wait_for_completion(&req.done); 4793 rcu_assign_pointer(rq->sd, sd);
4590 }
4591} 4794}
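The loop above splices redundant levels out of the ->parent chain; each store replaces one parent pointer with a still-valid ancestor, so the chain is never broken mid-update. A runnable user-space sketch of the same splice, with a trivial flag standing in for sd_parent_degenerate():

#include <stdio.h>

struct dom { const char *name; int useful; struct dom *parent; };

int main(void)
{
	struct dom node = { "node", 1, NULL };
	struct dom phys = { "phys", 0, &node };	/* contributes nothing */
	struct dom smt  = { "smt",  1, &phys };
	struct dom *tmp;

	/* same shape as the trimming loop in cpu_attach_domain() */
	for (tmp = &smt; tmp; tmp = tmp->parent) {
		struct dom *parent = tmp->parent;
		if (!parent)
			break;
		if (!parent->useful)
			tmp->parent = parent->parent;
	}

	for (tmp = &smt; tmp; tmp = tmp->parent)
		printf("%s\n", tmp->name);	/* smt, node: phys spliced out */
	return 0;
}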
4592 4795
4593/* cpus with isolated domains */ 4796/* cpus with isolated domains */
@@ -4619,7 +4822,7 @@ __setup ("isolcpus=", isolated_cpu_setup);
4619 * covered by the given span, and will set each group's ->cpumask correctly, 4822 * covered by the given span, and will set each group's ->cpumask correctly,
4620 * and ->cpu_power to 0. 4823 * and ->cpu_power to 0.
4621 */ 4824 */
4622void __devinit init_sched_build_groups(struct sched_group groups[], 4825void init_sched_build_groups(struct sched_group groups[],
4623 cpumask_t span, int (*group_fn)(int cpu)) 4826 cpumask_t span, int (*group_fn)(int cpu))
4624{ 4827{
4625 struct sched_group *first = NULL, *last = NULL; 4828 struct sched_group *first = NULL, *last = NULL;
@@ -4655,13 +4858,14 @@ void __devinit init_sched_build_groups(struct sched_group groups[],
4655 4858
4656 4859
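The group_fn callback is the customization point for init_sched_build_groups(): it maps a cpu to the index of the group that should contain it, and the helper builds the circular group list from that. A kernel-style sketch with hypothetical names, mirroring how the real callers below use the interface:

/* Hypothetical: one group per pair of cpus. */
static struct sched_group example_groups[NR_CPUS];

static int cpu_to_example_group(int cpu)
{
	return cpu / 2;		/* pretend two cpus share a group */
}

	/* ...then, with the hotplug lock held: */
	init_sched_build_groups(example_groups, cpu_online_map,
					&cpu_to_example_group);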
4657#ifdef ARCH_HAS_SCHED_DOMAIN 4860#ifdef ARCH_HAS_SCHED_DOMAIN
4658extern void __devinit arch_init_sched_domains(void); 4861extern void build_sched_domains(const cpumask_t *cpu_map);
4659extern void __devinit arch_destroy_sched_domains(void); 4862extern void arch_init_sched_domains(const cpumask_t *cpu_map);
4863extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
4660#else 4864#else
4661#ifdef CONFIG_SCHED_SMT 4865#ifdef CONFIG_SCHED_SMT
4662static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 4866static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
4663static struct sched_group sched_group_cpus[NR_CPUS]; 4867static struct sched_group sched_group_cpus[NR_CPUS];
4664static int __devinit cpu_to_cpu_group(int cpu) 4868static int cpu_to_cpu_group(int cpu)
4665{ 4869{
4666 return cpu; 4870 return cpu;
4667} 4871}
@@ -4669,7 +4873,7 @@ static int __devinit cpu_to_cpu_group(int cpu)
4669 4873
4670static DEFINE_PER_CPU(struct sched_domain, phys_domains); 4874static DEFINE_PER_CPU(struct sched_domain, phys_domains);
4671static struct sched_group sched_group_phys[NR_CPUS]; 4875static struct sched_group sched_group_phys[NR_CPUS];
4672static int __devinit cpu_to_phys_group(int cpu) 4876static int cpu_to_phys_group(int cpu)
4673{ 4877{
4674#ifdef CONFIG_SCHED_SMT 4878#ifdef CONFIG_SCHED_SMT
4675 return first_cpu(cpu_sibling_map[cpu]); 4879 return first_cpu(cpu_sibling_map[cpu]);
@@ -4682,7 +4886,7 @@ static int __devinit cpu_to_phys_group(int cpu)
4682 4886
4683static DEFINE_PER_CPU(struct sched_domain, node_domains); 4887static DEFINE_PER_CPU(struct sched_domain, node_domains);
4684static struct sched_group sched_group_nodes[MAX_NUMNODES]; 4888static struct sched_group sched_group_nodes[MAX_NUMNODES];
4685static int __devinit cpu_to_node_group(int cpu) 4889static int cpu_to_node_group(int cpu)
4686{ 4890{
4687 return cpu_to_node(cpu); 4891 return cpu_to_node(cpu);
4688} 4892}
@@ -4713,39 +4917,28 @@ static void check_sibling_maps(void)
4713#endif 4917#endif
4714 4918
4715/* 4919/*
4716 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 4920 * Build sched domains for a given set of cpus and attach the sched domains
4921 * to the individual cpus.
4717 */ 4922 */
4718static void __devinit arch_init_sched_domains(void) 4923static void build_sched_domains(const cpumask_t *cpu_map)
4719{ 4924{
4720 int i; 4925 int i;
4721 cpumask_t cpu_default_map;
4722
4723#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4724 check_sibling_maps();
4725#endif
4726 /*
4727 * Setup mask for cpus without special case scheduling requirements.
4728 * For now this just excludes isolated cpus, but could be used to
4729 * exclude other special cases in the future.
4730 */
4731 cpus_complement(cpu_default_map, cpu_isolated_map);
4732 cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
4733 4926
4734 /* 4927 /*
4735 * Set up domains. Isolated domains just stay on the dummy domain. 4928 * Set up domains for cpus specified by the cpu_map.
4736 */ 4929 */
4737 for_each_cpu_mask(i, cpu_default_map) { 4930 for_each_cpu_mask(i, *cpu_map) {
4738 int group; 4931 int group;
4739 struct sched_domain *sd = NULL, *p; 4932 struct sched_domain *sd = NULL, *p;
4740 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); 4933 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
4741 4934
4742 cpus_and(nodemask, nodemask, cpu_default_map); 4935 cpus_and(nodemask, nodemask, *cpu_map);
4743 4936
4744#ifdef CONFIG_NUMA 4937#ifdef CONFIG_NUMA
4745 sd = &per_cpu(node_domains, i); 4938 sd = &per_cpu(node_domains, i);
4746 group = cpu_to_node_group(i); 4939 group = cpu_to_node_group(i);
4747 *sd = SD_NODE_INIT; 4940 *sd = SD_NODE_INIT;
4748 sd->span = cpu_default_map; 4941 sd->span = *cpu_map;
4749 sd->groups = &sched_group_nodes[group]; 4942 sd->groups = &sched_group_nodes[group];
4750#endif 4943#endif
4751 4944
@@ -4763,7 +4956,7 @@ static void __devinit arch_init_sched_domains(void)
4763 group = cpu_to_cpu_group(i); 4956 group = cpu_to_cpu_group(i);
4764 *sd = SD_SIBLING_INIT; 4957 *sd = SD_SIBLING_INIT;
4765 sd->span = cpu_sibling_map[i]; 4958 sd->span = cpu_sibling_map[i];
4766 cpus_and(sd->span, sd->span, cpu_default_map); 4959 cpus_and(sd->span, sd->span, *cpu_map);
4767 sd->parent = p; 4960 sd->parent = p;
4768 sd->groups = &sched_group_cpus[group]; 4961 sd->groups = &sched_group_cpus[group];
4769#endif 4962#endif
@@ -4773,7 +4966,7 @@ static void __devinit arch_init_sched_domains(void)
4773 /* Set up CPU (sibling) groups */ 4966 /* Set up CPU (sibling) groups */
4774 for_each_online_cpu(i) { 4967 for_each_online_cpu(i) {
4775 cpumask_t this_sibling_map = cpu_sibling_map[i]; 4968 cpumask_t this_sibling_map = cpu_sibling_map[i];
4776 cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); 4969 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
4777 if (i != first_cpu(this_sibling_map)) 4970 if (i != first_cpu(this_sibling_map))
4778 continue; 4971 continue;
4779 4972
@@ -4786,7 +4979,7 @@ static void __devinit arch_init_sched_domains(void)
4786 for (i = 0; i < MAX_NUMNODES; i++) { 4979 for (i = 0; i < MAX_NUMNODES; i++) {
4787 cpumask_t nodemask = node_to_cpumask(i); 4980 cpumask_t nodemask = node_to_cpumask(i);
4788 4981
4789 cpus_and(nodemask, nodemask, cpu_default_map); 4982 cpus_and(nodemask, nodemask, *cpu_map);
4790 if (cpus_empty(nodemask)) 4983 if (cpus_empty(nodemask))
4791 continue; 4984 continue;
4792 4985
@@ -4796,12 +4989,12 @@ static void __devinit arch_init_sched_domains(void)
4796 4989
4797#ifdef CONFIG_NUMA 4990#ifdef CONFIG_NUMA
4798 /* Set up node groups */ 4991 /* Set up node groups */
4799 init_sched_build_groups(sched_group_nodes, cpu_default_map, 4992 init_sched_build_groups(sched_group_nodes, *cpu_map,
4800 &cpu_to_node_group); 4993 &cpu_to_node_group);
4801#endif 4994#endif
4802 4995
4803 /* Calculate CPU power for physical packages and nodes */ 4996 /* Calculate CPU power for physical packages and nodes */
4804 for_each_cpu_mask(i, cpu_default_map) { 4997 for_each_cpu_mask(i, *cpu_map) {
4805 int power; 4998 int power;
4806 struct sched_domain *sd; 4999 struct sched_domain *sd;
4807#ifdef CONFIG_SCHED_SMT 5000#ifdef CONFIG_SCHED_SMT
@@ -4825,7 +5018,7 @@ static void __devinit arch_init_sched_domains(void)
4825 } 5018 }
4826 5019
4827 /* Attach the domains */ 5020 /* Attach the domains */
4828 for_each_online_cpu(i) { 5021 for_each_cpu_mask(i, *cpu_map) {
4829 struct sched_domain *sd; 5022 struct sched_domain *sd;
4830#ifdef CONFIG_SCHED_SMT 5023#ifdef CONFIG_SCHED_SMT
4831 sd = &per_cpu(cpu_domains, i); 5024 sd = &per_cpu(cpu_domains, i);
@@ -4835,41 +5028,85 @@ static void __devinit arch_init_sched_domains(void)
4835 cpu_attach_domain(sd, i); 5028 cpu_attach_domain(sd, i);
4836 } 5029 }
4837} 5030}
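Because build_sched_domains() now takes an explicit cpu_map instead of assuming cpu_online_map, a topology can be built over any subset of cpus, which is what makes the partitioning further down possible. A kernel-style sketch of such a call (hypothetical, hotplug locking elided):

	/* Hypothetical: rebuild domains covering only cpus 0-3. */
	cpumask_t map = CPU_MASK_NONE;
	int cpu;

	for (cpu = 0; cpu < 4; cpu++)
		cpu_set(cpu, map);
	build_sched_domains(&map);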
5031/*
5032 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
5033 */
5034static void arch_init_sched_domains(const cpumask_t *cpu_map)
5035{
5036 cpumask_t cpu_default_map;
5037
5038#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
5039 check_sibling_maps();
5040#endif
5041 /*
5042 * Setup mask for cpus without special case scheduling requirements.
5043 * For now this just excludes isolated cpus, but could be used to
5044 * exclude other special cases in the future.
5045 */
5046 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
5047
5048 build_sched_domains(&cpu_default_map);
5049}
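cpus_andnot() collapses the old complement-then-and pair into one step; per bit it computes src1 & ~src2. A runnable illustration with plain words standing in for cpumasks:

#include <stdio.h>

int main(void)
{
	unsigned long online   = 0xff;	/* cpus 0-7 online (illustrative) */
	unsigned long isolated = 0x0c;	/* cpus 2-3 isolated via isolcpus= */

	/* what cpus_andnot(def, online, isolated) computes, bit by bit */
	unsigned long def = online & ~isolated;

	printf("default map: %#lx\n", def);	/* 0xf3 */
	return 0;
}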
4838 5050
4839#ifdef CONFIG_HOTPLUG_CPU 5051static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
4840static void __devinit arch_destroy_sched_domains(void)
4841{ 5052{
4842 /* Do nothing: everything is statically allocated. */ 5053 /* Do nothing: everything is statically allocated. */
4843} 5054}
4844#endif
4845 5055
4846#endif /* ARCH_HAS_SCHED_DOMAIN */ 5056#endif /* ARCH_HAS_SCHED_DOMAIN */
4847 5057
4848/* 5058/*
4849 * Initial dummy domain for early boot and for hotplug cpu. Being static, 5059 * Detach sched domains from a group of cpus specified in cpu_map.
4850 * it is initialized to zero, so all balancing flags are cleared which is 5060 * These cpus will now be attached to the NULL domain.
4851 * what we want.
4852 */ 5061 */
4853static struct sched_domain sched_domain_dummy; 5062static inline void detach_destroy_domains(const cpumask_t *cpu_map)
5063{
5064 int i;
5065
5066 for_each_cpu_mask(i, *cpu_map)
5067 cpu_attach_domain(NULL, i);
5068 synchronize_sched();
5069 arch_destroy_sched_domains(cpu_map);
5070}
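The ordering in detach_destroy_domains() is the classic RCU removal pattern: unpublish the pointer, wait out every pre-existing preempt-disabled reader, and only then reclaim. Unrolled for a single cpu, the net effect is roughly the following sketch (step 3 is a no-op here, since everything is statically allocated):

	rcu_assign_pointer(cpu_rq(i)->sd, NULL);	/* 1. unpublish, via cpu_attach_domain(NULL, i) */
	synchronize_sched();				/* 2. no reader can still walk the old tree */
	arch_destroy_sched_domains(cpu_map);		/* 3. reclaim would go here if anything were dynamic */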
5071
5072/*
5073 * Partition sched domains as specified by the cpumasks below.
5074 * This attaches all cpus from the cpumasks to the NULL domain,
5075 * waits for an RCU quiescent period, recalculates sched
5076 * domain information and then attaches them back to the
5077 * correct sched domains.
5078 * Call with the hotplug lock held.
5079 */
5080void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
5081{
5082 cpumask_t change_map;
5083
5084 cpus_and(*partition1, *partition1, cpu_online_map);
5085 cpus_and(*partition2, *partition2, cpu_online_map);
5086 cpus_or(change_map, *partition1, *partition2);
5087
5088 /* Detach sched domains from all of the affected cpus */
5089 detach_destroy_domains(&change_map);
5090 if (!cpus_empty(*partition1))
5091 build_sched_domains(partition1);
5092 if (!cpus_empty(*partition2))
5093 build_sched_domains(partition2);
5094}
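A sketch of a hypothetical caller, carving the online cpus into two independently balanced partitions; per the comment above, the hotplug lock is taken around the call:

	/* Hypothetical: cpus 0-1 balance among themselves, the rest
	 * of the online cpus form the second partition. */
	cpumask_t a = CPU_MASK_NONE, b;

	cpu_set(0, a);
	cpu_set(1, a);
	cpus_andnot(b, cpu_online_map, a);

	lock_cpu_hotplug();
	partition_sched_domains(&a, &b);
	unlock_cpu_hotplug();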
4854 5095
4855#ifdef CONFIG_HOTPLUG_CPU 5096#ifdef CONFIG_HOTPLUG_CPU
4856/* 5097/*
4857 * Force a reinitialization of the sched domains hierarchy. The domains 5098 * Force a reinitialization of the sched domains hierarchy. The domains
4858 * and groups cannot be updated in place without racing with the balancing 5099 * and groups cannot be updated in place without racing with the balancing
4859 * code, so we temporarily attach all running cpus to a "dummy" domain, 5100 * code, so we temporarily attach all running cpus to the NULL domain,
4860 * which will prevent rebalancing while the sched domains are recalculated. 5101 * which will prevent rebalancing while the sched domains are recalculated.
4861 */ 5102 */
4862static int update_sched_domains(struct notifier_block *nfb, 5103static int update_sched_domains(struct notifier_block *nfb,
4863 unsigned long action, void *hcpu) 5104 unsigned long action, void *hcpu)
4864{ 5105{
4865 int i;
4866
4867 switch (action) { 5106 switch (action) {
4868 case CPU_UP_PREPARE: 5107 case CPU_UP_PREPARE:
4869 case CPU_DOWN_PREPARE: 5108 case CPU_DOWN_PREPARE:
4870 for_each_online_cpu(i) 5109 detach_destroy_domains(&cpu_online_map);
4871 cpu_attach_domain(&sched_domain_dummy, i);
4872 arch_destroy_sched_domains();
4873 return NOTIFY_OK; 5110 return NOTIFY_OK;
4874 5111
4875 case CPU_UP_CANCELED: 5112 case CPU_UP_CANCELED:
@@ -4885,7 +5122,7 @@ static int update_sched_domains(struct notifier_block *nfb,
4885 } 5122 }
4886 5123
4887 /* The hotplug lock is already held by cpu_up/cpu_down */ 5124 /* The hotplug lock is already held by cpu_up/cpu_down */
4888 arch_init_sched_domains(); 5125 arch_init_sched_domains(&cpu_online_map);
4889 5126
4890 return NOTIFY_OK; 5127 return NOTIFY_OK;
4891} 5128}
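The notifier is registered once from sched_init_smp() below via hotcpu_notifier(); that macro amounts to roughly the following (a sketch, assuming the usual notifier_block plumbing from include/linux/cpu.h):

	static struct notifier_block update_sched_domains_nb = {
		.notifier_call	= update_sched_domains,
		.priority	= 0,
	};
	register_cpu_notifier(&update_sched_domains_nb);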
@@ -4894,7 +5131,7 @@ static int update_sched_domains(struct notifier_block *nfb,
4894void __init sched_init_smp(void) 5131void __init sched_init_smp(void)
4895{ 5132{
4896 lock_cpu_hotplug(); 5133 lock_cpu_hotplug();
4897 arch_init_sched_domains(); 5134 arch_init_sched_domains(&cpu_online_map);
4898 unlock_cpu_hotplug(); 5135 unlock_cpu_hotplug();
4899 /* XXX: Theoretical race here - CPU may be hotplugged now */ 5136 /* XXX: Theoretical race here - CPU may be hotplugged now */
4900 hotcpu_notifier(update_sched_domains, 0); 5137 hotcpu_notifier(update_sched_domains, 0);
@@ -4924,13 +5161,15 @@ void __init sched_init(void)
4924 5161
4925 rq = cpu_rq(i); 5162 rq = cpu_rq(i);
4926 spin_lock_init(&rq->lock); 5163 spin_lock_init(&rq->lock);
5164 rq->nr_running = 0;
4927 rq->active = rq->arrays; 5165 rq->active = rq->arrays;
4928 rq->expired = rq->arrays + 1; 5166 rq->expired = rq->arrays + 1;
4929 rq->best_expired_prio = MAX_PRIO; 5167 rq->best_expired_prio = MAX_PRIO;
4930 5168
4931#ifdef CONFIG_SMP 5169#ifdef CONFIG_SMP
4932 rq->sd = &sched_domain_dummy; 5170 rq->sd = NULL;
4933 rq->cpu_load = 0; 5171 for (j = 0; j < 3; j++)
5172 rq->cpu_load[j] = 0;
4934 rq->active_balance = 0; 5173 rq->active_balance = 0;
4935 rq->push_cpu = 0; 5174 rq->push_cpu = 0;
4936 rq->migration_thread = NULL; 5175 rq->migration_thread = NULL;