diff options
Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 1073 |
1 files changed, 654 insertions, 419 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index f12a0c8a7d98..5f2182d42241 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -166,7 +166,7 @@ | |||
166 | #define SCALE_PRIO(x, prio) \ | 166 | #define SCALE_PRIO(x, prio) \ |
167 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) | 167 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) |
168 | 168 | ||
169 | static inline unsigned int task_timeslice(task_t *p) | 169 | static unsigned int task_timeslice(task_t *p) |
170 | { | 170 | { |
171 | if (p->static_prio < NICE_TO_PRIO(0)) | 171 | if (p->static_prio < NICE_TO_PRIO(0)) |
172 | return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); | 172 | return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); |
@@ -206,7 +206,7 @@ struct runqueue { | |||
206 | */ | 206 | */ |
207 | unsigned long nr_running; | 207 | unsigned long nr_running; |
208 | #ifdef CONFIG_SMP | 208 | #ifdef CONFIG_SMP |
209 | unsigned long cpu_load; | 209 | unsigned long cpu_load[3]; |
210 | #endif | 210 | #endif |
211 | unsigned long long nr_switches; | 211 | unsigned long long nr_switches; |
212 | 212 | ||
@@ -260,23 +260,87 @@ struct runqueue { | |||
260 | 260 | ||
261 | static DEFINE_PER_CPU(struct runqueue, runqueues); | 261 | static DEFINE_PER_CPU(struct runqueue, runqueues); |
262 | 262 | ||
263 | /* | ||
264 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | ||
265 | * See detach_destroy_domains: synchronize_sched for details. | ||
266 | * | ||
267 | * The domain tree of any CPU may only be accessed from within | ||
268 | * preempt-disabled sections. | ||
269 | */ | ||
263 | #define for_each_domain(cpu, domain) \ | 270 | #define for_each_domain(cpu, domain) \ |
264 | for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) | 271 | for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) |
265 | 272 | ||
266 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | 273 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) |
267 | #define this_rq() (&__get_cpu_var(runqueues)) | 274 | #define this_rq() (&__get_cpu_var(runqueues)) |
268 | #define task_rq(p) cpu_rq(task_cpu(p)) | 275 | #define task_rq(p) cpu_rq(task_cpu(p)) |
269 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 276 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
270 | 277 | ||
271 | /* | ||
272 | * Default context-switch locking: | ||
273 | */ | ||
274 | #ifndef prepare_arch_switch | 278 | #ifndef prepare_arch_switch |
275 | # define prepare_arch_switch(rq, next) do { } while (0) | 279 | # define prepare_arch_switch(next) do { } while (0) |
276 | # define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) | 280 | #endif |
277 | # define task_running(rq, p) ((rq)->curr == (p)) | 281 | #ifndef finish_arch_switch |
282 | # define finish_arch_switch(prev) do { } while (0) | ||
278 | #endif | 283 | #endif |
279 | 284 | ||
285 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
286 | static inline int task_running(runqueue_t *rq, task_t *p) | ||
287 | { | ||
288 | return rq->curr == p; | ||
289 | } | ||
290 | |||
291 | static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) | ||
292 | { | ||
293 | } | ||
294 | |||
295 | static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) | ||
296 | { | ||
297 | spin_unlock_irq(&rq->lock); | ||
298 | } | ||
299 | |||
300 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
301 | static inline int task_running(runqueue_t *rq, task_t *p) | ||
302 | { | ||
303 | #ifdef CONFIG_SMP | ||
304 | return p->oncpu; | ||
305 | #else | ||
306 | return rq->curr == p; | ||
307 | #endif | ||
308 | } | ||
309 | |||
310 | static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) | ||
311 | { | ||
312 | #ifdef CONFIG_SMP | ||
313 | /* | ||
314 | * We can optimise this out completely for !SMP, because the | ||
315 | * SMP rebalancing from interrupt is the only thing that cares | ||
316 | * here. | ||
317 | */ | ||
318 | next->oncpu = 1; | ||
319 | #endif | ||
320 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
321 | spin_unlock_irq(&rq->lock); | ||
322 | #else | ||
323 | spin_unlock(&rq->lock); | ||
324 | #endif | ||
325 | } | ||
326 | |||
327 | static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) | ||
328 | { | ||
329 | #ifdef CONFIG_SMP | ||
330 | /* | ||
331 | * After ->oncpu is cleared, the task can be moved to a different CPU. | ||
332 | * We must ensure this doesn't happen until the switch is completely | ||
333 | * finished. | ||
334 | */ | ||
335 | smp_wmb(); | ||
336 | prev->oncpu = 0; | ||
337 | #endif | ||
338 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
339 | local_irq_enable(); | ||
340 | #endif | ||
341 | } | ||
342 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
343 | |||
280 | /* | 344 | /* |
281 | * task_rq_lock - lock the runqueue a given task resides on and disable | 345 | * task_rq_lock - lock the runqueue a given task resides on and disable |
282 | * interrupts. Note the ordering: we can safely lookup the task_rq without | 346 | * interrupts. Note the ordering: we can safely lookup the task_rq without |
@@ -309,7 +373,7 @@ static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) | |||
309 | * bump this up when changing the output format or the meaning of an existing | 373 | * bump this up when changing the output format or the meaning of an existing |
310 | * format, so that tools can adapt (or abort) | 374 | * format, so that tools can adapt (or abort) |
311 | */ | 375 | */ |
312 | #define SCHEDSTAT_VERSION 11 | 376 | #define SCHEDSTAT_VERSION 12 |
313 | 377 | ||
314 | static int show_schedstat(struct seq_file *seq, void *v) | 378 | static int show_schedstat(struct seq_file *seq, void *v) |
315 | { | 379 | { |
@@ -338,6 +402,7 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
338 | 402 | ||
339 | #ifdef CONFIG_SMP | 403 | #ifdef CONFIG_SMP |
340 | /* domain-specific stats */ | 404 | /* domain-specific stats */ |
405 | preempt_disable(); | ||
341 | for_each_domain(cpu, sd) { | 406 | for_each_domain(cpu, sd) { |
342 | enum idle_type itype; | 407 | enum idle_type itype; |
343 | char mask_str[NR_CPUS]; | 408 | char mask_str[NR_CPUS]; |
@@ -356,11 +421,13 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
356 | sd->lb_nobusyq[itype], | 421 | sd->lb_nobusyq[itype], |
357 | sd->lb_nobusyg[itype]); | 422 | sd->lb_nobusyg[itype]); |
358 | } | 423 | } |
359 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n", | 424 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", |
360 | sd->alb_cnt, sd->alb_failed, sd->alb_pushed, | 425 | sd->alb_cnt, sd->alb_failed, sd->alb_pushed, |
361 | sd->sbe_pushed, sd->sbe_attempts, | 426 | sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, |
427 | sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, | ||
362 | sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); | 428 | sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); |
363 | } | 429 | } |
430 | preempt_enable(); | ||
364 | #endif | 431 | #endif |
365 | } | 432 | } |
366 | return 0; | 433 | return 0; |
@@ -414,22 +481,6 @@ static inline runqueue_t *this_rq_lock(void) | |||
414 | return rq; | 481 | return rq; |
415 | } | 482 | } |
416 | 483 | ||
417 | #ifdef CONFIG_SCHED_SMT | ||
418 | static int cpu_and_siblings_are_idle(int cpu) | ||
419 | { | ||
420 | int sib; | ||
421 | for_each_cpu_mask(sib, cpu_sibling_map[cpu]) { | ||
422 | if (idle_cpu(sib)) | ||
423 | continue; | ||
424 | return 0; | ||
425 | } | ||
426 | |||
427 | return 1; | ||
428 | } | ||
429 | #else | ||
430 | #define cpu_and_siblings_are_idle(A) idle_cpu(A) | ||
431 | #endif | ||
432 | |||
433 | #ifdef CONFIG_SCHEDSTATS | 484 | #ifdef CONFIG_SCHEDSTATS |
434 | /* | 485 | /* |
435 | * Called when a process is dequeued from the active array and given | 486 | * Called when a process is dequeued from the active array and given |
@@ -622,7 +673,7 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq) | |||
622 | rq->nr_running++; | 673 | rq->nr_running++; |
623 | } | 674 | } |
624 | 675 | ||
625 | static void recalc_task_prio(task_t *p, unsigned long long now) | 676 | static int recalc_task_prio(task_t *p, unsigned long long now) |
626 | { | 677 | { |
627 | /* Caller must always ensure 'now >= p->timestamp' */ | 678 | /* Caller must always ensure 'now >= p->timestamp' */ |
628 | unsigned long long __sleep_time = now - p->timestamp; | 679 | unsigned long long __sleep_time = now - p->timestamp; |
@@ -681,7 +732,7 @@ static void recalc_task_prio(task_t *p, unsigned long long now) | |||
681 | } | 732 | } |
682 | } | 733 | } |
683 | 734 | ||
684 | p->prio = effective_prio(p); | 735 | return effective_prio(p); |
685 | } | 736 | } |
686 | 737 | ||
687 | /* | 738 | /* |
@@ -704,7 +755,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) | |||
704 | } | 755 | } |
705 | #endif | 756 | #endif |
706 | 757 | ||
707 | recalc_task_prio(p, now); | 758 | p->prio = recalc_task_prio(p, now); |
708 | 759 | ||
709 | /* | 760 | /* |
710 | * This checks to make sure it's not an uninterruptible task | 761 | * This checks to make sure it's not an uninterruptible task |
@@ -782,22 +833,12 @@ inline int task_curr(const task_t *p) | |||
782 | } | 833 | } |
783 | 834 | ||
784 | #ifdef CONFIG_SMP | 835 | #ifdef CONFIG_SMP |
785 | enum request_type { | ||
786 | REQ_MOVE_TASK, | ||
787 | REQ_SET_DOMAIN, | ||
788 | }; | ||
789 | |||
790 | typedef struct { | 836 | typedef struct { |
791 | struct list_head list; | 837 | struct list_head list; |
792 | enum request_type type; | ||
793 | 838 | ||
794 | /* For REQ_MOVE_TASK */ | ||
795 | task_t *task; | 839 | task_t *task; |
796 | int dest_cpu; | 840 | int dest_cpu; |
797 | 841 | ||
798 | /* For REQ_SET_DOMAIN */ | ||
799 | struct sched_domain *sd; | ||
800 | |||
801 | struct completion done; | 842 | struct completion done; |
802 | } migration_req_t; | 843 | } migration_req_t; |
803 | 844 | ||
@@ -819,7 +860,6 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) | |||
819 | } | 860 | } |
820 | 861 | ||
821 | init_completion(&req->done); | 862 | init_completion(&req->done); |
822 | req->type = REQ_MOVE_TASK; | ||
823 | req->task = p; | 863 | req->task = p; |
824 | req->dest_cpu = dest_cpu; | 864 | req->dest_cpu = dest_cpu; |
825 | list_add(&req->list, &rq->migration_queue); | 865 | list_add(&req->list, &rq->migration_queue); |
@@ -886,26 +926,154 @@ void kick_process(task_t *p) | |||
886 | * We want to under-estimate the load of migration sources, to | 926 | * We want to under-estimate the load of migration sources, to |
887 | * balance conservatively. | 927 | * balance conservatively. |
888 | */ | 928 | */ |
889 | static inline unsigned long source_load(int cpu) | 929 | static inline unsigned long source_load(int cpu, int type) |
890 | { | 930 | { |
891 | runqueue_t *rq = cpu_rq(cpu); | 931 | runqueue_t *rq = cpu_rq(cpu); |
892 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 932 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; |
933 | if (type == 0) | ||
934 | return load_now; | ||
893 | 935 | ||
894 | return min(rq->cpu_load, load_now); | 936 | return min(rq->cpu_load[type-1], load_now); |
895 | } | 937 | } |
896 | 938 | ||
897 | /* | 939 | /* |
898 | * Return a high guess at the load of a migration-target cpu | 940 | * Return a high guess at the load of a migration-target cpu |
899 | */ | 941 | */ |
900 | static inline unsigned long target_load(int cpu) | 942 | static inline unsigned long target_load(int cpu, int type) |
901 | { | 943 | { |
902 | runqueue_t *rq = cpu_rq(cpu); | 944 | runqueue_t *rq = cpu_rq(cpu); |
903 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 945 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; |
946 | if (type == 0) | ||
947 | return load_now; | ||
904 | 948 | ||
905 | return max(rq->cpu_load, load_now); | 949 | return max(rq->cpu_load[type-1], load_now); |
906 | } | 950 | } |
907 | 951 | ||
908 | #endif | 952 | /* |
953 | * find_idlest_group finds and returns the least busy CPU group within the | ||
954 | * domain. | ||
955 | */ | ||
956 | static struct sched_group * | ||
957 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | ||
958 | { | ||
959 | struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; | ||
960 | unsigned long min_load = ULONG_MAX, this_load = 0; | ||
961 | int load_idx = sd->forkexec_idx; | ||
962 | int imbalance = 100 + (sd->imbalance_pct-100)/2; | ||
963 | |||
964 | do { | ||
965 | unsigned long load, avg_load; | ||
966 | int local_group; | ||
967 | int i; | ||
968 | |||
969 | local_group = cpu_isset(this_cpu, group->cpumask); | ||
970 | /* XXX: put a cpus allowed check */ | ||
971 | |||
972 | /* Tally up the load of all CPUs in the group */ | ||
973 | avg_load = 0; | ||
974 | |||
975 | for_each_cpu_mask(i, group->cpumask) { | ||
976 | /* Bias balancing toward cpus of our domain */ | ||
977 | if (local_group) | ||
978 | load = source_load(i, load_idx); | ||
979 | else | ||
980 | load = target_load(i, load_idx); | ||
981 | |||
982 | avg_load += load; | ||
983 | } | ||
984 | |||
985 | /* Adjust by relative CPU power of the group */ | ||
986 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; | ||
987 | |||
988 | if (local_group) { | ||
989 | this_load = avg_load; | ||
990 | this = group; | ||
991 | } else if (avg_load < min_load) { | ||
992 | min_load = avg_load; | ||
993 | idlest = group; | ||
994 | } | ||
995 | group = group->next; | ||
996 | } while (group != sd->groups); | ||
997 | |||
998 | if (!idlest || 100*this_load < imbalance*min_load) | ||
999 | return NULL; | ||
1000 | return idlest; | ||
1001 | } | ||
1002 | |||
1003 | /* | ||
1004 | * find_idlest_queue - find the idlest runqueue among the cpus in group. | ||
1005 | */ | ||
1006 | static int find_idlest_cpu(struct sched_group *group, int this_cpu) | ||
1007 | { | ||
1008 | unsigned long load, min_load = ULONG_MAX; | ||
1009 | int idlest = -1; | ||
1010 | int i; | ||
1011 | |||
1012 | for_each_cpu_mask(i, group->cpumask) { | ||
1013 | load = source_load(i, 0); | ||
1014 | |||
1015 | if (load < min_load || (load == min_load && i == this_cpu)) { | ||
1016 | min_load = load; | ||
1017 | idlest = i; | ||
1018 | } | ||
1019 | } | ||
1020 | |||
1021 | return idlest; | ||
1022 | } | ||
1023 | |||
1024 | /* | ||
1025 | * sched_balance_self: balance the current task (running on cpu) in domains | ||
1026 | * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and | ||
1027 | * SD_BALANCE_EXEC. | ||
1028 | * | ||
1029 | * Balance, ie. select the least loaded group. | ||
1030 | * | ||
1031 | * Returns the target CPU number, or the same CPU if no balancing is needed. | ||
1032 | * | ||
1033 | * preempt must be disabled. | ||
1034 | */ | ||
1035 | static int sched_balance_self(int cpu, int flag) | ||
1036 | { | ||
1037 | struct task_struct *t = current; | ||
1038 | struct sched_domain *tmp, *sd = NULL; | ||
1039 | |||
1040 | for_each_domain(cpu, tmp) | ||
1041 | if (tmp->flags & flag) | ||
1042 | sd = tmp; | ||
1043 | |||
1044 | while (sd) { | ||
1045 | cpumask_t span; | ||
1046 | struct sched_group *group; | ||
1047 | int new_cpu; | ||
1048 | int weight; | ||
1049 | |||
1050 | span = sd->span; | ||
1051 | group = find_idlest_group(sd, t, cpu); | ||
1052 | if (!group) | ||
1053 | goto nextlevel; | ||
1054 | |||
1055 | new_cpu = find_idlest_cpu(group, cpu); | ||
1056 | if (new_cpu == -1 || new_cpu == cpu) | ||
1057 | goto nextlevel; | ||
1058 | |||
1059 | /* Now try balancing at a lower domain level */ | ||
1060 | cpu = new_cpu; | ||
1061 | nextlevel: | ||
1062 | sd = NULL; | ||
1063 | weight = cpus_weight(span); | ||
1064 | for_each_domain(cpu, tmp) { | ||
1065 | if (weight <= cpus_weight(tmp->span)) | ||
1066 | break; | ||
1067 | if (tmp->flags & flag) | ||
1068 | sd = tmp; | ||
1069 | } | ||
1070 | /* while loop will break here if sd == NULL */ | ||
1071 | } | ||
1072 | |||
1073 | return cpu; | ||
1074 | } | ||
1075 | |||
1076 | #endif /* CONFIG_SMP */ | ||
909 | 1077 | ||
910 | /* | 1078 | /* |
911 | * wake_idle() will wake a task on an idle cpu if task->cpu is | 1079 | * wake_idle() will wake a task on an idle cpu if task->cpu is |
@@ -927,14 +1095,14 @@ static int wake_idle(int cpu, task_t *p) | |||
927 | 1095 | ||
928 | for_each_domain(cpu, sd) { | 1096 | for_each_domain(cpu, sd) { |
929 | if (sd->flags & SD_WAKE_IDLE) { | 1097 | if (sd->flags & SD_WAKE_IDLE) { |
930 | cpus_and(tmp, sd->span, cpu_online_map); | 1098 | cpus_and(tmp, sd->span, p->cpus_allowed); |
931 | cpus_and(tmp, tmp, p->cpus_allowed); | ||
932 | for_each_cpu_mask(i, tmp) { | 1099 | for_each_cpu_mask(i, tmp) { |
933 | if (idle_cpu(i)) | 1100 | if (idle_cpu(i)) |
934 | return i; | 1101 | return i; |
935 | } | 1102 | } |
936 | } | 1103 | } |
937 | else break; | 1104 | else |
1105 | break; | ||
938 | } | 1106 | } |
939 | return cpu; | 1107 | return cpu; |
940 | } | 1108 | } |
@@ -967,7 +1135,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync) | |||
967 | runqueue_t *rq; | 1135 | runqueue_t *rq; |
968 | #ifdef CONFIG_SMP | 1136 | #ifdef CONFIG_SMP |
969 | unsigned long load, this_load; | 1137 | unsigned long load, this_load; |
970 | struct sched_domain *sd; | 1138 | struct sched_domain *sd, *this_sd = NULL; |
971 | int new_cpu; | 1139 | int new_cpu; |
972 | #endif | 1140 | #endif |
973 | 1141 | ||
@@ -986,70 +1154,69 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync) | |||
986 | if (unlikely(task_running(rq, p))) | 1154 | if (unlikely(task_running(rq, p))) |
987 | goto out_activate; | 1155 | goto out_activate; |
988 | 1156 | ||
989 | #ifdef CONFIG_SCHEDSTATS | 1157 | new_cpu = cpu; |
1158 | |||
990 | schedstat_inc(rq, ttwu_cnt); | 1159 | schedstat_inc(rq, ttwu_cnt); |
991 | if (cpu == this_cpu) { | 1160 | if (cpu == this_cpu) { |
992 | schedstat_inc(rq, ttwu_local); | 1161 | schedstat_inc(rq, ttwu_local); |
993 | } else { | 1162 | goto out_set_cpu; |
994 | for_each_domain(this_cpu, sd) { | 1163 | } |
995 | if (cpu_isset(cpu, sd->span)) { | 1164 | |
996 | schedstat_inc(sd, ttwu_wake_remote); | 1165 | for_each_domain(this_cpu, sd) { |
997 | break; | 1166 | if (cpu_isset(cpu, sd->span)) { |
998 | } | 1167 | schedstat_inc(sd, ttwu_wake_remote); |
1168 | this_sd = sd; | ||
1169 | break; | ||
999 | } | 1170 | } |
1000 | } | 1171 | } |
1001 | #endif | ||
1002 | 1172 | ||
1003 | new_cpu = cpu; | 1173 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) |
1004 | if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | ||
1005 | goto out_set_cpu; | 1174 | goto out_set_cpu; |
1006 | 1175 | ||
1007 | load = source_load(cpu); | ||
1008 | this_load = target_load(this_cpu); | ||
1009 | |||
1010 | /* | 1176 | /* |
1011 | * If sync wakeup then subtract the (maximum possible) effect of | 1177 | * Check for affine wakeup and passive balancing possibilities. |
1012 | * the currently running task from the load of the current CPU: | ||
1013 | */ | 1178 | */ |
1014 | if (sync) | 1179 | if (this_sd) { |
1015 | this_load -= SCHED_LOAD_SCALE; | 1180 | int idx = this_sd->wake_idx; |
1181 | unsigned int imbalance; | ||
1016 | 1182 | ||
1017 | /* Don't pull the task off an idle CPU to a busy one */ | 1183 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; |
1018 | if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2) | ||
1019 | goto out_set_cpu; | ||
1020 | 1184 | ||
1021 | new_cpu = this_cpu; /* Wake to this CPU if we can */ | 1185 | load = source_load(cpu, idx); |
1186 | this_load = target_load(this_cpu, idx); | ||
1022 | 1187 | ||
1023 | /* | 1188 | new_cpu = this_cpu; /* Wake to this CPU if we can */ |
1024 | * Scan domains for affine wakeup and passive balancing | ||
1025 | * possibilities. | ||
1026 | */ | ||
1027 | for_each_domain(this_cpu, sd) { | ||
1028 | unsigned int imbalance; | ||
1029 | /* | ||
1030 | * Start passive balancing when half the imbalance_pct | ||
1031 | * limit is reached. | ||
1032 | */ | ||
1033 | imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2; | ||
1034 | 1189 | ||
1035 | if ((sd->flags & SD_WAKE_AFFINE) && | 1190 | if (this_sd->flags & SD_WAKE_AFFINE) { |
1036 | !task_hot(p, rq->timestamp_last_tick, sd)) { | 1191 | unsigned long tl = this_load; |
1037 | /* | 1192 | /* |
1038 | * This domain has SD_WAKE_AFFINE and p is cache cold | 1193 | * If sync wakeup then subtract the (maximum possible) |
1039 | * in this domain. | 1194 | * effect of the currently running task from the load |
1195 | * of the current CPU: | ||
1040 | */ | 1196 | */ |
1041 | if (cpu_isset(cpu, sd->span)) { | 1197 | if (sync) |
1042 | schedstat_inc(sd, ttwu_move_affine); | 1198 | tl -= SCHED_LOAD_SCALE; |
1199 | |||
1200 | if ((tl <= load && | ||
1201 | tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || | ||
1202 | 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { | ||
1203 | /* | ||
1204 | * This domain has SD_WAKE_AFFINE and | ||
1205 | * p is cache cold in this domain, and | ||
1206 | * there is no bad imbalance. | ||
1207 | */ | ||
1208 | schedstat_inc(this_sd, ttwu_move_affine); | ||
1043 | goto out_set_cpu; | 1209 | goto out_set_cpu; |
1044 | } | 1210 | } |
1045 | } else if ((sd->flags & SD_WAKE_BALANCE) && | 1211 | } |
1046 | imbalance*this_load <= 100*load) { | 1212 | |
1047 | /* | 1213 | /* |
1048 | * This domain has SD_WAKE_BALANCE and there is | 1214 | * Start passive balancing when half the imbalance_pct |
1049 | * an imbalance. | 1215 | * limit is reached. |
1050 | */ | 1216 | */ |
1051 | if (cpu_isset(cpu, sd->span)) { | 1217 | if (this_sd->flags & SD_WAKE_BALANCE) { |
1052 | schedstat_inc(sd, ttwu_move_balance); | 1218 | if (imbalance*this_load <= 100*load) { |
1219 | schedstat_inc(this_sd, ttwu_move_balance); | ||
1053 | goto out_set_cpu; | 1220 | goto out_set_cpu; |
1054 | } | 1221 | } |
1055 | } | 1222 | } |
@@ -1120,17 +1287,19 @@ int fastcall wake_up_state(task_t *p, unsigned int state) | |||
1120 | return try_to_wake_up(p, state, 0); | 1287 | return try_to_wake_up(p, state, 0); |
1121 | } | 1288 | } |
1122 | 1289 | ||
1123 | #ifdef CONFIG_SMP | ||
1124 | static int find_idlest_cpu(struct task_struct *p, int this_cpu, | ||
1125 | struct sched_domain *sd); | ||
1126 | #endif | ||
1127 | |||
1128 | /* | 1290 | /* |
1129 | * Perform scheduler related setup for a newly forked process p. | 1291 | * Perform scheduler related setup for a newly forked process p. |
1130 | * p is forked by current. | 1292 | * p is forked by current. |
1131 | */ | 1293 | */ |
1132 | void fastcall sched_fork(task_t *p) | 1294 | void fastcall sched_fork(task_t *p, int clone_flags) |
1133 | { | 1295 | { |
1296 | int cpu = get_cpu(); | ||
1297 | |||
1298 | #ifdef CONFIG_SMP | ||
1299 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); | ||
1300 | #endif | ||
1301 | set_task_cpu(p, cpu); | ||
1302 | |||
1134 | /* | 1303 | /* |
1135 | * We mark the process as running here, but have not actually | 1304 | * We mark the process as running here, but have not actually |
1136 | * inserted it onto the runqueue yet. This guarantees that | 1305 | * inserted it onto the runqueue yet. This guarantees that |
@@ -1140,17 +1309,14 @@ void fastcall sched_fork(task_t *p) | |||
1140 | p->state = TASK_RUNNING; | 1309 | p->state = TASK_RUNNING; |
1141 | INIT_LIST_HEAD(&p->run_list); | 1310 | INIT_LIST_HEAD(&p->run_list); |
1142 | p->array = NULL; | 1311 | p->array = NULL; |
1143 | spin_lock_init(&p->switch_lock); | ||
1144 | #ifdef CONFIG_SCHEDSTATS | 1312 | #ifdef CONFIG_SCHEDSTATS |
1145 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 1313 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
1146 | #endif | 1314 | #endif |
1315 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | ||
1316 | p->oncpu = 0; | ||
1317 | #endif | ||
1147 | #ifdef CONFIG_PREEMPT | 1318 | #ifdef CONFIG_PREEMPT |
1148 | /* | 1319 | /* Want to start with kernel preemption disabled. */ |
1149 | * During context-switch we hold precisely one spinlock, which | ||
1150 | * schedule_tail drops. (in the common case it's this_rq()->lock, | ||
1151 | * but it also can be p->switch_lock.) So we compensate with a count | ||
1152 | * of 1. Also, we want to start with kernel preemption disabled. | ||
1153 | */ | ||
1154 | p->thread_info->preempt_count = 1; | 1320 | p->thread_info->preempt_count = 1; |
1155 | #endif | 1321 | #endif |
1156 | /* | 1322 | /* |
@@ -1174,12 +1340,10 @@ void fastcall sched_fork(task_t *p) | |||
1174 | * runqueue lock is not a problem. | 1340 | * runqueue lock is not a problem. |
1175 | */ | 1341 | */ |
1176 | current->time_slice = 1; | 1342 | current->time_slice = 1; |
1177 | preempt_disable(); | ||
1178 | scheduler_tick(); | 1343 | scheduler_tick(); |
1179 | local_irq_enable(); | 1344 | } |
1180 | preempt_enable(); | 1345 | local_irq_enable(); |
1181 | } else | 1346 | put_cpu(); |
1182 | local_irq_enable(); | ||
1183 | } | 1347 | } |
1184 | 1348 | ||
1185 | /* | 1349 | /* |
@@ -1196,10 +1360,9 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) | |||
1196 | runqueue_t *rq, *this_rq; | 1360 | runqueue_t *rq, *this_rq; |
1197 | 1361 | ||
1198 | rq = task_rq_lock(p, &flags); | 1362 | rq = task_rq_lock(p, &flags); |
1199 | cpu = task_cpu(p); | ||
1200 | this_cpu = smp_processor_id(); | ||
1201 | |||
1202 | BUG_ON(p->state != TASK_RUNNING); | 1363 | BUG_ON(p->state != TASK_RUNNING); |
1364 | this_cpu = smp_processor_id(); | ||
1365 | cpu = task_cpu(p); | ||
1203 | 1366 | ||
1204 | /* | 1367 | /* |
1205 | * We decrease the sleep average of forking parents | 1368 | * We decrease the sleep average of forking parents |
@@ -1296,22 +1459,40 @@ void fastcall sched_exit(task_t * p) | |||
1296 | } | 1459 | } |
1297 | 1460 | ||
1298 | /** | 1461 | /** |
1462 | * prepare_task_switch - prepare to switch tasks | ||
1463 | * @rq: the runqueue preparing to switch | ||
1464 | * @next: the task we are going to switch to. | ||
1465 | * | ||
1466 | * This is called with the rq lock held and interrupts off. It must | ||
1467 | * be paired with a subsequent finish_task_switch after the context | ||
1468 | * switch. | ||
1469 | * | ||
1470 | * prepare_task_switch sets up locking and calls architecture specific | ||
1471 | * hooks. | ||
1472 | */ | ||
1473 | static inline void prepare_task_switch(runqueue_t *rq, task_t *next) | ||
1474 | { | ||
1475 | prepare_lock_switch(rq, next); | ||
1476 | prepare_arch_switch(next); | ||
1477 | } | ||
1478 | |||
1479 | /** | ||
1299 | * finish_task_switch - clean up after a task-switch | 1480 | * finish_task_switch - clean up after a task-switch |
1300 | * @prev: the thread we just switched away from. | 1481 | * @prev: the thread we just switched away from. |
1301 | * | 1482 | * |
1302 | * We enter this with the runqueue still locked, and finish_arch_switch() | 1483 | * finish_task_switch must be called after the context switch, paired |
1303 | * will unlock it along with doing any other architecture-specific cleanup | 1484 | * with a prepare_task_switch call before the context switch. |
1304 | * actions. | 1485 | * finish_task_switch will reconcile locking set up by prepare_task_switch, |
1486 | * and do any other architecture-specific cleanup actions. | ||
1305 | * | 1487 | * |
1306 | * Note that we may have delayed dropping an mm in context_switch(). If | 1488 | * Note that we may have delayed dropping an mm in context_switch(). If |
1307 | * so, we finish that here outside of the runqueue lock. (Doing it | 1489 | * so, we finish that here outside of the runqueue lock. (Doing it |
1308 | * with the lock held can cause deadlocks; see schedule() for | 1490 | * with the lock held can cause deadlocks; see schedule() for |
1309 | * details.) | 1491 | * details.) |
1310 | */ | 1492 | */ |
1311 | static inline void finish_task_switch(task_t *prev) | 1493 | static inline void finish_task_switch(runqueue_t *rq, task_t *prev) |
1312 | __releases(rq->lock) | 1494 | __releases(rq->lock) |
1313 | { | 1495 | { |
1314 | runqueue_t *rq = this_rq(); | ||
1315 | struct mm_struct *mm = rq->prev_mm; | 1496 | struct mm_struct *mm = rq->prev_mm; |
1316 | unsigned long prev_task_flags; | 1497 | unsigned long prev_task_flags; |
1317 | 1498 | ||
@@ -1329,7 +1510,8 @@ static inline void finish_task_switch(task_t *prev) | |||
1329 | * Manfred Spraul <manfred@colorfullife.com> | 1510 | * Manfred Spraul <manfred@colorfullife.com> |
1330 | */ | 1511 | */ |
1331 | prev_task_flags = prev->flags; | 1512 | prev_task_flags = prev->flags; |
1332 | finish_arch_switch(rq, prev); | 1513 | finish_arch_switch(prev); |
1514 | finish_lock_switch(rq, prev); | ||
1333 | if (mm) | 1515 | if (mm) |
1334 | mmdrop(mm); | 1516 | mmdrop(mm); |
1335 | if (unlikely(prev_task_flags & PF_DEAD)) | 1517 | if (unlikely(prev_task_flags & PF_DEAD)) |
@@ -1343,8 +1525,12 @@ static inline void finish_task_switch(task_t *prev) | |||
1343 | asmlinkage void schedule_tail(task_t *prev) | 1525 | asmlinkage void schedule_tail(task_t *prev) |
1344 | __releases(rq->lock) | 1526 | __releases(rq->lock) |
1345 | { | 1527 | { |
1346 | finish_task_switch(prev); | 1528 | runqueue_t *rq = this_rq(); |
1347 | 1529 | finish_task_switch(rq, prev); | |
1530 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | ||
1531 | /* In this case, finish_task_switch does not reenable preemption */ | ||
1532 | preempt_enable(); | ||
1533 | #endif | ||
1348 | if (current->set_child_tid) | 1534 | if (current->set_child_tid) |
1349 | put_user(current->pid, current->set_child_tid); | 1535 | put_user(current->pid, current->set_child_tid); |
1350 | } | 1536 | } |
@@ -1494,51 +1680,6 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) | |||
1494 | } | 1680 | } |
1495 | 1681 | ||
1496 | /* | 1682 | /* |
1497 | * find_idlest_cpu - find the least busy runqueue. | ||
1498 | */ | ||
1499 | static int find_idlest_cpu(struct task_struct *p, int this_cpu, | ||
1500 | struct sched_domain *sd) | ||
1501 | { | ||
1502 | unsigned long load, min_load, this_load; | ||
1503 | int i, min_cpu; | ||
1504 | cpumask_t mask; | ||
1505 | |||
1506 | min_cpu = UINT_MAX; | ||
1507 | min_load = ULONG_MAX; | ||
1508 | |||
1509 | cpus_and(mask, sd->span, p->cpus_allowed); | ||
1510 | |||
1511 | for_each_cpu_mask(i, mask) { | ||
1512 | load = target_load(i); | ||
1513 | |||
1514 | if (load < min_load) { | ||
1515 | min_cpu = i; | ||
1516 | min_load = load; | ||
1517 | |||
1518 | /* break out early on an idle CPU: */ | ||
1519 | if (!min_load) | ||
1520 | break; | ||
1521 | } | ||
1522 | } | ||
1523 | |||
1524 | /* add +1 to account for the new task */ | ||
1525 | this_load = source_load(this_cpu) + SCHED_LOAD_SCALE; | ||
1526 | |||
1527 | /* | ||
1528 | * Would with the addition of the new task to the | ||
1529 | * current CPU there be an imbalance between this | ||
1530 | * CPU and the idlest CPU? | ||
1531 | * | ||
1532 | * Use half of the balancing threshold - new-context is | ||
1533 | * a good opportunity to balance. | ||
1534 | */ | ||
1535 | if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100) | ||
1536 | return min_cpu; | ||
1537 | |||
1538 | return this_cpu; | ||
1539 | } | ||
1540 | |||
1541 | /* | ||
1542 | * If dest_cpu is allowed for this process, migrate the task to it. | 1683 | * If dest_cpu is allowed for this process, migrate the task to it. |
1543 | * This is accomplished by forcing the cpu_allowed mask to only | 1684 | * This is accomplished by forcing the cpu_allowed mask to only |
1544 | * allow dest_cpu, which will force the cpu onto dest_cpu. Then | 1685 | * allow dest_cpu, which will force the cpu onto dest_cpu. Then |
@@ -1571,37 +1712,16 @@ out: | |||
1571 | } | 1712 | } |
1572 | 1713 | ||
1573 | /* | 1714 | /* |
1574 | * sched_exec(): find the highest-level, exec-balance-capable | 1715 | * sched_exec - execve() is a valuable balancing opportunity, because at |
1575 | * domain and try to migrate the task to the least loaded CPU. | 1716 | * this point the task has the smallest effective memory and cache footprint. |
1576 | * | ||
1577 | * execve() is a valuable balancing opportunity, because at this point | ||
1578 | * the task has the smallest effective memory and cache footprint. | ||
1579 | */ | 1717 | */ |
1580 | void sched_exec(void) | 1718 | void sched_exec(void) |
1581 | { | 1719 | { |
1582 | struct sched_domain *tmp, *sd = NULL; | ||
1583 | int new_cpu, this_cpu = get_cpu(); | 1720 | int new_cpu, this_cpu = get_cpu(); |
1584 | 1721 | new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); | |
1585 | /* Prefer the current CPU if there's only this task running */ | ||
1586 | if (this_rq()->nr_running <= 1) | ||
1587 | goto out; | ||
1588 | |||
1589 | for_each_domain(this_cpu, tmp) | ||
1590 | if (tmp->flags & SD_BALANCE_EXEC) | ||
1591 | sd = tmp; | ||
1592 | |||
1593 | if (sd) { | ||
1594 | schedstat_inc(sd, sbe_attempts); | ||
1595 | new_cpu = find_idlest_cpu(current, this_cpu, sd); | ||
1596 | if (new_cpu != this_cpu) { | ||
1597 | schedstat_inc(sd, sbe_pushed); | ||
1598 | put_cpu(); | ||
1599 | sched_migrate_task(current, new_cpu); | ||
1600 | return; | ||
1601 | } | ||
1602 | } | ||
1603 | out: | ||
1604 | put_cpu(); | 1722 | put_cpu(); |
1723 | if (new_cpu != this_cpu) | ||
1724 | sched_migrate_task(current, new_cpu); | ||
1605 | } | 1725 | } |
1606 | 1726 | ||
1607 | /* | 1727 | /* |
@@ -1632,7 +1752,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, | |||
1632 | */ | 1752 | */ |
1633 | static inline | 1753 | static inline |
1634 | int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | 1754 | int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, |
1635 | struct sched_domain *sd, enum idle_type idle) | 1755 | struct sched_domain *sd, enum idle_type idle, int *all_pinned) |
1636 | { | 1756 | { |
1637 | /* | 1757 | /* |
1638 | * We do not migrate tasks that are: | 1758 | * We do not migrate tasks that are: |
@@ -1640,23 +1760,24 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | |||
1640 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | 1760 | * 2) cannot be migrated to this CPU due to cpus_allowed, or |
1641 | * 3) are cache-hot on their current CPU. | 1761 | * 3) are cache-hot on their current CPU. |
1642 | */ | 1762 | */ |
1643 | if (task_running(rq, p)) | ||
1644 | return 0; | ||
1645 | if (!cpu_isset(this_cpu, p->cpus_allowed)) | 1763 | if (!cpu_isset(this_cpu, p->cpus_allowed)) |
1646 | return 0; | 1764 | return 0; |
1765 | *all_pinned = 0; | ||
1766 | |||
1767 | if (task_running(rq, p)) | ||
1768 | return 0; | ||
1647 | 1769 | ||
1648 | /* | 1770 | /* |
1649 | * Aggressive migration if: | 1771 | * Aggressive migration if: |
1650 | * 1) the [whole] cpu is idle, or | 1772 | * 1) task is cache cold, or |
1651 | * 2) too many balance attempts have failed. | 1773 | * 2) too many balance attempts have failed. |
1652 | */ | 1774 | */ |
1653 | 1775 | ||
1654 | if (cpu_and_siblings_are_idle(this_cpu) || \ | 1776 | if (sd->nr_balance_failed > sd->cache_nice_tries) |
1655 | sd->nr_balance_failed > sd->cache_nice_tries) | ||
1656 | return 1; | 1777 | return 1; |
1657 | 1778 | ||
1658 | if (task_hot(p, rq->timestamp_last_tick, sd)) | 1779 | if (task_hot(p, rq->timestamp_last_tick, sd)) |
1659 | return 0; | 1780 | return 0; |
1660 | return 1; | 1781 | return 1; |
1661 | } | 1782 | } |
1662 | 1783 | ||
@@ -1669,16 +1790,18 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | |||
1669 | */ | 1790 | */ |
1670 | static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, | 1791 | static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, |
1671 | unsigned long max_nr_move, struct sched_domain *sd, | 1792 | unsigned long max_nr_move, struct sched_domain *sd, |
1672 | enum idle_type idle) | 1793 | enum idle_type idle, int *all_pinned) |
1673 | { | 1794 | { |
1674 | prio_array_t *array, *dst_array; | 1795 | prio_array_t *array, *dst_array; |
1675 | struct list_head *head, *curr; | 1796 | struct list_head *head, *curr; |
1676 | int idx, pulled = 0; | 1797 | int idx, pulled = 0, pinned = 0; |
1677 | task_t *tmp; | 1798 | task_t *tmp; |
1678 | 1799 | ||
1679 | if (max_nr_move <= 0 || busiest->nr_running <= 1) | 1800 | if (max_nr_move == 0) |
1680 | goto out; | 1801 | goto out; |
1681 | 1802 | ||
1803 | pinned = 1; | ||
1804 | |||
1682 | /* | 1805 | /* |
1683 | * We first consider expired tasks. Those will likely not be | 1806 | * We first consider expired tasks. Those will likely not be |
1684 | * executed in the near future, and they are most likely to | 1807 | * executed in the near future, and they are most likely to |
@@ -1717,7 +1840,7 @@ skip_queue: | |||
1717 | 1840 | ||
1718 | curr = curr->prev; | 1841 | curr = curr->prev; |
1719 | 1842 | ||
1720 | if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { | 1843 | if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { |
1721 | if (curr != head) | 1844 | if (curr != head) |
1722 | goto skip_queue; | 1845 | goto skip_queue; |
1723 | idx++; | 1846 | idx++; |
@@ -1746,6 +1869,9 @@ out: | |||
1746 | * inside pull_task(). | 1869 | * inside pull_task(). |
1747 | */ | 1870 | */ |
1748 | schedstat_add(sd, lb_gained[idle], pulled); | 1871 | schedstat_add(sd, lb_gained[idle], pulled); |
1872 | |||
1873 | if (all_pinned) | ||
1874 | *all_pinned = pinned; | ||
1749 | return pulled; | 1875 | return pulled; |
1750 | } | 1876 | } |
1751 | 1877 | ||
@@ -1760,8 +1886,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1760 | { | 1886 | { |
1761 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 1887 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
1762 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 1888 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
1889 | int load_idx; | ||
1763 | 1890 | ||
1764 | max_load = this_load = total_load = total_pwr = 0; | 1891 | max_load = this_load = total_load = total_pwr = 0; |
1892 | if (idle == NOT_IDLE) | ||
1893 | load_idx = sd->busy_idx; | ||
1894 | else if (idle == NEWLY_IDLE) | ||
1895 | load_idx = sd->newidle_idx; | ||
1896 | else | ||
1897 | load_idx = sd->idle_idx; | ||
1765 | 1898 | ||
1766 | do { | 1899 | do { |
1767 | unsigned long load; | 1900 | unsigned long load; |
@@ -1776,9 +1909,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1776 | for_each_cpu_mask(i, group->cpumask) { | 1909 | for_each_cpu_mask(i, group->cpumask) { |
1777 | /* Bias balancing toward cpus of our domain */ | 1910 | /* Bias balancing toward cpus of our domain */ |
1778 | if (local_group) | 1911 | if (local_group) |
1779 | load = target_load(i); | 1912 | load = target_load(i, load_idx); |
1780 | else | 1913 | else |
1781 | load = source_load(i); | 1914 | load = source_load(i, load_idx); |
1782 | 1915 | ||
1783 | avg_load += load; | 1916 | avg_load += load; |
1784 | } | 1917 | } |
@@ -1792,12 +1925,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1792 | if (local_group) { | 1925 | if (local_group) { |
1793 | this_load = avg_load; | 1926 | this_load = avg_load; |
1794 | this = group; | 1927 | this = group; |
1795 | goto nextgroup; | ||
1796 | } else if (avg_load > max_load) { | 1928 | } else if (avg_load > max_load) { |
1797 | max_load = avg_load; | 1929 | max_load = avg_load; |
1798 | busiest = group; | 1930 | busiest = group; |
1799 | } | 1931 | } |
1800 | nextgroup: | ||
1801 | group = group->next; | 1932 | group = group->next; |
1802 | } while (group != sd->groups); | 1933 | } while (group != sd->groups); |
1803 | 1934 | ||
@@ -1870,15 +2001,9 @@ nextgroup: | |||
1870 | 2001 | ||
1871 | /* Get rid of the scaling factor, rounding down as we divide */ | 2002 | /* Get rid of the scaling factor, rounding down as we divide */ |
1872 | *imbalance = *imbalance / SCHED_LOAD_SCALE; | 2003 | *imbalance = *imbalance / SCHED_LOAD_SCALE; |
1873 | |||
1874 | return busiest; | 2004 | return busiest; |
1875 | 2005 | ||
1876 | out_balanced: | 2006 | out_balanced: |
1877 | if (busiest && (idle == NEWLY_IDLE || | ||
1878 | (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) { | ||
1879 | *imbalance = 1; | ||
1880 | return busiest; | ||
1881 | } | ||
1882 | 2007 | ||
1883 | *imbalance = 0; | 2008 | *imbalance = 0; |
1884 | return NULL; | 2009 | return NULL; |
@@ -1894,7 +2019,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group) | |||
1894 | int i; | 2019 | int i; |
1895 | 2020 | ||
1896 | for_each_cpu_mask(i, group->cpumask) { | 2021 | for_each_cpu_mask(i, group->cpumask) { |
1897 | load = source_load(i); | 2022 | load = source_load(i, 0); |
1898 | 2023 | ||
1899 | if (load > max_load) { | 2024 | if (load > max_load) { |
1900 | max_load = load; | 2025 | max_load = load; |
@@ -1906,6 +2031,12 @@ static runqueue_t *find_busiest_queue(struct sched_group *group) | |||
1906 | } | 2031 | } |
1907 | 2032 | ||
1908 | /* | 2033 | /* |
2034 | * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but | ||
2035 | * so long as it is large enough. | ||
2036 | */ | ||
2037 | #define MAX_PINNED_INTERVAL 512 | ||
2038 | |||
2039 | /* | ||
1909 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 2040 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
1910 | * tasks if there is an imbalance. | 2041 | * tasks if there is an imbalance. |
1911 | * | 2042 | * |
@@ -1917,7 +2048,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
1917 | struct sched_group *group; | 2048 | struct sched_group *group; |
1918 | runqueue_t *busiest; | 2049 | runqueue_t *busiest; |
1919 | unsigned long imbalance; | 2050 | unsigned long imbalance; |
1920 | int nr_moved; | 2051 | int nr_moved, all_pinned = 0; |
2052 | int active_balance = 0; | ||
1921 | 2053 | ||
1922 | spin_lock(&this_rq->lock); | 2054 | spin_lock(&this_rq->lock); |
1923 | schedstat_inc(sd, lb_cnt[idle]); | 2055 | schedstat_inc(sd, lb_cnt[idle]); |
@@ -1934,15 +2066,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
1934 | goto out_balanced; | 2066 | goto out_balanced; |
1935 | } | 2067 | } |
1936 | 2068 | ||
1937 | /* | 2069 | BUG_ON(busiest == this_rq); |
1938 | * This should be "impossible", but since load | ||
1939 | * balancing is inherently racy and statistical, | ||
1940 | * it could happen in theory. | ||
1941 | */ | ||
1942 | if (unlikely(busiest == this_rq)) { | ||
1943 | WARN_ON(1); | ||
1944 | goto out_balanced; | ||
1945 | } | ||
1946 | 2070 | ||
1947 | schedstat_add(sd, lb_imbalance[idle], imbalance); | 2071 | schedstat_add(sd, lb_imbalance[idle], imbalance); |
1948 | 2072 | ||
@@ -1956,9 +2080,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
1956 | */ | 2080 | */ |
1957 | double_lock_balance(this_rq, busiest); | 2081 | double_lock_balance(this_rq, busiest); |
1958 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2082 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
1959 | imbalance, sd, idle); | 2083 | imbalance, sd, idle, |
2084 | &all_pinned); | ||
1960 | spin_unlock(&busiest->lock); | 2085 | spin_unlock(&busiest->lock); |
2086 | |||
2087 | /* All tasks on this runqueue were pinned by CPU affinity */ | ||
2088 | if (unlikely(all_pinned)) | ||
2089 | goto out_balanced; | ||
1961 | } | 2090 | } |
2091 | |||
1962 | spin_unlock(&this_rq->lock); | 2092 | spin_unlock(&this_rq->lock); |
1963 | 2093 | ||
1964 | if (!nr_moved) { | 2094 | if (!nr_moved) { |
@@ -1966,36 +2096,38 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
1966 | sd->nr_balance_failed++; | 2096 | sd->nr_balance_failed++; |
1967 | 2097 | ||
1968 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { | 2098 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { |
1969 | int wake = 0; | ||
1970 | 2099 | ||
1971 | spin_lock(&busiest->lock); | 2100 | spin_lock(&busiest->lock); |
1972 | if (!busiest->active_balance) { | 2101 | if (!busiest->active_balance) { |
1973 | busiest->active_balance = 1; | 2102 | busiest->active_balance = 1; |
1974 | busiest->push_cpu = this_cpu; | 2103 | busiest->push_cpu = this_cpu; |
1975 | wake = 1; | 2104 | active_balance = 1; |
1976 | } | 2105 | } |
1977 | spin_unlock(&busiest->lock); | 2106 | spin_unlock(&busiest->lock); |
1978 | if (wake) | 2107 | if (active_balance) |
1979 | wake_up_process(busiest->migration_thread); | 2108 | wake_up_process(busiest->migration_thread); |
1980 | 2109 | ||
1981 | /* | 2110 | /* |
1982 | * We've kicked active balancing, reset the failure | 2111 | * We've kicked active balancing, reset the failure |
1983 | * counter. | 2112 | * counter. |
1984 | */ | 2113 | */ |
1985 | sd->nr_balance_failed = sd->cache_nice_tries; | 2114 | sd->nr_balance_failed = sd->cache_nice_tries+1; |
1986 | } | 2115 | } |
1987 | 2116 | } else | |
1988 | /* | ||
1989 | * We were unbalanced, but unsuccessful in move_tasks(), | ||
1990 | * so bump the balance_interval to lessen the lock contention. | ||
1991 | */ | ||
1992 | if (sd->balance_interval < sd->max_interval) | ||
1993 | sd->balance_interval++; | ||
1994 | } else { | ||
1995 | sd->nr_balance_failed = 0; | 2117 | sd->nr_balance_failed = 0; |
1996 | 2118 | ||
2119 | if (likely(!active_balance)) { | ||
1997 | /* We were unbalanced, so reset the balancing interval */ | 2120 | /* We were unbalanced, so reset the balancing interval */ |
1998 | sd->balance_interval = sd->min_interval; | 2121 | sd->balance_interval = sd->min_interval; |
2122 | } else { | ||
2123 | /* | ||
2124 | * If we've begun active balancing, start to back off. This | ||
2125 | * case may not be covered by the all_pinned logic if there | ||
2126 | * is only 1 task on the busy runqueue (because we don't call | ||
2127 | * move_tasks). | ||
2128 | */ | ||
2129 | if (sd->balance_interval < sd->max_interval) | ||
2130 | sd->balance_interval *= 2; | ||
1999 | } | 2131 | } |
2000 | 2132 | ||
2001 | return nr_moved; | 2133 | return nr_moved; |
@@ -2005,8 +2137,10 @@ out_balanced: | |||
2005 | 2137 | ||
2006 | schedstat_inc(sd, lb_balanced[idle]); | 2138 | schedstat_inc(sd, lb_balanced[idle]); |
2007 | 2139 | ||
2140 | sd->nr_balance_failed = 0; | ||
2008 | /* tune up the balancing interval */ | 2141 | /* tune up the balancing interval */ |
2009 | if (sd->balance_interval < sd->max_interval) | 2142 | if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || |
2143 | (sd->balance_interval < sd->max_interval)) | ||
2010 | sd->balance_interval *= 2; | 2144 | sd->balance_interval *= 2; |
2011 | 2145 | ||
2012 | return 0; | 2146 | return 0; |
@@ -2030,31 +2164,36 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
2030 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); | 2164 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); |
2031 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); | 2165 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); |
2032 | if (!group) { | 2166 | if (!group) { |
2033 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); | ||
2034 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); | 2167 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); |
2035 | goto out; | 2168 | goto out_balanced; |
2036 | } | 2169 | } |
2037 | 2170 | ||
2038 | busiest = find_busiest_queue(group); | 2171 | busiest = find_busiest_queue(group); |
2039 | if (!busiest || busiest == this_rq) { | 2172 | if (!busiest) { |
2040 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); | ||
2041 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); | 2173 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); |
2042 | goto out; | 2174 | goto out_balanced; |
2043 | } | 2175 | } |
2044 | 2176 | ||
2177 | BUG_ON(busiest == this_rq); | ||
2178 | |||
2045 | /* Attempt to move tasks */ | 2179 | /* Attempt to move tasks */ |
2046 | double_lock_balance(this_rq, busiest); | 2180 | double_lock_balance(this_rq, busiest); |
2047 | 2181 | ||
2048 | schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); | 2182 | schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); |
2049 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2183 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
2050 | imbalance, sd, NEWLY_IDLE); | 2184 | imbalance, sd, NEWLY_IDLE, NULL); |
2051 | if (!nr_moved) | 2185 | if (!nr_moved) |
2052 | schedstat_inc(sd, lb_failed[NEWLY_IDLE]); | 2186 | schedstat_inc(sd, lb_failed[NEWLY_IDLE]); |
2187 | else | ||
2188 | sd->nr_balance_failed = 0; | ||
2053 | 2189 | ||
2054 | spin_unlock(&busiest->lock); | 2190 | spin_unlock(&busiest->lock); |
2055 | |||
2056 | out: | ||
2057 | return nr_moved; | 2191 | return nr_moved; |
2192 | |||
2193 | out_balanced: | ||
2194 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); | ||
2195 | sd->nr_balance_failed = 0; | ||
2196 | return 0; | ||
2058 | } | 2197 | } |
2059 | 2198 | ||
2060 | /* | 2199 | /* |
@@ -2086,56 +2225,42 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq) | |||
2086 | static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) | 2225 | static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) |
2087 | { | 2226 | { |
2088 | struct sched_domain *sd; | 2227 | struct sched_domain *sd; |
2089 | struct sched_group *cpu_group; | ||
2090 | runqueue_t *target_rq; | 2228 | runqueue_t *target_rq; |
2091 | cpumask_t visited_cpus; | 2229 | int target_cpu = busiest_rq->push_cpu; |
2092 | int cpu; | 2230 | |
2231 | if (busiest_rq->nr_running <= 1) | ||
2232 | /* no task to move */ | ||
2233 | return; | ||
2234 | |||
2235 | target_rq = cpu_rq(target_cpu); | ||
2093 | 2236 | ||
2094 | /* | 2237 | /* |
2095 | * Search for suitable CPUs to push tasks to in successively higher | 2238 | * This condition is "impossible", if it occurs |
2096 | * domains with SD_LOAD_BALANCE set. | 2239 | * we need to fix it. Originally reported by |
2240 | * Bjorn Helgaas on a 128-cpu setup. | ||
2097 | */ | 2241 | */ |
2098 | visited_cpus = CPU_MASK_NONE; | 2242 | BUG_ON(busiest_rq == target_rq); |
2099 | for_each_domain(busiest_cpu, sd) { | ||
2100 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
2101 | /* no more domains to search */ | ||
2102 | break; | ||
2103 | 2243 | ||
2104 | schedstat_inc(sd, alb_cnt); | 2244 | /* move a task from busiest_rq to target_rq */ |
2245 | double_lock_balance(busiest_rq, target_rq); | ||
2105 | 2246 | ||
2106 | cpu_group = sd->groups; | 2247 | /* Search for an sd spanning us and the target CPU. */ |
2107 | do { | 2248 | for_each_domain(target_cpu, sd) |
2108 | for_each_cpu_mask(cpu, cpu_group->cpumask) { | 2249 | if ((sd->flags & SD_LOAD_BALANCE) && |
2109 | if (busiest_rq->nr_running <= 1) | 2250 | cpu_isset(busiest_cpu, sd->span)) |
2110 | /* no more tasks left to move */ | 2251 | break; |
2111 | return; | 2252 | |
2112 | if (cpu_isset(cpu, visited_cpus)) | 2253 | if (unlikely(sd == NULL)) |
2113 | continue; | 2254 | goto out; |
2114 | cpu_set(cpu, visited_cpus); | 2255 | |
2115 | if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu) | 2256 | schedstat_inc(sd, alb_cnt); |
2116 | continue; | 2257 | |
2117 | 2258 | if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) | |
2118 | target_rq = cpu_rq(cpu); | 2259 | schedstat_inc(sd, alb_pushed); |
2119 | /* | 2260 | else |
2120 | * This condition is "impossible", if it occurs | 2261 | schedstat_inc(sd, alb_failed); |
2121 | * we need to fix it. Originally reported by | 2262 | out: |
2122 | * Bjorn Helgaas on a 128-cpu setup. | 2263 | spin_unlock(&target_rq->lock); |
2123 | */ | ||
2124 | BUG_ON(busiest_rq == target_rq); | ||
2125 | |||
2126 | /* move a task from busiest_rq to target_rq */ | ||
2127 | double_lock_balance(busiest_rq, target_rq); | ||
2128 | if (move_tasks(target_rq, cpu, busiest_rq, | ||
2129 | 1, sd, SCHED_IDLE)) { | ||
2130 | schedstat_inc(sd, alb_pushed); | ||
2131 | } else { | ||
2132 | schedstat_inc(sd, alb_failed); | ||
2133 | } | ||
2134 | spin_unlock(&target_rq->lock); | ||
2135 | } | ||
2136 | cpu_group = cpu_group->next; | ||
2137 | } while (cpu_group != sd->groups); | ||
2138 | } | ||
2139 | } | 2264 | } |
2140 | 2265 | ||
2141 | /* | 2266 | /* |
@@ -2156,18 +2281,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, | |||
2156 | unsigned long old_load, this_load; | 2281 | unsigned long old_load, this_load; |
2157 | unsigned long j = jiffies + CPU_OFFSET(this_cpu); | 2282 | unsigned long j = jiffies + CPU_OFFSET(this_cpu); |
2158 | struct sched_domain *sd; | 2283 | struct sched_domain *sd; |
2284 | int i; | ||
2159 | 2285 | ||
2160 | /* Update our load */ | ||
2161 | old_load = this_rq->cpu_load; | ||
2162 | this_load = this_rq->nr_running * SCHED_LOAD_SCALE; | 2286 | this_load = this_rq->nr_running * SCHED_LOAD_SCALE; |
2163 | /* | 2287 | /* Update our load */ |
2164 | * Round up the averaging division if load is increasing. This | 2288 | for (i = 0; i < 3; i++) { |
2165 | * prevents us from getting stuck on 9 if the load is 10, for | 2289 | unsigned long new_load = this_load; |
2166 | * example. | 2290 | int scale = 1 << i; |
2167 | */ | 2291 | old_load = this_rq->cpu_load[i]; |
2168 | if (this_load > old_load) | 2292 | /* |
2169 | old_load++; | 2293 | * Round up the averaging division if load is increasing. This |
2170 | this_rq->cpu_load = (old_load + this_load) / 2; | 2294 | * prevents us from getting stuck on 9 if the load is 10, for |
2295 | * example. | ||
2296 | */ | ||
2297 | if (new_load > old_load) | ||
2298 | new_load += scale-1; | ||
2299 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; | ||
2300 | } | ||
2171 | 2301 | ||
2172 | for_each_domain(this_cpu, sd) { | 2302 | for_each_domain(this_cpu, sd) { |
2173 | unsigned long interval; | 2303 | unsigned long interval; |
@@ -2447,11 +2577,15 @@ out: | |||
2447 | #ifdef CONFIG_SCHED_SMT | 2577 | #ifdef CONFIG_SCHED_SMT |
2448 | static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | 2578 | static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) |
2449 | { | 2579 | { |
2450 | struct sched_domain *sd = this_rq->sd; | 2580 | struct sched_domain *tmp, *sd = NULL; |
2451 | cpumask_t sibling_map; | 2581 | cpumask_t sibling_map; |
2452 | int i; | 2582 | int i; |
2453 | 2583 | ||
2454 | if (!(sd->flags & SD_SHARE_CPUPOWER)) | 2584 | for_each_domain(this_cpu, tmp) |
2585 | if (tmp->flags & SD_SHARE_CPUPOWER) | ||
2586 | sd = tmp; | ||
2587 | |||
2588 | if (!sd) | ||
2455 | return; | 2589 | return; |
2456 | 2590 | ||
2457 | /* | 2591 | /* |
@@ -2492,13 +2626,17 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | |||
2492 | 2626 | ||
2493 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | 2627 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) |
2494 | { | 2628 | { |
2495 | struct sched_domain *sd = this_rq->sd; | 2629 | struct sched_domain *tmp, *sd = NULL; |
2496 | cpumask_t sibling_map; | 2630 | cpumask_t sibling_map; |
2497 | prio_array_t *array; | 2631 | prio_array_t *array; |
2498 | int ret = 0, i; | 2632 | int ret = 0, i; |
2499 | task_t *p; | 2633 | task_t *p; |
2500 | 2634 | ||
2501 | if (!(sd->flags & SD_SHARE_CPUPOWER)) | 2635 | for_each_domain(this_cpu, tmp) |
2636 | if (tmp->flags & SD_SHARE_CPUPOWER) | ||
2637 | sd = tmp; | ||
2638 | |||
2639 | if (!sd) | ||
2502 | return 0; | 2640 | return 0; |
2503 | 2641 | ||
2504 | /* | 2642 | /* |
@@ -2576,7 +2714,7 @@ void fastcall add_preempt_count(int val) | |||
2576 | /* | 2714 | /* |
2577 | * Underflow? | 2715 | * Underflow? |
2578 | */ | 2716 | */ |
2579 | BUG_ON(((int)preempt_count() < 0)); | 2717 | BUG_ON((preempt_count() < 0)); |
2580 | preempt_count() += val; | 2718 | preempt_count() += val; |
2581 | /* | 2719 | /* |
2582 | * Spinlock count overflowing soon? | 2720 | * Spinlock count overflowing soon? |
@@ -2613,7 +2751,7 @@ asmlinkage void __sched schedule(void) | |||
2613 | struct list_head *queue; | 2751 | struct list_head *queue; |
2614 | unsigned long long now; | 2752 | unsigned long long now; |
2615 | unsigned long run_time; | 2753 | unsigned long run_time; |
2616 | int cpu, idx; | 2754 | int cpu, idx, new_prio; |
2617 | 2755 | ||
2618 | /* | 2756 | /* |
2619 | * Test if we are atomic. Since do_exit() needs to call into | 2757 | * Test if we are atomic. Since do_exit() needs to call into |
@@ -2735,9 +2873,14 @@ go_idle: | |||
2735 | delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; | 2873 | delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; |
2736 | 2874 | ||
2737 | array = next->array; | 2875 | array = next->array; |
2738 | dequeue_task(next, array); | 2876 | new_prio = recalc_task_prio(next, next->timestamp + delta); |
2739 | recalc_task_prio(next, next->timestamp + delta); | 2877 | |
2740 | enqueue_task(next, array); | 2878 | if (unlikely(next->prio != new_prio)) { |
2879 | dequeue_task(next, array); | ||
2880 | next->prio = new_prio; | ||
2881 | enqueue_task(next, array); | ||
2882 | } else | ||
2883 | requeue_task(next, array); | ||
2741 | } | 2884 | } |
2742 | next->activated = 0; | 2885 | next->activated = 0; |
2743 | switch_tasks: | 2886 | switch_tasks: |
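schedule() now asks recalc_task_prio() for the new priority first and pays for a dequeue/enqueue only when it actually changed; an unchanged task is merely rotated with requeue_task(). A compact illustration of that decision (the queue operations are printing stubs, not the kernel's):

#include <stdio.h>

struct task { int prio; };

static void dequeue_task(struct task *t) { printf("dequeue (prio %d)\n", t->prio); }
static void enqueue_task(struct task *t) { printf("enqueue (prio %d)\n", t->prio); }
static void requeue_task(struct task *t) { printf("requeue (prio %d)\n", t->prio); }

static void reposition(struct task *t, int new_prio)
{
	if (t->prio != new_prio) {	/* moves to a different priority list */
		dequeue_task(t);
		t->prio = new_prio;
		enqueue_task(t);
	} else {			/* common case: stay put, rotate to tail */
		requeue_task(t);
	}
}

int main(void)
{
	struct task t = { .prio = 110 };

	reposition(&t, 110);	/* unchanged: cheap requeue */
	reposition(&t, 105);	/* boosted: full dequeue/enqueue */
	return 0;
}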
@@ -2761,11 +2904,15 @@ switch_tasks: | |||
2761 | rq->curr = next; | 2904 | rq->curr = next; |
2762 | ++*switch_count; | 2905 | ++*switch_count; |
2763 | 2906 | ||
2764 | prepare_arch_switch(rq, next); | 2907 | prepare_task_switch(rq, next); |
2765 | prev = context_switch(rq, prev, next); | 2908 | prev = context_switch(rq, prev, next); |
2766 | barrier(); | 2909 | barrier(); |
2767 | 2910 | /* | |
2768 | finish_task_switch(prev); | 2911 | * this_rq must be evaluated again because prev may have moved |
2912 | * CPUs since it called schedule(), thus the 'rq' on its stack | ||
2913 | * frame will be invalid. | ||
2914 | */ | ||
2915 | finish_task_switch(this_rq(), prev); | ||
2769 | } else | 2916 | } else |
2770 | spin_unlock_irq(&rq->lock); | 2917 | spin_unlock_irq(&rq->lock); |
2771 | 2918 | ||
@@ -2869,7 +3016,7 @@ need_resched: | |||
2869 | 3016 | ||
2870 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) | 3017 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) |
2871 | { | 3018 | { |
2872 | task_t *p = curr->task; | 3019 | task_t *p = curr->private; |
2873 | return try_to_wake_up(p, mode, sync); | 3020 | return try_to_wake_up(p, mode, sync); |
2874 | } | 3021 | } |
2875 | 3022 | ||
@@ -3301,15 +3448,7 @@ int task_nice(const task_t *p) | |||
3301 | { | 3448 | { |
3302 | return TASK_NICE(p); | 3449 | return TASK_NICE(p); |
3303 | } | 3450 | } |
3304 | |||
3305 | /* | ||
3306 | * The only users of task_nice are binfmt_elf and binfmt_elf32. | ||
3307 | * binfmt_elf is no longer modular, but binfmt_elf32 still is. | ||
3308 | * Therefore, task_nice is needed if there is a compat_mode. | ||
3309 | */ | ||
3310 | #ifdef CONFIG_COMPAT | ||
3311 | EXPORT_SYMBOL_GPL(task_nice); | 3451 | EXPORT_SYMBOL_GPL(task_nice); |
3312 | #endif | ||
3313 | 3452 | ||
3314 | /** | 3453 | /** |
3315 | * idle_cpu - is a given cpu idle currently? | 3454 | * idle_cpu - is a given cpu idle currently? |
@@ -3384,13 +3523,24 @@ recheck: | |||
3384 | if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) | 3523 | if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) |
3385 | return -EINVAL; | 3524 | return -EINVAL; |
3386 | 3525 | ||
3387 | if ((policy == SCHED_FIFO || policy == SCHED_RR) && | 3526 | /* |
3388 | param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur && | 3527 | * Allow unprivileged RT tasks to decrease priority: |
3389 | !capable(CAP_SYS_NICE)) | 3528 | */ |
3390 | return -EPERM; | 3529 | if (!capable(CAP_SYS_NICE)) { |
3391 | if ((current->euid != p->euid) && (current->euid != p->uid) && | 3530 | /* can't change policy */ |
3392 | !capable(CAP_SYS_NICE)) | 3531 | if (policy != p->policy) |
3393 | return -EPERM; | 3532 | return -EPERM; |
3533 | /* can't increase priority */ | ||
3534 | if (policy != SCHED_NORMAL && | ||
3535 | param->sched_priority > p->rt_priority && | ||
3536 | param->sched_priority > | ||
3537 | p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) | ||
3538 | return -EPERM; | ||
3539 | /* can't change other user's priorities */ | ||
3540 | if ((current->euid != p->euid) && | ||
3541 | (current->euid != p->uid)) | ||
3542 | return -EPERM; | ||
3543 | } | ||
3394 | 3544 | ||
3395 | retval = security_task_setscheduler(p, policy, param); | 3545 | retval = security_task_setscheduler(p, policy, param); |
3396 | if (retval) | 3546 | if (retval) |
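The rewritten block lets a caller without CAP_SYS_NICE lower an RT priority freely, raise it only up to the task's RLIMIT_RTPRIO soft limit, never change the policy, and only touch tasks of the same effective user. The same predicate, restated as a small standalone function (the struct fields are simplified stand-ins for the task and rlimit state):

#include <stdbool.h>
#include <stdio.h>

struct task_state {
	int policy;		/* 0 == SCHED_NORMAL */
	int rt_priority;
	int rtprio_limit;	/* RLIMIT_RTPRIO soft limit */
	int uid, euid;
};

static bool unprivileged_may_set(const struct task_state *caller,
				 const struct task_state *target,
				 int new_policy, int new_prio)
{
	if (new_policy != target->policy)
		return false;			/* can't change policy */
	if (new_policy != 0 &&
	    new_prio > target->rt_priority &&
	    new_prio > target->rtprio_limit)
		return false;			/* can't raise beyond the rlimit */
	if (caller->euid != target->euid && caller->euid != target->uid)
		return false;			/* can't touch other users' tasks */
	return true;
}

int main(void)
{
	struct task_state me = { .policy = 2, .rt_priority = 50,
				 .rtprio_limit = 60, .uid = 1000, .euid = 1000 };

	printf("lower to 40: %d\n", unprivileged_may_set(&me, &me, 2, 40)); /* 1 */
	printf("raise to 55: %d\n", unprivileged_may_set(&me, &me, 2, 55)); /* 1 */
	printf("raise to 70: %d\n", unprivileged_may_set(&me, &me, 2, 70)); /* 0 */
	return 0;
}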
@@ -3814,7 +3964,7 @@ EXPORT_SYMBOL(yield); | |||
3814 | */ | 3964 | */ |
3815 | void __sched io_schedule(void) | 3965 | void __sched io_schedule(void) |
3816 | { | 3966 | { |
3817 | struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); | 3967 | struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); |
3818 | 3968 | ||
3819 | atomic_inc(&rq->nr_iowait); | 3969 | atomic_inc(&rq->nr_iowait); |
3820 | schedule(); | 3970 | schedule(); |
@@ -3825,7 +3975,7 @@ EXPORT_SYMBOL(io_schedule); | |||
3825 | 3975 | ||
3826 | long __sched io_schedule_timeout(long timeout) | 3976 | long __sched io_schedule_timeout(long timeout) |
3827 | { | 3977 | { |
3828 | struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); | 3978 | struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); |
3829 | long ret; | 3979 | long ret; |
3830 | 3980 | ||
3831 | atomic_inc(&rq->nr_iowait); | 3981 | atomic_inc(&rq->nr_iowait); |
@@ -4016,6 +4166,14 @@ void show_state(void) | |||
4016 | read_unlock(&tasklist_lock); | 4166 | read_unlock(&tasklist_lock); |
4017 | } | 4167 | } |
4018 | 4168 | ||
4169 | /** | ||
4170 | * init_idle - set up an idle thread for a given CPU | ||
4171 | * @idle: task in question | ||
4172 | * @cpu: cpu the idle task belongs to | ||
4173 | * | ||
4174 | * NOTE: this function does not set the idle thread's NEED_RESCHED | ||
4175 | * flag, to make booting more robust. | ||
4176 | */ | ||
4019 | void __devinit init_idle(task_t *idle, int cpu) | 4177 | void __devinit init_idle(task_t *idle, int cpu) |
4020 | { | 4178 | { |
4021 | runqueue_t *rq = cpu_rq(cpu); | 4179 | runqueue_t *rq = cpu_rq(cpu); |
@@ -4030,7 +4188,9 @@ void __devinit init_idle(task_t *idle, int cpu) | |||
4030 | 4188 | ||
4031 | spin_lock_irqsave(&rq->lock, flags); | 4189 | spin_lock_irqsave(&rq->lock, flags); |
4032 | rq->curr = rq->idle = idle; | 4190 | rq->curr = rq->idle = idle; |
4033 | set_tsk_need_resched(idle); | 4191 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
4192 | idle->oncpu = 1; | ||
4193 | #endif | ||
4034 | spin_unlock_irqrestore(&rq->lock, flags); | 4194 | spin_unlock_irqrestore(&rq->lock, flags); |
4035 | 4195 | ||
4036 | /* Set the preempt count _outside_ the spinlocks! */ | 4196 | /* Set the preempt count _outside_ the spinlocks! */ |
@@ -4174,8 +4334,7 @@ static int migration_thread(void * data) | |||
4174 | struct list_head *head; | 4334 | struct list_head *head; |
4175 | migration_req_t *req; | 4335 | migration_req_t *req; |
4176 | 4336 | ||
4177 | if (current->flags & PF_FREEZE) | 4337 | try_to_freeze(); |
4178 | refrigerator(PF_FREEZE); | ||
4179 | 4338 | ||
4180 | spin_lock_irq(&rq->lock); | 4339 | spin_lock_irq(&rq->lock); |
4181 | 4340 | ||
@@ -4200,17 +4359,9 @@ static int migration_thread(void * data) | |||
4200 | req = list_entry(head->next, migration_req_t, list); | 4359 | req = list_entry(head->next, migration_req_t, list); |
4201 | list_del_init(head->next); | 4360 | list_del_init(head->next); |
4202 | 4361 | ||
4203 | if (req->type == REQ_MOVE_TASK) { | 4362 | spin_unlock(&rq->lock); |
4204 | spin_unlock(&rq->lock); | 4363 | __migrate_task(req->task, cpu, req->dest_cpu); |
4205 | __migrate_task(req->task, cpu, req->dest_cpu); | 4364 | local_irq_enable(); |
4206 | local_irq_enable(); | ||
4207 | } else if (req->type == REQ_SET_DOMAIN) { | ||
4208 | rq->sd = req->sd; | ||
4209 | spin_unlock_irq(&rq->lock); | ||
4210 | } else { | ||
4211 | spin_unlock_irq(&rq->lock); | ||
4212 | WARN_ON(1); | ||
4213 | } | ||
4214 | 4365 | ||
4215 | complete(&req->done); | 4366 | complete(&req->done); |
4216 | } | 4367 | } |
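With domain attachment no longer funnelled through the migration queue (the REQ_SET_DOMAIN branch and the req->type field checks are removed above), every queued request is a task move. The submit side keeps the usual queue-and-wait shape; roughly, as a kernel-style fragment condensed from the surrounding code rather than a standalone program:

/* Submitter (e.g. set_cpus_allowed via migrate_task), sketched: */
migration_req_t req;

init_completion(&req.done);
req.task = p;
req.dest_cpu = dest_cpu;
list_add(&req.list, &rq->migration_queue);
/* after dropping rq->lock: */
wake_up_process(rq->migration_thread);
wait_for_completion(&req.done);
/* migration_thread dequeues the request, calls __migrate_task() and
 * completes req.done -- the only request type left after this patch. */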
@@ -4441,7 +4592,6 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, | |||
4441 | migration_req_t *req; | 4592 | migration_req_t *req; |
4442 | req = list_entry(rq->migration_queue.next, | 4593 | req = list_entry(rq->migration_queue.next, |
4443 | migration_req_t, list); | 4594 | migration_req_t, list); |
4444 | BUG_ON(req->type != REQ_MOVE_TASK); | ||
4445 | list_del_init(&req->list); | 4595 | list_del_init(&req->list); |
4446 | complete(&req->done); | 4596 | complete(&req->done); |
4447 | } | 4597 | } |
@@ -4472,12 +4622,17 @@ int __init migration_init(void) | |||
4472 | #endif | 4622 | #endif |
4473 | 4623 | ||
4474 | #ifdef CONFIG_SMP | 4624 | #ifdef CONFIG_SMP |
4475 | #define SCHED_DOMAIN_DEBUG | 4625 | #undef SCHED_DOMAIN_DEBUG |
4476 | #ifdef SCHED_DOMAIN_DEBUG | 4626 | #ifdef SCHED_DOMAIN_DEBUG |
4477 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | 4627 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
4478 | { | 4628 | { |
4479 | int level = 0; | 4629 | int level = 0; |
4480 | 4630 | ||
4631 | if (!sd) { | ||
4632 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); | ||
4633 | return; | ||
4634 | } | ||
4635 | |||
4481 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | 4636 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); |
4482 | 4637 | ||
4483 | do { | 4638 | do { |
@@ -4560,37 +4715,81 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
4560 | #define sched_domain_debug(sd, cpu) {} | 4715 | #define sched_domain_debug(sd, cpu) {} |
4561 | #endif | 4716 | #endif |
4562 | 4717 | ||
4718 | static int sd_degenerate(struct sched_domain *sd) | ||
4719 | { | ||
4720 | if (cpus_weight(sd->span) == 1) | ||
4721 | return 1; | ||
4722 | |||
4723 | /* Following flags need at least 2 groups */ | ||
4724 | if (sd->flags & (SD_LOAD_BALANCE | | ||
4725 | SD_BALANCE_NEWIDLE | | ||
4726 | SD_BALANCE_FORK | | ||
4727 | SD_BALANCE_EXEC)) { | ||
4728 | if (sd->groups != sd->groups->next) | ||
4729 | return 0; | ||
4730 | } | ||
4731 | |||
4732 | /* Following flags don't use groups */ | ||
4733 | if (sd->flags & (SD_WAKE_IDLE | | ||
4734 | SD_WAKE_AFFINE | | ||
4735 | SD_WAKE_BALANCE)) | ||
4736 | return 0; | ||
4737 | |||
4738 | return 1; | ||
4739 | } | ||
4740 | |||
4741 | static int sd_parent_degenerate(struct sched_domain *sd, | ||
4742 | struct sched_domain *parent) | ||
4743 | { | ||
4744 | unsigned long cflags = sd->flags, pflags = parent->flags; | ||
4745 | |||
4746 | if (sd_degenerate(parent)) | ||
4747 | return 1; | ||
4748 | |||
4749 | if (!cpus_equal(sd->span, parent->span)) | ||
4750 | return 0; | ||
4751 | |||
4752 | /* Does parent contain flags not in child? */ | ||
4753 | /* WAKE_BALANCE is a subset of WAKE_AFFINE */ | ||
4754 | if (cflags & SD_WAKE_AFFINE) | ||
4755 | pflags &= ~SD_WAKE_BALANCE; | ||
4756 | /* Flags needing groups don't count if only 1 group in parent */ | ||
4757 | if (parent->groups == parent->groups->next) { | ||
4758 | pflags &= ~(SD_LOAD_BALANCE | | ||
4759 | SD_BALANCE_NEWIDLE | | ||
4760 | SD_BALANCE_FORK | | ||
4761 | SD_BALANCE_EXEC); | ||
4762 | } | ||
4763 | if (~cflags & pflags) | ||
4764 | return 0; | ||
4765 | |||
4766 | return 1; | ||
4767 | } | ||
4768 | |||
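The parent check reduces to a flag-subset test: after WAKE_BALANCE is folded into WAKE_AFFINE and the group-dependent flags are masked out for single-group parents, "~cflags & pflags" is non-zero exactly when the parent still provides behaviour the child lacks. A runnable toy of that test (the flag values are arbitrary stand-ins, not the kernel's SD_* masks):

#include <stdio.h>

#define F_LOAD_BALANCE	0x01	/* stand-in flag values */
#define F_WAKE_AFFINE	0x02
#define F_WAKE_BALANCE	0x04

int main(void)
{
	unsigned long cflags = F_LOAD_BALANCE | F_WAKE_AFFINE;	/* child  */
	unsigned long pflags = F_LOAD_BALANCE | F_WAKE_BALANCE;	/* parent */

	/* WAKE_BALANCE is treated as implied by WAKE_AFFINE, so drop it
	 * from the parent before comparing, as sd_parent_degenerate() does. */
	if (cflags & F_WAKE_AFFINE)
		pflags &= ~F_WAKE_BALANCE;

	/* Non-zero iff the parent has a flag the child lacks, i.e. the
	 * parent still adds behaviour and must not be collapsed away. */
	printf("parent redundant: %s\n", (~cflags & pflags) ? "no" : "yes");
	return 0;
}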
4563 | /* | 4769 | /* |
4564 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 4770 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
4565 | * hold the hotplug lock. | 4771 | * hold the hotplug lock. |
4566 | */ | 4772 | */ |
4567 | void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) | 4773 | void cpu_attach_domain(struct sched_domain *sd, int cpu) |
4568 | { | 4774 | { |
4569 | migration_req_t req; | ||
4570 | unsigned long flags; | ||
4571 | runqueue_t *rq = cpu_rq(cpu); | 4775 | runqueue_t *rq = cpu_rq(cpu); |
4572 | int local = 1; | 4776 | struct sched_domain *tmp; |
4573 | |||
4574 | sched_domain_debug(sd, cpu); | ||
4575 | 4777 | ||
4576 | spin_lock_irqsave(&rq->lock, flags); | 4778 | /* Remove the sched domains which do not contribute to scheduling. */ |
4577 | 4779 | for (tmp = sd; tmp; tmp = tmp->parent) { | |
4578 | if (cpu == smp_processor_id() || !cpu_online(cpu)) { | 4780 | struct sched_domain *parent = tmp->parent; |
4579 | rq->sd = sd; | 4781 | if (!parent) |
4580 | } else { | 4782 | break; |
4581 | init_completion(&req.done); | 4783 | if (sd_parent_degenerate(tmp, parent)) |
4582 | req.type = REQ_SET_DOMAIN; | 4784 | tmp->parent = parent->parent; |
4583 | req.sd = sd; | ||
4584 | list_add(&req.list, &rq->migration_queue); | ||
4585 | local = 0; | ||
4586 | } | 4785 | } |
4587 | 4786 | ||
4588 | spin_unlock_irqrestore(&rq->lock, flags); | 4787 | if (sd && sd_degenerate(sd)) |
4788 | sd = sd->parent; | ||
4589 | 4789 | ||
4590 | if (!local) { | 4790 | sched_domain_debug(sd, cpu); |
4591 | wake_up_process(rq->migration_thread); | 4791 | |
4592 | wait_for_completion(&req.done); | 4792 | rcu_assign_pointer(rq->sd, sd); |
4593 | } | ||
4594 | } | 4793 | } |
4595 | 4794 | ||
4596 | /* cpus with isolated domains */ | 4795 | /* cpus with isolated domains */ |
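cpu_attach_domain() now prunes degenerate levels and publishes the result with rcu_assign_pointer(); the rule this depends on is that rq->sd is only ever dereferenced from preempt-disabled sections, which is what makes the synchronize_sched() in detach_destroy_domains() a sufficient grace period before an old tree is reused. A reader-side fragment of that discipline (kernel-style sketch, not a standalone program):

int cpu, weight = 0;
struct sched_domain *sd;

preempt_disable();			/* holds off synchronize_sched() */
cpu = smp_processor_id();
for_each_domain(cpu, sd)		/* rcu_dereference() under the hood */
	weight += cpus_weight(sd->span);
preempt_enable();			/* detached trees may now be freed */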
@@ -4622,7 +4821,7 @@ __setup ("isolcpus=", isolated_cpu_setup); | |||
4622 | * covered by the given span, and will set each group's ->cpumask correctly, | 4821 | * covered by the given span, and will set each group's ->cpumask correctly, |
4623 | * and ->cpu_power to 0. | 4822 | * and ->cpu_power to 0. |
4624 | */ | 4823 | */ |
4625 | void __devinit init_sched_build_groups(struct sched_group groups[], | 4824 | void init_sched_build_groups(struct sched_group groups[], |
4626 | cpumask_t span, int (*group_fn)(int cpu)) | 4825 | cpumask_t span, int (*group_fn)(int cpu)) |
4627 | { | 4826 | { |
4628 | struct sched_group *first = NULL, *last = NULL; | 4827 | struct sched_group *first = NULL, *last = NULL; |
@@ -4658,13 +4857,14 @@ void __devinit init_sched_build_groups(struct sched_group groups[], | |||
4658 | 4857 | ||
4659 | 4858 | ||
4660 | #ifdef ARCH_HAS_SCHED_DOMAIN | 4859 | #ifdef ARCH_HAS_SCHED_DOMAIN |
4661 | extern void __devinit arch_init_sched_domains(void); | 4860 | extern void build_sched_domains(const cpumask_t *cpu_map); |
4662 | extern void __devinit arch_destroy_sched_domains(void); | 4861 | extern void arch_init_sched_domains(const cpumask_t *cpu_map); |
4862 | extern void arch_destroy_sched_domains(const cpumask_t *cpu_map); | ||
4663 | #else | 4863 | #else |
4664 | #ifdef CONFIG_SCHED_SMT | 4864 | #ifdef CONFIG_SCHED_SMT |
4665 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | 4865 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); |
4666 | static struct sched_group sched_group_cpus[NR_CPUS]; | 4866 | static struct sched_group sched_group_cpus[NR_CPUS]; |
4667 | static int __devinit cpu_to_cpu_group(int cpu) | 4867 | static int cpu_to_cpu_group(int cpu) |
4668 | { | 4868 | { |
4669 | return cpu; | 4869 | return cpu; |
4670 | } | 4870 | } |
@@ -4672,7 +4872,7 @@ static int __devinit cpu_to_cpu_group(int cpu) | |||
4672 | 4872 | ||
4673 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); | 4873 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); |
4674 | static struct sched_group sched_group_phys[NR_CPUS]; | 4874 | static struct sched_group sched_group_phys[NR_CPUS]; |
4675 | static int __devinit cpu_to_phys_group(int cpu) | 4875 | static int cpu_to_phys_group(int cpu) |
4676 | { | 4876 | { |
4677 | #ifdef CONFIG_SCHED_SMT | 4877 | #ifdef CONFIG_SCHED_SMT |
4678 | return first_cpu(cpu_sibling_map[cpu]); | 4878 | return first_cpu(cpu_sibling_map[cpu]); |
@@ -4685,7 +4885,7 @@ static int __devinit cpu_to_phys_group(int cpu) | |||
4685 | 4885 | ||
4686 | static DEFINE_PER_CPU(struct sched_domain, node_domains); | 4886 | static DEFINE_PER_CPU(struct sched_domain, node_domains); |
4687 | static struct sched_group sched_group_nodes[MAX_NUMNODES]; | 4887 | static struct sched_group sched_group_nodes[MAX_NUMNODES]; |
4688 | static int __devinit cpu_to_node_group(int cpu) | 4888 | static int cpu_to_node_group(int cpu) |
4689 | { | 4889 | { |
4690 | return cpu_to_node(cpu); | 4890 | return cpu_to_node(cpu); |
4691 | } | 4891 | } |
@@ -4716,39 +4916,28 @@ static void check_sibling_maps(void) | |||
4716 | #endif | 4916 | #endif |
4717 | 4917 | ||
4718 | /* | 4918 | /* |
4719 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 4919 | * Build sched domains for a given set of cpus and attach the sched domains |
4920 | * to the individual cpus | ||
4720 | */ | 4921 | */ |
4721 | static void __devinit arch_init_sched_domains(void) | 4922 | static void build_sched_domains(const cpumask_t *cpu_map) |
4722 | { | 4923 | { |
4723 | int i; | 4924 | int i; |
4724 | cpumask_t cpu_default_map; | ||
4725 | |||
4726 | #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) | ||
4727 | check_sibling_maps(); | ||
4728 | #endif | ||
4729 | /* | ||
4730 | * Setup mask for cpus without special case scheduling requirements. | ||
4731 | * For now this just excludes isolated cpus, but could be used to | ||
4732 | * exclude other special cases in the future. | ||
4733 | */ | ||
4734 | cpus_complement(cpu_default_map, cpu_isolated_map); | ||
4735 | cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); | ||
4736 | 4925 | ||
4737 | /* | 4926 | /* |
4738 | * Set up domains. Isolated domains just stay on the dummy domain. | 4927 | * Set up domains for cpus specified by the cpu_map. |
4739 | */ | 4928 | */ |
4740 | for_each_cpu_mask(i, cpu_default_map) { | 4929 | for_each_cpu_mask(i, *cpu_map) { |
4741 | int group; | 4930 | int group; |
4742 | struct sched_domain *sd = NULL, *p; | 4931 | struct sched_domain *sd = NULL, *p; |
4743 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); | 4932 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); |
4744 | 4933 | ||
4745 | cpus_and(nodemask, nodemask, cpu_default_map); | 4934 | cpus_and(nodemask, nodemask, *cpu_map); |
4746 | 4935 | ||
4747 | #ifdef CONFIG_NUMA | 4936 | #ifdef CONFIG_NUMA |
4748 | sd = &per_cpu(node_domains, i); | 4937 | sd = &per_cpu(node_domains, i); |
4749 | group = cpu_to_node_group(i); | 4938 | group = cpu_to_node_group(i); |
4750 | *sd = SD_NODE_INIT; | 4939 | *sd = SD_NODE_INIT; |
4751 | sd->span = cpu_default_map; | 4940 | sd->span = *cpu_map; |
4752 | sd->groups = &sched_group_nodes[group]; | 4941 | sd->groups = &sched_group_nodes[group]; |
4753 | #endif | 4942 | #endif |
4754 | 4943 | ||
@@ -4766,7 +4955,7 @@ static void __devinit arch_init_sched_domains(void) | |||
4766 | group = cpu_to_cpu_group(i); | 4955 | group = cpu_to_cpu_group(i); |
4767 | *sd = SD_SIBLING_INIT; | 4956 | *sd = SD_SIBLING_INIT; |
4768 | sd->span = cpu_sibling_map[i]; | 4957 | sd->span = cpu_sibling_map[i]; |
4769 | cpus_and(sd->span, sd->span, cpu_default_map); | 4958 | cpus_and(sd->span, sd->span, *cpu_map); |
4770 | sd->parent = p; | 4959 | sd->parent = p; |
4771 | sd->groups = &sched_group_cpus[group]; | 4960 | sd->groups = &sched_group_cpus[group]; |
4772 | #endif | 4961 | #endif |
@@ -4776,7 +4965,7 @@ static void __devinit arch_init_sched_domains(void) | |||
4776 | /* Set up CPU (sibling) groups */ | 4965 | /* Set up CPU (sibling) groups */ |
4777 | for_each_online_cpu(i) { | 4966 | for_each_online_cpu(i) { |
4778 | cpumask_t this_sibling_map = cpu_sibling_map[i]; | 4967 | cpumask_t this_sibling_map = cpu_sibling_map[i]; |
4779 | cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); | 4968 | cpus_and(this_sibling_map, this_sibling_map, *cpu_map); |
4780 | if (i != first_cpu(this_sibling_map)) | 4969 | if (i != first_cpu(this_sibling_map)) |
4781 | continue; | 4970 | continue; |
4782 | 4971 | ||
@@ -4789,7 +4978,7 @@ static void __devinit arch_init_sched_domains(void) | |||
4789 | for (i = 0; i < MAX_NUMNODES; i++) { | 4978 | for (i = 0; i < MAX_NUMNODES; i++) { |
4790 | cpumask_t nodemask = node_to_cpumask(i); | 4979 | cpumask_t nodemask = node_to_cpumask(i); |
4791 | 4980 | ||
4792 | cpus_and(nodemask, nodemask, cpu_default_map); | 4981 | cpus_and(nodemask, nodemask, *cpu_map); |
4793 | if (cpus_empty(nodemask)) | 4982 | if (cpus_empty(nodemask)) |
4794 | continue; | 4983 | continue; |
4795 | 4984 | ||
@@ -4799,12 +4988,12 @@ static void __devinit arch_init_sched_domains(void) | |||
4799 | 4988 | ||
4800 | #ifdef CONFIG_NUMA | 4989 | #ifdef CONFIG_NUMA |
4801 | /* Set up node groups */ | 4990 | /* Set up node groups */ |
4802 | init_sched_build_groups(sched_group_nodes, cpu_default_map, | 4991 | init_sched_build_groups(sched_group_nodes, *cpu_map, |
4803 | &cpu_to_node_group); | 4992 | &cpu_to_node_group); |
4804 | #endif | 4993 | #endif |
4805 | 4994 | ||
4806 | /* Calculate CPU power for physical packages and nodes */ | 4995 | /* Calculate CPU power for physical packages and nodes */ |
4807 | for_each_cpu_mask(i, cpu_default_map) { | 4996 | for_each_cpu_mask(i, *cpu_map) { |
4808 | int power; | 4997 | int power; |
4809 | struct sched_domain *sd; | 4998 | struct sched_domain *sd; |
4810 | #ifdef CONFIG_SCHED_SMT | 4999 | #ifdef CONFIG_SCHED_SMT |
@@ -4828,7 +5017,7 @@ static void __devinit arch_init_sched_domains(void) | |||
4828 | } | 5017 | } |
4829 | 5018 | ||
4830 | /* Attach the domains */ | 5019 | /* Attach the domains */ |
4831 | for_each_online_cpu(i) { | 5020 | for_each_cpu_mask(i, *cpu_map) { |
4832 | struct sched_domain *sd; | 5021 | struct sched_domain *sd; |
4833 | #ifdef CONFIG_SCHED_SMT | 5022 | #ifdef CONFIG_SCHED_SMT |
4834 | sd = &per_cpu(cpu_domains, i); | 5023 | sd = &per_cpu(cpu_domains, i); |
@@ -4838,41 +5027,85 @@ static void __devinit arch_init_sched_domains(void) | |||
4838 | cpu_attach_domain(sd, i); | 5027 | cpu_attach_domain(sd, i); |
4839 | } | 5028 | } |
4840 | } | 5029 | } |
5030 | /* | ||
5031 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | ||
5032 | */ | ||
5033 | static void arch_init_sched_domains(cpumask_t *cpu_map) | ||
5034 | { | ||
5035 | cpumask_t cpu_default_map; | ||
5036 | |||
5037 | #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) | ||
5038 | check_sibling_maps(); | ||
5039 | #endif | ||
5040 | /* | ||
5041 | * Setup mask for cpus without special case scheduling requirements. | ||
5042 | * For now this just excludes isolated cpus, but could be used to | ||
5043 | * exclude other special cases in the future. | ||
5044 | */ | ||
5045 | cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); | ||
5046 | |||
5047 | build_sched_domains(&cpu_default_map); | ||
5048 | } | ||
4841 | 5049 | ||
4842 | #ifdef CONFIG_HOTPLUG_CPU | 5050 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) |
4843 | static void __devinit arch_destroy_sched_domains(void) | ||
4844 | { | 5051 | { |
4845 | /* Do nothing: everything is statically allocated. */ | 5052 | /* Do nothing: everything is statically allocated. */ |
4846 | } | 5053 | } |
4847 | #endif | ||
4848 | 5054 | ||
4849 | #endif /* ARCH_HAS_SCHED_DOMAIN */ | 5055 | #endif /* ARCH_HAS_SCHED_DOMAIN */ |
4850 | 5056 | ||
4851 | /* | 5057 | /* |
4852 | * Initial dummy domain for early boot and for hotplug cpu. Being static, | 5058 | * Detach sched domains from a group of cpus specified in cpu_map |
4853 | * it is initialized to zero, so all balancing flags are cleared which is | 5059 | * These cpus will now be attached to the NULL domain |
4854 | * what we want. | ||
4855 | */ | 5060 | */ |
4856 | static struct sched_domain sched_domain_dummy; | 5061 | static inline void detach_destroy_domains(const cpumask_t *cpu_map) |
5062 | { | ||
5063 | int i; | ||
5064 | |||
5065 | for_each_cpu_mask(i, *cpu_map) | ||
5066 | cpu_attach_domain(NULL, i); | ||
5067 | synchronize_sched(); | ||
5068 | arch_destroy_sched_domains(cpu_map); | ||
5069 | } | ||
5070 | |||
5071 | /* | ||
5072 | * Partition sched domains as specified by the cpumasks below. | ||
5073 | * This attaches all cpus from the cpumasks to the NULL domain, | ||
5074 | * waits for a RCU quiescent period, recalculates sched | ||
5075 | * domain information and then attaches them back to the | ||
5076 | * correct sched domains | ||
5077 | * Call with hotplug lock held | ||
5078 | */ | ||
5079 | void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | ||
5080 | { | ||
5081 | cpumask_t change_map; | ||
5082 | |||
5083 | cpus_and(*partition1, *partition1, cpu_online_map); | ||
5084 | cpus_and(*partition2, *partition2, cpu_online_map); | ||
5085 | cpus_or(change_map, *partition1, *partition2); | ||
5086 | |||
5087 | /* Detach sched domains from all of the affected cpus */ | ||
5088 | detach_destroy_domains(&change_map); | ||
5089 | if (!cpus_empty(*partition1)) | ||
5090 | build_sched_domains(partition1); | ||
5091 | if (!cpus_empty(*partition2)) | ||
5092 | build_sched_domains(partition2); | ||
5093 | } | ||
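partition_sched_domains() is the entry point for callers that want to split the machine into two disjoint balancing sets: it detaches every affected CPU, lets detach_destroy_domains() wait out the RCU grace period, then rebuilds each non-empty partition. A hypothetical caller (kernel-style sketch, not part of this patch):

/* Stop load balancing between CPUs 0-1 and the rest of the online CPUs. */
static void isolate_low_cpus(void)
{
	cpumask_t low = CPU_MASK_NONE, high = CPU_MASK_NONE;
	int cpu;

	for_each_online_cpu(cpu) {
		if (cpu < 2)
			cpu_set(cpu, low);
		else
			cpu_set(cpu, high);
	}

	lock_cpu_hotplug();
	partition_sched_domains(&low, &high);
	unlock_cpu_hotplug();
}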
4857 | 5094 | ||
4858 | #ifdef CONFIG_HOTPLUG_CPU | 5095 | #ifdef CONFIG_HOTPLUG_CPU |
4859 | /* | 5096 | /* |
4860 | * Force a reinitialization of the sched domains hierarchy. The domains | 5097 | * Force a reinitialization of the sched domains hierarchy. The domains |
4861 | * and groups cannot be updated in place without racing with the balancing | 5098 | * and groups cannot be updated in place without racing with the balancing |
4862 | * code, so we temporarily attach all running cpus to a "dummy" domain | 5099 | * code, so we temporarily attach all running cpus to the NULL domain |
4863 | * which will prevent rebalancing while the sched domains are recalculated. | 5100 | * which will prevent rebalancing while the sched domains are recalculated. |
4864 | */ | 5101 | */ |
4865 | static int update_sched_domains(struct notifier_block *nfb, | 5102 | static int update_sched_domains(struct notifier_block *nfb, |
4866 | unsigned long action, void *hcpu) | 5103 | unsigned long action, void *hcpu) |
4867 | { | 5104 | { |
4868 | int i; | ||
4869 | |||
4870 | switch (action) { | 5105 | switch (action) { |
4871 | case CPU_UP_PREPARE: | 5106 | case CPU_UP_PREPARE: |
4872 | case CPU_DOWN_PREPARE: | 5107 | case CPU_DOWN_PREPARE: |
4873 | for_each_online_cpu(i) | 5108 | detach_destroy_domains(&cpu_online_map); |
4874 | cpu_attach_domain(&sched_domain_dummy, i); | ||
4875 | arch_destroy_sched_domains(); | ||
4876 | return NOTIFY_OK; | 5109 | return NOTIFY_OK; |
4877 | 5110 | ||
4878 | case CPU_UP_CANCELED: | 5111 | case CPU_UP_CANCELED: |
@@ -4888,7 +5121,7 @@ static int update_sched_domains(struct notifier_block *nfb, | |||
4888 | } | 5121 | } |
4889 | 5122 | ||
4890 | /* The hotplug lock is already held by cpu_up/cpu_down */ | 5123 | /* The hotplug lock is already held by cpu_up/cpu_down */ |
4891 | arch_init_sched_domains(); | 5124 | arch_init_sched_domains(&cpu_online_map); |
4892 | 5125 | ||
4893 | return NOTIFY_OK; | 5126 | return NOTIFY_OK; |
4894 | } | 5127 | } |
@@ -4897,7 +5130,7 @@ static int update_sched_domains(struct notifier_block *nfb, | |||
4897 | void __init sched_init_smp(void) | 5130 | void __init sched_init_smp(void) |
4898 | { | 5131 | { |
4899 | lock_cpu_hotplug(); | 5132 | lock_cpu_hotplug(); |
4900 | arch_init_sched_domains(); | 5133 | arch_init_sched_domains(&cpu_online_map); |
4901 | unlock_cpu_hotplug(); | 5134 | unlock_cpu_hotplug(); |
4902 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | 5135 | /* XXX: Theoretical race here - CPU may be hotplugged now */ |
4903 | hotcpu_notifier(update_sched_domains, 0); | 5136 | hotcpu_notifier(update_sched_domains, 0); |
@@ -4927,13 +5160,15 @@ void __init sched_init(void) | |||
4927 | 5160 | ||
4928 | rq = cpu_rq(i); | 5161 | rq = cpu_rq(i); |
4929 | spin_lock_init(&rq->lock); | 5162 | spin_lock_init(&rq->lock); |
5163 | rq->nr_running = 0; | ||
4930 | rq->active = rq->arrays; | 5164 | rq->active = rq->arrays; |
4931 | rq->expired = rq->arrays + 1; | 5165 | rq->expired = rq->arrays + 1; |
4932 | rq->best_expired_prio = MAX_PRIO; | 5166 | rq->best_expired_prio = MAX_PRIO; |
4933 | 5167 | ||
4934 | #ifdef CONFIG_SMP | 5168 | #ifdef CONFIG_SMP |
4935 | rq->sd = &sched_domain_dummy; | 5169 | rq->sd = NULL; |
4936 | rq->cpu_load = 0; | 5170 | for (j = 1; j < 3; j++) |
5171 | rq->cpu_load[j] = 0; | ||
4937 | rq->active_balance = 0; | 5172 | rq->active_balance = 0; |
4938 | rq->push_cpu = 0; | 5173 | rq->push_cpu = 0; |
4939 | rq->migration_thread = NULL; | 5174 | rq->migration_thread = NULL; |