 Documentation/feature-removal-schedule.txt |  15
 Documentation/filesystems/proc.txt         |   3
 Documentation/kernel-parameters.txt        |   2
 arch/x86/kernel/process_64.c               |   5
 fs/proc/array.c                            |  11
 fs/proc/stat.c                             |  19
 include/linux/kernel_stat.h                |   1
 include/linux/sched.h                      |  12
 kernel/cpuset.c                            |  27
 kernel/kgdb.c                              |   2
 kernel/sched.c                             | 180
 kernel/sched_debug.c                       |   4
 kernel/sched_fair.c                        |  65
 kernel/sched_rt.c                          |  61
 14 files changed, 263 insertions(+), 144 deletions(-)
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index bc693fffabe0..f613df8ec7bf 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -6,6 +6,21 @@ be removed from this file.
 
 ---------------------------
 
+What:	USER_SCHED
+When:	2.6.34
+
+Why:	USER_SCHED was implemented as a proof of concept for group scheduling.
+	The effect of USER_SCHED can already be achieved from userspace with
+	the help of libcgroup. The removal of USER_SCHED will also simplify
+	the scheduler code with the removal of one major ifdef. There are also
+	issues USER_SCHED has with USER_NS. A decision was taken not to fix
+	those and instead remove USER_SCHED. Also new group scheduling
+	features will not be implemented for USER_SCHED.
+
+Who:	Dhaval Giani <dhaval@linux.vnet.ibm.com>
+
+---------------------------
+
 What:	PRISM54
 When:	2.6.34
 
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 2c48f945546b..4af0018533f2 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1072,7 +1072,8 @@ second). The meanings of the columns are as follows, from left to right:
 - irq: servicing interrupts
 - softirq: servicing softirqs
 - steal: involuntary wait
-- guest: running a guest
+- guest: running a normal guest
+- guest_nice: running a niced guest
 
 The "intr" line gives counts of interrupts serviced since boot time, for each
 of the possible system interrupts. The first column is the total of all
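With this change the "cpu" lines in /proc/stat carry ten values, guest_nice being appended after guest. As a rough illustration only (not part of the patch; the field order is taken from the column list above), a user-space reader could pick up the new column like this:

#include <stdio.h>

int main(void)
{
	unsigned long long v[10];	/* user ... steal, guest, guest_nice */
	FILE *f = fopen("/proc/stat", "r");

	if (!f)
		return 1;
	/* first line aggregates all cpus; guest_nice is the 10th field */
	if (fscanf(f, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
		   &v[0], &v[1], &v[2], &v[3], &v[4],
		   &v[5], &v[6], &v[7], &v[8], &v[9]) == 10)
		printf("guest=%llu guest_nice=%llu\n", v[8], v[9]);
	fclose(f);
	return 0;
}

On kernels without the extra column the fscanf() above simply matches nine fields and prints nothing.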
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 9107b387e91f..f2a9507b27b2 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2182,6 +2182,8 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	sbni=		[NET] Granch SBNI12 leased line adapter
 
+	sched_debug	[KNL] Enables verbose scheduler debug messages.
+
 	sc1200wdt=	[HW,WDT] SC1200 WDT (watchdog) driver
 			Format: <io>[,<timeout>[,<isapnp>]]
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index eb62cbcaa490..93c501dc2496 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -406,11 +406,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 * This won't pick up thread selector changes, but I guess that is ok.
 	 */
 	savesegment(es, prev->es);
-	if (unlikely(next->es | prev->es))
+	if (next->es | prev->es)
 		loadsegment(es, next->es);
-
 	savesegment(ds, prev->ds);
-	if (unlikely(next->ds | prev->ds))
+	if (next->ds | prev->ds)
 		loadsegment(ds, next->ds);
 
 
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 07f77a7945c3..e209f64ab27b 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -410,6 +410,16 @@ static void task_show_stack_usage(struct seq_file *m, struct task_struct *task)
 }
 #endif /* CONFIG_MMU */
 
+static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
+{
+	seq_printf(m, "Cpus_allowed:\t");
+	seq_cpumask(m, &task->cpus_allowed);
+	seq_printf(m, "\n");
+	seq_printf(m, "Cpus_allowed_list:\t");
+	seq_cpumask_list(m, &task->cpus_allowed);
+	seq_printf(m, "\n");
+}
+
 int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 			struct pid *pid, struct task_struct *task)
 {
@@ -424,6 +434,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 	}
 	task_sig(m, task);
 	task_cap(m, task);
+	task_cpus_allowed(m, task);
 	cpuset_task_status_allowed(m, task);
 #if defined(CONFIG_S390)
 	task_show_regs(m, task);
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 7cc726c6d70a..b9b7aad2003d 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -27,7 +27,7 @@ static int show_stat(struct seq_file *p, void *v)
 	int i, j;
 	unsigned long jif;
 	cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
-	cputime64_t guest;
+	cputime64_t guest, guest_nice;
 	u64 sum = 0;
 	u64 sum_softirq = 0;
 	unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
@@ -36,7 +36,7 @@ static int show_stat(struct seq_file *p, void *v)
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = cputime64_zero;
-	guest = cputime64_zero;
+	guest = guest_nice = cputime64_zero;
 	getboottime(&boottime);
 	jif = boottime.tv_sec;
 
@@ -51,6 +51,8 @@ static int show_stat(struct seq_file *p, void *v)
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
+		guest_nice = cputime64_add(guest_nice,
+			kstat_cpu(i).cpustat.guest_nice);
 		for_each_irq_nr(j) {
 			sum += kstat_irqs_cpu(j, i);
 		}
@@ -65,7 +67,8 @@ static int show_stat(struct seq_file *p, void *v)
 	}
 	sum += arch_irq_stat();
 
-	seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+	seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu "
+		"%llu\n",
 		(unsigned long long)cputime64_to_clock_t(user),
 		(unsigned long long)cputime64_to_clock_t(nice),
 		(unsigned long long)cputime64_to_clock_t(system),
@@ -74,7 +77,8 @@ static int show_stat(struct seq_file *p, void *v)
 		(unsigned long long)cputime64_to_clock_t(irq),
 		(unsigned long long)cputime64_to_clock_t(softirq),
 		(unsigned long long)cputime64_to_clock_t(steal),
-		(unsigned long long)cputime64_to_clock_t(guest));
+		(unsigned long long)cputime64_to_clock_t(guest),
+		(unsigned long long)cputime64_to_clock_t(guest_nice));
 	for_each_online_cpu(i) {
 
 		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
@@ -88,8 +92,10 @@ static int show_stat(struct seq_file *p, void *v)
 		softirq = kstat_cpu(i).cpustat.softirq;
 		steal = kstat_cpu(i).cpustat.steal;
 		guest = kstat_cpu(i).cpustat.guest;
+		guest_nice = kstat_cpu(i).cpustat.guest_nice;
 		seq_printf(p,
-			"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+			"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu "
+			"%llu\n",
 			i,
 			(unsigned long long)cputime64_to_clock_t(user),
 			(unsigned long long)cputime64_to_clock_t(nice),
@@ -99,7 +105,8 @@ static int show_stat(struct seq_file *p, void *v)
 			(unsigned long long)cputime64_to_clock_t(irq),
 			(unsigned long long)cputime64_to_clock_t(softirq),
 			(unsigned long long)cputime64_to_clock_t(steal),
-			(unsigned long long)cputime64_to_clock_t(guest));
+			(unsigned long long)cputime64_to_clock_t(guest),
+			(unsigned long long)cputime64_to_clock_t(guest_nice));
 	}
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 348fa8874b52..c059044bc6dc 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -25,6 +25,7 @@ struct cpu_usage_stat {
 	cputime64_t iowait;
 	cputime64_t steal;
 	cputime64_t guest;
+	cputime64_t guest_nice;
 };
 
 struct kernel_stat {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 75e6e60bf583..78ba664474f3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -145,7 +145,6 @@ extern unsigned long this_cpu_load(void);
 
 
 extern void calc_global_load(void);
-extern u64 cpu_nr_migrations(int cpu);
 
 extern unsigned long get_parent_ip(unsigned long addr);
 
@@ -171,8 +170,6 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 }
 #endif
 
-extern unsigned long long time_sync_thresh;
-
 /*
  * Task state bitmask. NOTE! These bits are also
  * encoded in fs/proc/array.c: get_task_state().
@@ -349,7 +346,6 @@ extern signed long schedule_timeout(signed long timeout);
 extern signed long schedule_timeout_interruptible(signed long timeout);
 extern signed long schedule_timeout_killable(signed long timeout);
 extern signed long schedule_timeout_uninterruptible(signed long timeout);
-asmlinkage void __schedule(void);
 asmlinkage void schedule(void);
 extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner);
 
@@ -1013,9 +1009,13 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
 	return to_cpumask(sd->span);
 }
 
-extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
+extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 				    struct sched_domain_attr *dattr_new);
 
+/* Allocate an array of sched domains, for partition_sched_domains(). */
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
+
 /* Test a flag in parent sched domain */
 static inline int test_sd_parent(struct sched_domain *sd, int flag)
 {
@@ -1033,7 +1033,7 @@ unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu);
 struct sched_domain_attr;
 
 static inline void
-partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
+partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 			struct sched_domain_attr *dattr_new)
 {
 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b5cb469d2545..3cf2183b472d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -537,8 +537,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
  * element of the partition (one sched domain) to be passed to
  * partition_sched_domains().
  */
-/* FIXME: see the FIXME in partition_sched_domains() */
-static int generate_sched_domains(struct cpumask **domains,
+static int generate_sched_domains(cpumask_var_t **domains,
 			struct sched_domain_attr **attributes)
 {
 	LIST_HEAD(q);		/* queue of cpusets to be scanned */
@@ -546,7 +545,7 @@ static int generate_sched_domains(struct cpumask **domains,
 	struct cpuset **csa;	/* array of all cpuset ptrs */
 	int csn;		/* how many cpuset ptrs in csa so far */
 	int i, j, k;		/* indices for partition finding loops */
-	struct cpumask *doms;	/* resulting partition; i.e. sched domains */
+	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
 	struct sched_domain_attr *dattr;  /* attributes for custom domains */
 	int ndoms = 0;		/* number of sched domains in result */
 	int nslot;		/* next empty doms[] struct cpumask slot */
@@ -557,7 +556,8 @@ static int generate_sched_domains(struct cpumask **domains,
 
 	/* Special case for the 99% of systems with one, full, sched domain */
 	if (is_sched_load_balance(&top_cpuset)) {
-		doms = kmalloc(cpumask_size(), GFP_KERNEL);
+		ndoms = 1;
+		doms = alloc_sched_domains(ndoms);
 		if (!doms)
 			goto done;
 
@@ -566,9 +566,8 @@ static int generate_sched_domains(struct cpumask **domains,
 			*dattr = SD_ATTR_INIT;
 			update_domain_attr_tree(dattr, &top_cpuset);
 		}
-		cpumask_copy(doms, top_cpuset.cpus_allowed);
+		cpumask_copy(doms[0], top_cpuset.cpus_allowed);
 
-		ndoms = 1;
 		goto done;
 	}
 
@@ -636,7 +635,7 @@ restart:
 	 * Now we know how many domains to create.
 	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
 	 */
-	doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL);
+	doms = alloc_sched_domains(ndoms);
 	if (!doms)
 		goto done;
 
@@ -656,7 +655,7 @@ restart:
 			continue;
 		}
 
-		dp = doms + nslot;
+		dp = doms[nslot];
 
 		if (nslot == ndoms) {
 			static int warnings = 10;
@@ -718,7 +717,7 @@ done:
 static void do_rebuild_sched_domains(struct work_struct *unused)
 {
 	struct sched_domain_attr *attr;
-	struct cpumask *doms;
+	cpumask_var_t *doms;
 	int ndoms;
 
 	get_online_cpus();
@@ -2052,7 +2051,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
 				unsigned long phase, void *unused_cpu)
 {
 	struct sched_domain_attr *attr;
-	struct cpumask *doms;
+	cpumask_var_t *doms;
 	int ndoms;
 
 	switch (phase) {
@@ -2537,15 +2536,9 @@ const struct file_operations proc_cpuset_operations = {
 };
 #endif /* CONFIG_PROC_PID_CPUSET */
 
-/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */
+/* Display task mems_allowed in /proc/<pid>/status file. */
 void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
 {
-	seq_printf(m, "Cpus_allowed:\t");
-	seq_cpumask(m, &task->cpus_allowed);
-	seq_printf(m, "\n");
-	seq_printf(m, "Cpus_allowed_list:\t");
-	seq_cpumask_list(m, &task->cpus_allowed);
-	seq_printf(m, "\n");
 	seq_printf(m, "Mems_allowed:\t");
 	seq_nodemask(m, &task->mems_allowed);
 	seq_printf(m, "\n");
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 9147a3190c9d..7d7014634022 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -870,7 +870,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
 
 	/*
 	 * All threads that don't have debuggerinfo should be
-	 * in __schedule() sleeping, since all other CPUs
+	 * in schedule() sleeping, since all other CPUs
 	 * are in kgdb_wait, and thus have debuggerinfo.
 	 */
 	if (local_debuggerinfo) {
diff --git a/kernel/sched.c b/kernel/sched.c
index 3c91f110fc62..315ba4059f93 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -535,14 +535,12 @@ struct rq {
 	#define CPU_LOAD_IDX_MAX 5
 	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 #ifdef CONFIG_NO_HZ
-	unsigned long last_tick_seen;
 	unsigned char in_nohz_recently;
 #endif
 	/* capture load from *all* tasks on this cpu: */
 	struct load_weight load;
 	unsigned long nr_load_updates;
 	u64 nr_switches;
-	u64 nr_migrations_in;
 
 	struct cfs_rq cfs;
 	struct rt_rq rt;
@@ -591,6 +589,8 @@ struct rq {
 
 	u64 rt_avg;
 	u64 age_stamp;
+	u64 idle_stamp;
+	u64 avg_idle;
 #endif
 
 	/* calc_load related fields */
@@ -772,7 +772,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 	if (!sched_feat_names[i])
 		return -EINVAL;
 
-	filp->f_pos += cnt;
+	*ppos += cnt;
 
 	return cnt;
 }
@@ -2079,7 +2079,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 #endif
 	if (old_cpu != new_cpu) {
 		p->se.nr_migrations++;
-		new_rq->nr_migrations_in++;
 #ifdef CONFIG_SCHEDSTATS
 		if (task_hot(p, old_rq->clock, NULL))
 			schedstat_inc(p, se.nr_forced2_migrations);
@@ -2443,6 +2442,17 @@ out_running:
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
 		p->sched_class->task_wake_up(rq, p);
+
+	if (unlikely(rq->idle_stamp)) {
+		u64 delta = rq->clock - rq->idle_stamp;
+		u64 max = 2*sysctl_sched_migration_cost;
+
+		if (delta > max)
+			rq->avg_idle = max;
+		else
+			update_avg(&rq->avg_idle, delta);
+		rq->idle_stamp = 0;
+	}
 #endif
 out:
 	task_rq_unlock(rq, &flags);
@@ -2855,14 +2865,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 */
 	arch_start_context_switch(prev);
 
-	if (unlikely(!mm)) {
+	if (likely(!mm)) {
 		next->active_mm = oldmm;
 		atomic_inc(&oldmm->mm_count);
 		enter_lazy_tlb(oldmm, next);
 	} else
 		switch_mm(oldmm, mm, next);
 
-	if (unlikely(!prev->mm)) {
+	if (likely(!prev->mm)) {
 		prev->active_mm = NULL;
 		rq->prev_mm = oldmm;
 	}
@@ -3025,15 +3035,6 @@ static void calc_load_account_active(struct rq *this_rq)
 }
 
 /*
- * Externally visible per-cpu scheduler statistics:
- * cpu_nr_migrations(cpu) - number of migrations into that cpu
- */
-u64 cpu_nr_migrations(int cpu)
-{
-	return cpu_rq(cpu)->nr_migrations_in;
-}
-
-/*
  * Update rq->cpu_load[] statistics. This function is usually called every
  * scheduler tick (TICK_NSEC).
  */
@@ -4133,7 +4134,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long flags;
 	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
 
-	cpumask_setall(cpus);
+	cpumask_copy(cpus, cpu_online_mask);
 
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
@@ -4296,7 +4297,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
 	int all_pinned = 0;
 	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
 
-	cpumask_setall(cpus);
+	cpumask_copy(cpus, cpu_online_mask);
 
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
@@ -4436,6 +4437,11 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 	int pulled_task = 0;
 	unsigned long next_balance = jiffies + HZ;
 
+	this_rq->idle_stamp = this_rq->clock;
+
+	if (this_rq->avg_idle < sysctl_sched_migration_cost)
+		return;
+
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
 
@@ -4450,8 +4456,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 		interval = msecs_to_jiffies(sd->balance_interval);
 		if (time_after(next_balance, sd->last_balance + interval))
 			next_balance = sd->last_balance + interval;
-		if (pulled_task)
+		if (pulled_task) {
+			this_rq->idle_stamp = 0;
 			break;
+		}
 	}
 	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
 		/*
@@ -5053,8 +5061,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
 	p->gtime = cputime_add(p->gtime, cputime);
 
 	/* Add guest time to cpustat. */
-	cpustat->user = cputime64_add(cpustat->user, tmp);
-	cpustat->guest = cputime64_add(cpustat->guest, tmp);
+	if (TASK_NICE(p) > 0) {
+		cpustat->nice = cputime64_add(cpustat->nice, tmp);
+		cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
+	} else {
+		cpustat->user = cputime64_add(cpustat->user, tmp);
+		cpustat->guest = cputime64_add(cpustat->guest, tmp);
+	}
 }
 
 /*
@@ -5179,41 +5192,45 @@ cputime_t task_stime(struct task_struct *p)
 	return p->stime;
 }
 #else
+
+#ifndef nsecs_to_cputime
+# define nsecs_to_cputime(__nsecs) \
+	msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC))
+#endif
+
 cputime_t task_utime(struct task_struct *p)
 {
-	clock_t utime = cputime_to_clock_t(p->utime),
-		total = utime + cputime_to_clock_t(p->stime);
+	cputime_t utime = p->utime, total = utime + p->stime;
 	u64 temp;
 
 	/*
 	 * Use CFS's precise accounting:
 	 */
-	temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
+	temp = (u64)nsecs_to_cputime(p->se.sum_exec_runtime);
 
 	if (total) {
 		temp *= utime;
 		do_div(temp, total);
 	}
-	utime = (clock_t)temp;
+	utime = (cputime_t)temp;
 
-	p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
+	p->prev_utime = max(p->prev_utime, utime);
 	return p->prev_utime;
 }
 
 cputime_t task_stime(struct task_struct *p)
 {
-	clock_t stime;
+	cputime_t stime;
 
 	/*
 	 * Use CFS's precise accounting. (we subtract utime from
 	 * the total, to make sure the total observed by userspace
 	 * grows monotonically - apps rely on that):
 	 */
-	stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
-			cputime_to_clock_t(task_utime(p));
+	stime = nsecs_to_cputime(p->se.sum_exec_runtime) - task_utime(p);
 
 	if (stime >= 0)
-		p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
+		p->prev_stime = max(p->prev_stime, stime);
 
 	return p->prev_stime;
 }
@@ -6182,22 +6199,14 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
 	BUG_ON(p->se.on_rq);
 
 	p->policy = policy;
-	switch (p->policy) {
-	case SCHED_NORMAL:
-	case SCHED_BATCH:
-	case SCHED_IDLE:
-		p->sched_class = &fair_sched_class;
-		break;
-	case SCHED_FIFO:
-	case SCHED_RR:
-		p->sched_class = &rt_sched_class;
-		break;
-	}
-
 	p->rt_priority = prio;
 	p->normal_prio = normal_prio(p);
 	/* we are holding p->pi_lock already */
 	p->prio = rt_mutex_getprio(p);
+	if (rt_prio(p->prio))
+		p->sched_class = &rt_sched_class;
+	else
+		p->sched_class = &fair_sched_class;
 	set_load_weight(p);
 }
 
@@ -6942,7 +6951,7 @@ void show_state_filter(unsigned long state_filter)
 	/*
 	 * Only show locks if all tasks are dumped:
 	 */
-	if (state_filter == -1)
+	if (!state_filter)
 		debug_show_all_locks();
 }
 
@@ -7747,6 +7756,16 @@ early_initcall(migration_init);
 
 #ifdef CONFIG_SCHED_DEBUG
 
+static __read_mostly int sched_domain_debug_enabled;
+
+static int __init sched_domain_debug_setup(char *str)
+{
+	sched_domain_debug_enabled = 1;
+
+	return 0;
+}
+early_param("sched_debug", sched_domain_debug_setup);
+
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				  struct cpumask *groupmask)
 {
@@ -7833,6 +7852,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 	cpumask_var_t groupmask;
 	int level = 0;
 
+	if (!sched_domain_debug_enabled)
+		return;
+
 	if (!sd) {
 		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
 		return;
@@ -8890,7 +8912,7 @@ static int build_sched_domains(const struct cpumask *cpu_map)
 	return __build_sched_domains(cpu_map, NULL);
 }
 
-static struct cpumask *doms_cur;	/* current sched domains */
+static cpumask_var_t *doms_cur;	/* current sched domains */
 static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
 static struct sched_domain_attr *dattr_cur;
 				/* attribues of custom domains in 'doms_cur' */
@@ -8912,6 +8934,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void)
 	return 0;
 }
 
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
+{
+	int i;
+	cpumask_var_t *doms;
+
+	doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
+	if (!doms)
+		return NULL;
+	for (i = 0; i < ndoms; i++) {
+		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
+			free_sched_domains(doms, i);
+			return NULL;
+		}
+	}
+	return doms;
+}
+
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
+{
+	unsigned int i;
+	for (i = 0; i < ndoms; i++)
+		free_cpumask_var(doms[i]);
+	kfree(doms);
+}
+
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  * For now this just excludes isolated cpus, but could be used to
@@ -8923,12 +8970,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
 
 	arch_update_cpu_topology();
 	ndoms_cur = 1;
-	doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
+	doms_cur = alloc_sched_domains(ndoms_cur);
 	if (!doms_cur)
-		doms_cur = fallback_doms;
-	cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
+		doms_cur = &fallback_doms;
+	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
 	dattr_cur = NULL;
-	err = build_sched_domains(doms_cur);
+	err = build_sched_domains(doms_cur[0]);
 	register_sched_domain_sysctl();
 
 	return err;
@@ -8978,19 +9025,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  * doms_new[] to the current sched domain partitioning, doms_cur[].
  * It destroys each deleted domain and builds each new domain.
  *
- * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
+ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
  * The masks don't intersect (don't overlap.) We should setup one
  * sched domain for each mask. CPUs not in any of the cpumasks will
  * not be load balanced. If the same cpumask appears both in the
  * current 'doms_cur' domains and in the new 'doms_new', we can leave
  * it as it is.
  *
- * The passed in 'doms_new' should be kmalloc'd. This routine takes
- * ownership of it and will kfree it when done with it. If the caller
- * failed the kmalloc call, then it can pass in doms_new == NULL &&
- * ndoms_new == 1, and partition_sched_domains() will fallback to
- * the single partition 'fallback_doms', it also forces the domains
- * to be rebuilt.
+ * The passed in 'doms_new' should be allocated using
+ * alloc_sched_domains.  This routine takes ownership of it and will
+ * free_sched_domains it when done with it. If the caller failed the
+ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
+ * and partition_sched_domains() will fallback to the single partition
+ * 'fallback_doms', it also forces the domains to be rebuilt.
  *
  * If doms_new == NULL it will be replaced with cpu_online_mask.
  * ndoms_new == 0 is a special case for destroying existing domains,
@@ -8998,8 +9045,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  *
  * Call with hotplug lock held
  */
-/* FIXME: Change to struct cpumask *doms_new[] */
-void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 			     struct sched_domain_attr *dattr_new)
 {
 	int i, j, n;
@@ -9018,40 +9064,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
 	/* Destroy deleted domains */
 	for (i = 0; i < ndoms_cur; i++) {
 		for (j = 0; j < n && !new_topology; j++) {
-			if (cpumask_equal(&doms_cur[i], &doms_new[j])
+			if (cpumask_equal(doms_cur[i], doms_new[j])
 			    && dattrs_equal(dattr_cur, i, dattr_new, j))
 				goto match1;
 		}
 		/* no match - a current sched domain not in new doms_new[] */
-		detach_destroy_domains(doms_cur + i);
+		detach_destroy_domains(doms_cur[i]);
 match1:
 		;
 	}
 
 	if (doms_new == NULL) {
 		ndoms_cur = 0;
-		doms_new = fallback_doms;
-		cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
+		doms_new = &fallback_doms;
+		cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map);
 		WARN_ON_ONCE(dattr_new);
 	}
 
 	/* Build new domains */
 	for (i = 0; i < ndoms_new; i++) {
 		for (j = 0; j < ndoms_cur && !new_topology; j++) {
-			if (cpumask_equal(&doms_new[i], &doms_cur[j])
+			if (cpumask_equal(doms_new[i], doms_cur[j])
 			    && dattrs_equal(dattr_new, i, dattr_cur, j))
 				goto match2;
 		}
 		/* no match - add a new doms_new */
-		__build_sched_domains(doms_new + i,
+		__build_sched_domains(doms_new[i],
 				       dattr_new ? dattr_new + i : NULL);
 match2:
 		;
 	}
 
 	/* Remember the new sched domains */
-	if (doms_cur != fallback_doms)
-		kfree(doms_cur);
+	if (doms_cur != &fallback_doms)
+		free_sched_domains(doms_cur, ndoms_cur);
 	kfree(dattr_cur);	/* kfree(NULL) is safe */
 	doms_cur = doms_new;
 	dattr_cur = dattr_new;
@@ -9373,10 +9419,6 @@ void __init sched_init(void)
 #ifdef CONFIG_CPUMASK_OFFSTACK
 	alloc_size += num_possible_cpus() * cpumask_size();
 #endif
-	/*
-	 * As sched_init() is called before page_alloc is setup,
-	 * we use alloc_bootmem().
-	 */
 	if (alloc_size) {
 		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
 
@@ -9531,6 +9573,8 @@ void __init sched_init(void)
 		rq->cpu = i;
 		rq->online = 0;
 		rq->migration_thread = NULL;
+		rq->idle_stamp = 0;
+		rq->avg_idle = 2*sysctl_sched_migration_cost;
 		INIT_LIST_HEAD(&rq->migration_queue);
 		rq_attach_root(rq, &def_root_domain);
 #endif
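For reference, the alloc_sched_domains()/free_sched_domains() helpers added above are meant to be paired with the new partition_sched_domains() signature the way kernel/cpuset.c now does it. A minimal sketch of that ownership contract follows (illustration only, not part of the patch; example_repartition() is a made-up caller, and the hotplug-lock requirement comes from the comment block above):

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/sched.h>

static void example_repartition(void)
{
	int ndoms = 1;
	cpumask_var_t *doms = alloc_sched_domains(ndoms);

	get_online_cpus();
	if (!doms) {
		/* doms_new == NULL && ndoms_new == 1 falls back to fallback_doms */
		partition_sched_domains(1, NULL, NULL);
	} else {
		cpumask_copy(doms[0], cpu_online_mask);
		/* the scheduler takes ownership and will free_sched_domains() it */
		partition_sched_domains(ndoms, doms, NULL);
	}
	put_online_cpus();
}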
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index efb84409bc43..6988cf08f705 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -285,12 +285,16 @@ static void print_cpu(struct seq_file *m, int cpu)
 
 #ifdef CONFIG_SCHEDSTATS
 #define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
+#define P64(n) SEQ_printf(m, "  .%-30s: %Ld\n", #n, rq->n);
 
 	P(yld_count);
 
 	P(sched_switch);
 	P(sched_count);
 	P(sched_goidle);
+#ifdef CONFIG_SMP
+	P64(avg_idle);
+#endif
 
 	P(ttwu_count);
 	P(ttwu_local);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 37087a7fac22..f61837ad336d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1345,6 +1345,37 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 }
 
 /*
+ * Try and locate an idle CPU in the sched_domain.
+ */
+static int
+select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
+{
+	int cpu = smp_processor_id();
+	int prev_cpu = task_cpu(p);
+	int i;
+
+	/*
+	 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
+	 * test in select_task_rq_fair) and the prev_cpu is idle then that's
+	 * always a better target than the current cpu.
+	 */
+	if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
+		return prev_cpu;
+
+	/*
+	 * Otherwise, iterate the domain and find an elegible idle cpu.
+	 */
+	for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+		if (!cpu_rq(i)->cfs.nr_running) {
+			target = i;
+			break;
+		}
+	}
+
+	return target;
+}
+
+/*
  * sched_balance_self: balance the current task (running on cpu) in domains
  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
  * SD_BALANCE_EXEC.
@@ -1398,11 +1429,35 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 			want_sd = 0;
 		}
 
-		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
-		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+		/*
+		 * While iterating the domains looking for a spanning
+		 * WAKE_AFFINE domain, adjust the affine target to any idle cpu
+		 * in cache sharing domains along the way.
+		 */
+		if (want_affine) {
+			int target = -1;
 
-			affine_sd = tmp;
-			want_affine = 0;
+			/*
+			 * If both cpu and prev_cpu are part of this domain,
+			 * cpu is a valid SD_WAKE_AFFINE target.
+			 */
+			if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
+				target = cpu;
+
+			/*
+			 * If there's an idle sibling in this domain, make that
+			 * the wake_affine target instead of the current cpu.
+			 */
+			if (tmp->flags & SD_PREFER_SIBLING)
+				target = select_idle_sibling(p, tmp, target);
+
+			if (target >= 0) {
+				if (tmp->flags & SD_WAKE_AFFINE) {
+					affine_sd = tmp;
+					want_affine = 0;
+				}
+				cpu = target;
+			}
 		}
 
 	if (!want_sd && !want_affine)
@@ -1679,7 +1734,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 	struct cfs_rq *cfs_rq = &rq->cfs;
 	struct sched_entity *se;
 
-	if (unlikely(!cfs_rq->nr_running))
+	if (!cfs_rq->nr_running)
 		return NULL;
 
 	do {
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index a4d790cddb19..5c5fef378415 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1153,29 +1153,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 
 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
 
-static inline int pick_optimal_cpu(int this_cpu,
-				   const struct cpumask *mask)
-{
-	int first;
-
-	/* "this_cpu" is cheaper to preempt than a remote processor */
-	if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
-		return this_cpu;
-
-	first = cpumask_first(mask);
-	if (first < nr_cpu_ids)
-		return first;
-
-	return -1;
-}
-
 static int find_lowest_rq(struct task_struct *task)
 {
 	struct sched_domain *sd;
 	struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
 	int this_cpu = smp_processor_id();
 	int cpu = task_cpu(task);
-	cpumask_var_t domain_mask;
 
 	if (task->rt.nr_cpus_allowed == 1)
 		return -1; /* No other targets possible */
@@ -1198,28 +1181,26 @@ static int find_lowest_rq(struct task_struct *task)
 	 * Otherwise, we consult the sched_domains span maps to figure
 	 * out which cpu is logically closest to our hot cache data.
 	 */
-	if (this_cpu == cpu)
-		this_cpu = -1;	/* Skip this_cpu opt if the same */
-
-	if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
-		for_each_domain(cpu, sd) {
-			if (sd->flags & SD_WAKE_AFFINE) {
-				int best_cpu;
-
-				cpumask_and(domain_mask,
-					    sched_domain_span(sd),
-					    lowest_mask);
-
-				best_cpu = pick_optimal_cpu(this_cpu,
-							    domain_mask);
-
-				if (best_cpu != -1) {
-					free_cpumask_var(domain_mask);
-					return best_cpu;
-				}
-			}
+	if (!cpumask_test_cpu(this_cpu, lowest_mask))
+		this_cpu = -1; /* Skip this_cpu opt if not among lowest */
+
+	for_each_domain(cpu, sd) {
+		if (sd->flags & SD_WAKE_AFFINE) {
+			int best_cpu;
+
+			/*
+			 * "this_cpu" is cheaper to preempt than a
+			 * remote processor.
+			 */
+			if (this_cpu != -1 &&
+			    cpumask_test_cpu(this_cpu, sched_domain_span(sd)))
+				return this_cpu;
+
+			best_cpu = cpumask_first_and(lowest_mask,
+						     sched_domain_span(sd));
+			if (best_cpu < nr_cpu_ids)
+				return best_cpu;
 		}
-		free_cpumask_var(domain_mask);
 	}
 
 	/*
@@ -1227,7 +1208,13 @@ static int find_lowest_rq(struct task_struct *task)
 	 * just give the caller *something* to work with from the compatible
 	 * locations.
 	 */
-	return pick_optimal_cpu(this_cpu, lowest_mask);
+	if (this_cpu != -1)
+		return this_cpu;
+
+	cpu = cpumask_any(lowest_mask);
+	if (cpu < nr_cpu_ids)
+		return cpu;
+	return -1;
 }
 
 /* Will lock the rq it finds */