author     Linus Torvalds <torvalds@linux-foundation.org>  2009-09-18 00:00:02 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2009-09-18 00:00:02 -0400
commit     dcbf77b9e86e1726f5fbd01bb98820dac06d456e (patch)
tree       2f0b728ce70c03e1d0e3461e8a3c3d1fbe68fb90
parent     ca043a66ae48c74fa628ec92178f7a54f5b9a106 (diff)
parent     29cd8bae396583a2ee9a3340db8c5102acf9f6fd (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (37 commits)
  sched: Fix SD_POWERSAVING_BALANCE|SD_PREFER_LOCAL vs SD_WAKE_AFFINE
  sched: Stop buddies from hogging the system
  sched: Add new wakeup preemption mode: WAKEUP_RUNNING
  sched: Fix TASK_WAKING & loadaverage breakage
  sched: Disable wakeup balancing
  sched: Rename flags to wake_flags
  sched: Clean up the load_idx selection in select_task_rq_fair
  sched: Optimize cgroup vs wakeup a bit
  sched: x86: Name old_perf in a unique way
  sched: Implement a gentler fair-sleepers feature
  sched: Add SD_PREFER_LOCAL
  sched: Add a few SYNC hint knobs to play with
  sched: Fix sync wakeups again
  sched: Add WF_FORK
  sched: Rename sync arguments
  sched: Rename select_task_rq() argument
  sched: Feature to disable APERF/MPERF cpu_power
  x86: sched: Provide arch implementations using aperf/mperf
  x86: Add generic aperf/mperf code
  x86: Move APERF/MPERF into a X86_FEATURE
  ...

Fix up trivial conflict in arch/x86/include/asm/processor.h due to nearby
addition of amd_get_nb_id() declaration from the EDAC merge.
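For orientation (not part of the commit itself): several of the commits above replace the old boolean "sync" wakeup argument with a small bitmask of wake flags. Below is a minimal userspace sketch of that convention, reusing the WF_SYNC/WF_FORK values this merge adds to include/linux/sched.h; the wake_task() helper and the program around it are purely illustrative, not kernel code.

#include <stdio.h>

/* Wake flags as introduced in include/linux/sched.h by this merge. */
#define WF_SYNC 0x01	/* waker goes to sleep after wakeup */
#define WF_FORK 0x02	/* child wakeup after fork */

/* Illustrative stand-in for a wakeup path that used to take "int sync". */
static void wake_task(const char *name, int wake_flags)
{
	/* A sync wakeup hints that the waker is about to sleep, so the
	 * wakee may be placed near the waker without preempting it. */
	if (wake_flags & WF_SYNC)
		printf("%s: sync wakeup (waker about to sleep)\n", name);

	/* A fork wakeup lets the preemption check treat the brand-new
	 * child differently from an ordinary wakeup, as in the
	 * check_preempt_curr(rq, p, WF_FORK) call in this diff. */
	if (wake_flags & WF_FORK)
		printf("%s: first wakeup of a forked child\n", name);

	if (!wake_flags)
		printf("%s: plain wakeup\n", name);
}

int main(void)
{
	wake_task("worker", WF_SYNC);
	wake_task("child", WF_FORK);
	wake_task("irq-thread", 0);
	return 0;
}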
-rw-r--r--  arch/ia64/include/asm/topology.h            |  17
-rw-r--r--  arch/mips/include/asm/mach-ip27/topology.h  |   1
-rw-r--r--  arch/powerpc/include/asm/topology.h         |   9
-rw-r--r--  arch/sh/include/asm/topology.h              |  10
-rw-r--r--  arch/sparc/include/asm/topology_64.h        |   7
-rw-r--r--  arch/x86/include/asm/cpufeature.h           |   1
-rw-r--r--  arch/x86/include/asm/processor.h            |  30
-rw-r--r--  arch/x86/include/asm/topology.h             |  14
-rw-r--r--  arch/x86/kernel/cpu/Makefile                |   2
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c  |  88
-rw-r--r--  arch/x86/kernel/cpu/intel.c                 |   6
-rw-r--r--  arch/x86/kernel/cpu/sched.c                 |  55
-rw-r--r--  include/linux/sched.h                       |  23
-rw-r--r--  include/linux/topology.h                    |  32
-rw-r--r--  include/linux/wait.h                        |   4
-rw-r--r--  kernel/sched.c                              | 444
-rw-r--r--  kernel/sched_debug.c                        |   1
-rw-r--r--  kernel/sched_fair.c                         | 414
-rw-r--r--  kernel/sched_features.h                     | 122
-rw-r--r--  kernel/sched_idletask.c                     |   4
-rw-r--r--  kernel/sched_rt.c                           |   7
21 files changed, 688 insertions, 603 deletions
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 7b4c8c70b2d1..d0141fbf51d0 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -61,12 +61,13 @@ void build_cpu_to_node_map(void);
61 .cache_nice_tries = 2, \ 61 .cache_nice_tries = 2, \
62 .busy_idx = 2, \ 62 .busy_idx = 2, \
63 .idle_idx = 1, \ 63 .idle_idx = 1, \
64 .newidle_idx = 2, \ 64 .newidle_idx = 0, \
65 .wake_idx = 1, \ 65 .wake_idx = 0, \
66 .forkexec_idx = 1, \ 66 .forkexec_idx = 0, \
67 .flags = SD_LOAD_BALANCE \ 67 .flags = SD_LOAD_BALANCE \
68 | SD_BALANCE_NEWIDLE \ 68 | SD_BALANCE_NEWIDLE \
69 | SD_BALANCE_EXEC \ 69 | SD_BALANCE_EXEC \
70 | SD_BALANCE_FORK \
70 | SD_WAKE_AFFINE, \ 71 | SD_WAKE_AFFINE, \
71 .last_balance = jiffies, \ 72 .last_balance = jiffies, \
72 .balance_interval = 1, \ 73 .balance_interval = 1, \
@@ -85,14 +86,14 @@ void build_cpu_to_node_map(void);
85 .cache_nice_tries = 2, \ 86 .cache_nice_tries = 2, \
86 .busy_idx = 3, \ 87 .busy_idx = 3, \
87 .idle_idx = 2, \ 88 .idle_idx = 2, \
88 .newidle_idx = 2, \ 89 .newidle_idx = 0, \
89 .wake_idx = 1, \ 90 .wake_idx = 0, \
90 .forkexec_idx = 1, \ 91 .forkexec_idx = 0, \
91 .flags = SD_LOAD_BALANCE \ 92 .flags = SD_LOAD_BALANCE \
93 | SD_BALANCE_NEWIDLE \
92 | SD_BALANCE_EXEC \ 94 | SD_BALANCE_EXEC \
93 | SD_BALANCE_FORK \ 95 | SD_BALANCE_FORK \
94 | SD_SERIALIZE \ 96 | SD_SERIALIZE, \
95 | SD_WAKE_BALANCE, \
96 .last_balance = jiffies, \ 97 .last_balance = jiffies, \
97 .balance_interval = 64, \ 98 .balance_interval = 64, \
98 .nr_balance_failed = 0, \ 99 .nr_balance_failed = 0, \
diff --git a/arch/mips/include/asm/mach-ip27/topology.h b/arch/mips/include/asm/mach-ip27/topology.h
index 07547231e078..230591707005 100644
--- a/arch/mips/include/asm/mach-ip27/topology.h
+++ b/arch/mips/include/asm/mach-ip27/topology.h
@@ -48,7 +48,6 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES];
48 .cache_nice_tries = 1, \ 48 .cache_nice_tries = 1, \
49 .flags = SD_LOAD_BALANCE \ 49 .flags = SD_LOAD_BALANCE \
50 | SD_BALANCE_EXEC \ 50 | SD_BALANCE_EXEC \
51 | SD_WAKE_BALANCE, \
52 .last_balance = jiffies, \ 51 .last_balance = jiffies, \
53 .balance_interval = 1, \ 52 .balance_interval = 1, \
54 .nr_balance_failed = 0, \ 53 .nr_balance_failed = 0, \
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 054a16d68082..394edcbcce71 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -57,14 +57,13 @@ static inline int pcibus_to_node(struct pci_bus *bus)
57 .cache_nice_tries = 1, \ 57 .cache_nice_tries = 1, \
58 .busy_idx = 3, \ 58 .busy_idx = 3, \
59 .idle_idx = 1, \ 59 .idle_idx = 1, \
60 .newidle_idx = 2, \ 60 .newidle_idx = 0, \
61 .wake_idx = 1, \ 61 .wake_idx = 0, \
62 .flags = SD_LOAD_BALANCE \ 62 .flags = SD_LOAD_BALANCE \
63 | SD_BALANCE_EXEC \ 63 | SD_BALANCE_EXEC \
64 | SD_BALANCE_FORK \
64 | SD_BALANCE_NEWIDLE \ 65 | SD_BALANCE_NEWIDLE \
65 | SD_WAKE_IDLE \ 66 | SD_SERIALIZE, \
66 | SD_SERIALIZE \
67 | SD_WAKE_BALANCE, \
68 .last_balance = jiffies, \ 67 .last_balance = jiffies, \
69 .balance_interval = 1, \ 68 .balance_interval = 1, \
70 .nr_balance_failed = 0, \ 69 .nr_balance_failed = 0, \
diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h
index b69ee850906d..f8c40cc65054 100644
--- a/arch/sh/include/asm/topology.h
+++ b/arch/sh/include/asm/topology.h
@@ -15,14 +15,14 @@
15 .cache_nice_tries = 2, \ 15 .cache_nice_tries = 2, \
16 .busy_idx = 3, \ 16 .busy_idx = 3, \
17 .idle_idx = 2, \ 17 .idle_idx = 2, \
18 .newidle_idx = 2, \ 18 .newidle_idx = 0, \
19 .wake_idx = 1, \ 19 .wake_idx = 0, \
20 .forkexec_idx = 1, \ 20 .forkexec_idx = 0, \
21 .flags = SD_LOAD_BALANCE \ 21 .flags = SD_LOAD_BALANCE \
22 | SD_BALANCE_FORK \ 22 | SD_BALANCE_FORK \
23 | SD_BALANCE_EXEC \ 23 | SD_BALANCE_EXEC \
24 | SD_SERIALIZE \ 24 | SD_BALANCE_NEWIDLE \
25 | SD_WAKE_BALANCE, \ 25 | SD_SERIALIZE, \
26 .last_balance = jiffies, \ 26 .last_balance = jiffies, \
27 .balance_interval = 1, \ 27 .balance_interval = 1, \
28 .nr_balance_failed = 0, \ 28 .nr_balance_failed = 0, \
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index e5ea8d332421..26cd25c08399 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -52,13 +52,12 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
52 .busy_idx = 3, \ 52 .busy_idx = 3, \
53 .idle_idx = 2, \ 53 .idle_idx = 2, \
54 .newidle_idx = 0, \ 54 .newidle_idx = 0, \
55 .wake_idx = 1, \ 55 .wake_idx = 0, \
56 .forkexec_idx = 1, \ 56 .forkexec_idx = 0, \
57 .flags = SD_LOAD_BALANCE \ 57 .flags = SD_LOAD_BALANCE \
58 | SD_BALANCE_FORK \ 58 | SD_BALANCE_FORK \
59 | SD_BALANCE_EXEC \ 59 | SD_BALANCE_EXEC \
60 | SD_SERIALIZE \ 60 | SD_SERIALIZE, \
61 | SD_WAKE_BALANCE, \
62 .last_balance = jiffies, \ 61 .last_balance = jiffies, \
63 .balance_interval = 1, \ 62 .balance_interval = 1, \
64} 63}
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 847fee6493a2..9cfc88b97742 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -96,6 +96,7 @@
96#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */ 96#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
97#define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */ 97#define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */
98#define X86_FEATURE_AMD_DCM (3*32+27) /* multi-node processor */ 98#define X86_FEATURE_AMD_DCM (3*32+27) /* multi-node processor */
99#define X86_FEATURE_APERFMPERF (3*32+28) /* APERFMPERF */
99 100
100/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ 101/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
101#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ 102#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 42a3f936dadc..c3429e8b2424 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -27,6 +27,7 @@ struct mm_struct;
27#include <linux/cpumask.h> 27#include <linux/cpumask.h>
28#include <linux/cache.h> 28#include <linux/cache.h>
29#include <linux/threads.h> 29#include <linux/threads.h>
30#include <linux/math64.h>
30#include <linux/init.h> 31#include <linux/init.h>
31 32
32/* 33/*
@@ -1022,4 +1023,33 @@ extern int set_tsc_mode(unsigned int val);
1022 1023
1023extern int amd_get_nb_id(int cpu); 1024extern int amd_get_nb_id(int cpu);
1024 1025
1026struct aperfmperf {
1027 u64 aperf, mperf;
1028};
1029
1030static inline void get_aperfmperf(struct aperfmperf *am)
1031{
1032 WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF));
1033
1034 rdmsrl(MSR_IA32_APERF, am->aperf);
1035 rdmsrl(MSR_IA32_MPERF, am->mperf);
1036}
1037
1038#define APERFMPERF_SHIFT 10
1039
1040static inline
1041unsigned long calc_aperfmperf_ratio(struct aperfmperf *old,
1042 struct aperfmperf *new)
1043{
1044 u64 aperf = new->aperf - old->aperf;
1045 u64 mperf = new->mperf - old->mperf;
1046 unsigned long ratio = aperf;
1047
1048 mperf >>= APERFMPERF_SHIFT;
1049 if (mperf)
1050 ratio = div64_u64(aperf, mperf);
1051
1052 return ratio;
1053}
1054
1025#endif /* _ASM_X86_PROCESSOR_H */ 1055#endif /* _ASM_X86_PROCESSOR_H */
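A standalone sketch (not from the patch) of the fixed-point arithmetic behind the new calc_aperfmperf_ratio() helper above: the ratio of APERF to MPERF deltas is kept as an integer scaled by 2^APERFMPERF_SHIFT, and a consumer such as the reworked get_measured_perf() later in this diff multiplies a base frequency by it and shifts back down. The counter deltas and base frequency below are made-up example values.

#include <stdint.h>
#include <stdio.h>

#define APERFMPERF_SHIFT 10

/*
 * Same math as calc_aperfmperf_ratio(): the divisor is pre-shifted so the
 * 64-bit divide cannot overflow; the kernel's div64_u64() becomes a plain
 * divide in userspace.
 */
static unsigned long aperfmperf_ratio(uint64_t aperf_delta, uint64_t mperf_delta)
{
	unsigned long ratio = aperf_delta;

	mperf_delta >>= APERFMPERF_SHIFT;
	if (mperf_delta)
		ratio = aperf_delta / mperf_delta;

	return ratio;
}

int main(void)
{
	/* Hypothetical deltas: APERF advanced 1.5x as fast as MPERF,
	 * e.g. a core that spent the sampling interval in turbo. */
	uint64_t aperf_delta = 1500000000ULL;
	uint64_t mperf_delta = 1000000000ULL;
	unsigned int max_freq_khz = 2000000;	/* assumed 2.0 GHz base */

	unsigned long ratio = aperfmperf_ratio(aperf_delta, mperf_delta);

	/* Mirrors get_measured_perf(): freq = max_freq * ratio >> SHIFT */
	unsigned int measured_khz =
		(unsigned int)(((uint64_t)max_freq_khz * ratio) >> APERFMPERF_SHIFT);

	printf("ratio = %lu/1024, measured frequency ~ %u kHz\n",
	       ratio, measured_khz);
	return 0;
}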
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 26d06e052a18..6f0695d744bf 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -116,15 +116,11 @@ extern unsigned long node_remap_size[];
116 116
117# define SD_CACHE_NICE_TRIES 1 117# define SD_CACHE_NICE_TRIES 1
118# define SD_IDLE_IDX 1 118# define SD_IDLE_IDX 1
119# define SD_NEWIDLE_IDX 2
120# define SD_FORKEXEC_IDX 0
121 119
122#else 120#else
123 121
124# define SD_CACHE_NICE_TRIES 2 122# define SD_CACHE_NICE_TRIES 2
125# define SD_IDLE_IDX 2 123# define SD_IDLE_IDX 2
126# define SD_NEWIDLE_IDX 2
127# define SD_FORKEXEC_IDX 1
128 124
129#endif 125#endif
130 126
@@ -137,22 +133,20 @@ extern unsigned long node_remap_size[];
137 .cache_nice_tries = SD_CACHE_NICE_TRIES, \ 133 .cache_nice_tries = SD_CACHE_NICE_TRIES, \
138 .busy_idx = 3, \ 134 .busy_idx = 3, \
139 .idle_idx = SD_IDLE_IDX, \ 135 .idle_idx = SD_IDLE_IDX, \
140 .newidle_idx = SD_NEWIDLE_IDX, \ 136 .newidle_idx = 0, \
141 .wake_idx = 1, \ 137 .wake_idx = 0, \
142 .forkexec_idx = SD_FORKEXEC_IDX, \ 138 .forkexec_idx = 0, \
143 \ 139 \
144 .flags = 1*SD_LOAD_BALANCE \ 140 .flags = 1*SD_LOAD_BALANCE \
145 | 1*SD_BALANCE_NEWIDLE \ 141 | 1*SD_BALANCE_NEWIDLE \
146 | 1*SD_BALANCE_EXEC \ 142 | 1*SD_BALANCE_EXEC \
147 | 1*SD_BALANCE_FORK \ 143 | 1*SD_BALANCE_FORK \
148 | 0*SD_WAKE_IDLE \ 144 | 0*SD_BALANCE_WAKE \
149 | 1*SD_WAKE_AFFINE \ 145 | 1*SD_WAKE_AFFINE \
150 | 1*SD_WAKE_BALANCE \
151 | 0*SD_SHARE_CPUPOWER \ 146 | 0*SD_SHARE_CPUPOWER \
152 | 0*SD_POWERSAVINGS_BALANCE \ 147 | 0*SD_POWERSAVINGS_BALANCE \
153 | 0*SD_SHARE_PKG_RESOURCES \ 148 | 0*SD_SHARE_PKG_RESOURCES \
154 | 1*SD_SERIALIZE \ 149 | 1*SD_SERIALIZE \
155 | 1*SD_WAKE_IDLE_FAR \
156 | 0*SD_PREFER_SIBLING \ 150 | 0*SD_PREFER_SIBLING \
157 , \ 151 , \
158 .last_balance = jiffies, \ 152 .last_balance = jiffies, \
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c1f253dac155..8dd30638fe44 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -13,7 +13,7 @@ CFLAGS_common.o := $(nostackp)
13 13
14obj-y := intel_cacheinfo.o addon_cpuid_features.o 14obj-y := intel_cacheinfo.o addon_cpuid_features.o
15obj-y += proc.o capflags.o powerflags.o common.o 15obj-y += proc.o capflags.o powerflags.o common.o
16obj-y += vmware.o hypervisor.o 16obj-y += vmware.o hypervisor.o sched.o
17 17
18obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 18obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
19obj-$(CONFIG_X86_64) += bugs_64.o 19obj-$(CONFIG_X86_64) += bugs_64.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index ae9b503220ca..4109679863c1 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -60,7 +60,6 @@ enum {
60}; 60};
61 61
62#define INTEL_MSR_RANGE (0xffff) 62#define INTEL_MSR_RANGE (0xffff)
63#define CPUID_6_ECX_APERFMPERF_CAPABILITY (0x1)
64 63
65struct acpi_cpufreq_data { 64struct acpi_cpufreq_data {
66 struct acpi_processor_performance *acpi_data; 65 struct acpi_processor_performance *acpi_data;
@@ -71,11 +70,7 @@ struct acpi_cpufreq_data {
71 70
72static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); 71static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
73 72
74struct acpi_msr_data { 73static DEFINE_PER_CPU(struct aperfmperf, old_perf);
75 u64 saved_aperf, saved_mperf;
76};
77
78static DEFINE_PER_CPU(struct acpi_msr_data, msr_data);
79 74
80DEFINE_TRACE(power_mark); 75DEFINE_TRACE(power_mark);
81 76
@@ -244,23 +239,12 @@ static u32 get_cur_val(const struct cpumask *mask)
244 return cmd.val; 239 return cmd.val;
245} 240}
246 241
247struct perf_pair {
248 union {
249 struct {
250 u32 lo;
251 u32 hi;
252 } split;
253 u64 whole;
254 } aperf, mperf;
255};
256
257/* Called via smp_call_function_single(), on the target CPU */ 242/* Called via smp_call_function_single(), on the target CPU */
258static void read_measured_perf_ctrs(void *_cur) 243static void read_measured_perf_ctrs(void *_cur)
259{ 244{
260 struct perf_pair *cur = _cur; 245 struct aperfmperf *am = _cur;
261 246
262 rdmsr(MSR_IA32_APERF, cur->aperf.split.lo, cur->aperf.split.hi); 247 get_aperfmperf(am);
263 rdmsr(MSR_IA32_MPERF, cur->mperf.split.lo, cur->mperf.split.hi);
264} 248}
265 249
266/* 250/*
@@ -279,63 +263,17 @@ static void read_measured_perf_ctrs(void *_cur)
279static unsigned int get_measured_perf(struct cpufreq_policy *policy, 263static unsigned int get_measured_perf(struct cpufreq_policy *policy,
280 unsigned int cpu) 264 unsigned int cpu)
281{ 265{
282 struct perf_pair readin, cur; 266 struct aperfmperf perf;
283 unsigned int perf_percent; 267 unsigned long ratio;
284 unsigned int retval; 268 unsigned int retval;
285 269
286 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &readin, 1)) 270 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
287 return 0; 271 return 0;
288 272
289 cur.aperf.whole = readin.aperf.whole - 273 ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf);
290 per_cpu(msr_data, cpu).saved_aperf; 274 per_cpu(old_perf, cpu) = perf;
291 cur.mperf.whole = readin.mperf.whole -
292 per_cpu(msr_data, cpu).saved_mperf;
293 per_cpu(msr_data, cpu).saved_aperf = readin.aperf.whole;
294 per_cpu(msr_data, cpu).saved_mperf = readin.mperf.whole;
295
296#ifdef __i386__
297 /*
298 * We dont want to do 64 bit divide with 32 bit kernel
299 * Get an approximate value. Return failure in case we cannot get
300 * an approximate value.
301 */
302 if (unlikely(cur.aperf.split.hi || cur.mperf.split.hi)) {
303 int shift_count;
304 u32 h;
305
306 h = max_t(u32, cur.aperf.split.hi, cur.mperf.split.hi);
307 shift_count = fls(h);
308
309 cur.aperf.whole >>= shift_count;
310 cur.mperf.whole >>= shift_count;
311 }
312
313 if (((unsigned long)(-1) / 100) < cur.aperf.split.lo) {
314 int shift_count = 7;
315 cur.aperf.split.lo >>= shift_count;
316 cur.mperf.split.lo >>= shift_count;
317 }
318
319 if (cur.aperf.split.lo && cur.mperf.split.lo)
320 perf_percent = (cur.aperf.split.lo * 100) / cur.mperf.split.lo;
321 else
322 perf_percent = 0;
323 275
324#else 276 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
325 if (unlikely(((unsigned long)(-1) / 100) < cur.aperf.whole)) {
326 int shift_count = 7;
327 cur.aperf.whole >>= shift_count;
328 cur.mperf.whole >>= shift_count;
329 }
330
331 if (cur.aperf.whole && cur.mperf.whole)
332 perf_percent = (cur.aperf.whole * 100) / cur.mperf.whole;
333 else
334 perf_percent = 0;
335
336#endif
337
338 retval = (policy->cpuinfo.max_freq * perf_percent) / 100;
339 277
340 return retval; 278 return retval;
341} 279}
@@ -731,12 +669,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
731 acpi_processor_notify_smm(THIS_MODULE); 669 acpi_processor_notify_smm(THIS_MODULE);
732 670
733 /* Check for APERF/MPERF support in hardware */ 671 /* Check for APERF/MPERF support in hardware */
734 if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6) { 672 if (cpu_has(c, X86_FEATURE_APERFMPERF))
735 unsigned int ecx; 673 acpi_cpufreq_driver.getavg = get_measured_perf;
736 ecx = cpuid_ecx(6);
737 if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY)
738 acpi_cpufreq_driver.getavg = get_measured_perf;
739 }
740 674
741 dprintk("CPU%u - ACPI performance management activated.\n", cpu); 675 dprintk("CPU%u - ACPI performance management activated.\n", cpu);
742 for (i = 0; i < perf->state_count; i++) 676 for (i = 0; i < perf->state_count; i++)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 80a722a071b5..40e1835b35e8 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -350,6 +350,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
350 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); 350 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
351 } 351 }
352 352
353 if (c->cpuid_level > 6) {
354 unsigned ecx = cpuid_ecx(6);
355 if (ecx & 0x01)
356 set_cpu_cap(c, X86_FEATURE_APERFMPERF);
357 }
358
353 if (cpu_has_xmm2) 359 if (cpu_has_xmm2)
354 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); 360 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
355 if (cpu_has_ds) { 361 if (cpu_has_ds) {
diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c
new file mode 100644
index 000000000000..a640ae5ad201
--- /dev/null
+++ b/arch/x86/kernel/cpu/sched.c
@@ -0,0 +1,55 @@
1#include <linux/sched.h>
2#include <linux/math64.h>
3#include <linux/percpu.h>
4#include <linux/irqflags.h>
5
6#include <asm/cpufeature.h>
7#include <asm/processor.h>
8
9#ifdef CONFIG_SMP
10
11static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched);
12
13static unsigned long scale_aperfmperf(void)
14{
15 struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched);
16 unsigned long ratio, flags;
17
18 local_irq_save(flags);
19 get_aperfmperf(&val);
20 local_irq_restore(flags);
21
22 ratio = calc_aperfmperf_ratio(old, &val);
23 *old = val;
24
25 return ratio;
26}
27
28unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
29{
30 /*
31 * do aperf/mperf on the cpu level because it includes things
32 * like turbo mode, which are relevant to full cores.
33 */
34 if (boot_cpu_has(X86_FEATURE_APERFMPERF))
35 return scale_aperfmperf();
36
37 /*
38 * maybe have something cpufreq here
39 */
40
41 return default_scale_freq_power(sd, cpu);
42}
43
44unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu)
45{
46 /*
47 * aperf/mperf already includes the smt gain
48 */
49 if (boot_cpu_has(X86_FEATURE_APERFMPERF))
50 return SCHED_LOAD_SCALE;
51
52 return default_scale_smt_power(sd, cpu);
53}
54
55#endif
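The hooks this new file defines are declared __weak with generic fallbacks in kernel/sched.c further down in this diff, so the x86 versions take effect simply by being linked in. Here is a small standalone illustration of that weak-symbol pattern using the GCC attribute syntax; the 1024 baseline stands in for SCHED_LOAD_SCALE and the program itself is illustrative, not kernel code.

#include <stdio.h>

/*
 * Weak fallback, analogous to the __weak arch_scale_freq_power() in
 * kernel/sched.c: used only if no strong definition of the same symbol
 * is linked into the final binary.
 */
unsigned long __attribute__((weak)) arch_scale_freq_power(void)
{
	return 1024;		/* "no scaling", like SCHED_LOAD_SCALE */
}

/*
 * In the kernel, arch/x86/kernel/cpu/sched.c supplies a strong definition
 * of the same symbol; the linker silently prefers it over the weak one,
 * which is how the APERF/MPERF-based scaling is plugged in without any
 * registration mechanism. Adding a strong arch_scale_freq_power() in a
 * second .c file linked with this one would override the fallback the
 * same way.
 */

int main(void)
{
	printf("cpu_power frequency scale = %lu\n", arch_scale_freq_power());
	return 0;
}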
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f3d74bd04d18..8af3d249170e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -190,6 +190,7 @@ extern unsigned long long time_sync_thresh;
190/* in tsk->state again */ 190/* in tsk->state again */
191#define TASK_DEAD 64 191#define TASK_DEAD 64
192#define TASK_WAKEKILL 128 192#define TASK_WAKEKILL 128
193#define TASK_WAKING 256
193 194
194/* Convenience macros for the sake of set_task_state */ 195/* Convenience macros for the sake of set_task_state */
195#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) 196#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
@@ -802,14 +803,14 @@ enum cpu_idle_type {
802#define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */ 803#define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */
803#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */ 804#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */
804#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ 805#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
805#define SD_WAKE_IDLE 0x0010 /* Wake to idle CPU on task wakeup */ 806#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
806#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ 807#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
807#define SD_WAKE_BALANCE 0x0040 /* Perform balancing at task wakeup */ 808#define SD_PREFER_LOCAL 0x0040 /* Prefer to keep tasks local to this domain */
808#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ 809#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */
809#define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */ 810#define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */
810#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ 811#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
811#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ 812#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
812#define SD_WAKE_IDLE_FAR 0x0800 /* Gain latency sacrificing cache hit */ 813
813#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ 814#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
814 815
815enum powersavings_balance_level { 816enum powersavings_balance_level {
@@ -991,6 +992,9 @@ static inline int test_sd_parent(struct sched_domain *sd, int flag)
991 return 0; 992 return 0;
992} 993}
993 994
995unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu);
996unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu);
997
994#else /* CONFIG_SMP */ 998#else /* CONFIG_SMP */
995 999
996struct sched_domain_attr; 1000struct sched_domain_attr;
@@ -1002,6 +1006,7 @@ partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
1002} 1006}
1003#endif /* !CONFIG_SMP */ 1007#endif /* !CONFIG_SMP */
1004 1008
1009
1005struct io_context; /* See blkdev.h */ 1010struct io_context; /* See blkdev.h */
1006 1011
1007 1012
@@ -1019,6 +1024,12 @@ struct uts_namespace;
1019struct rq; 1024struct rq;
1020struct sched_domain; 1025struct sched_domain;
1021 1026
1027/*
1028 * wake flags
1029 */
1030#define WF_SYNC 0x01 /* waker goes to sleep after wakup */
1031#define WF_FORK 0x02 /* child wakeup after fork */
1032
1022struct sched_class { 1033struct sched_class {
1023 const struct sched_class *next; 1034 const struct sched_class *next;
1024 1035
@@ -1026,13 +1037,13 @@ struct sched_class {
1026 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); 1037 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
1027 void (*yield_task) (struct rq *rq); 1038 void (*yield_task) (struct rq *rq);
1028 1039
1029 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync); 1040 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
1030 1041
1031 struct task_struct * (*pick_next_task) (struct rq *rq); 1042 struct task_struct * (*pick_next_task) (struct rq *rq);
1032 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1043 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
1033 1044
1034#ifdef CONFIG_SMP 1045#ifdef CONFIG_SMP
1035 int (*select_task_rq)(struct task_struct *p, int sync); 1046 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
1036 1047
1037 unsigned long (*load_balance) (struct rq *this_rq, int this_cpu, 1048 unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
1038 struct rq *busiest, unsigned long max_load_move, 1049 struct rq *busiest, unsigned long max_load_move,
@@ -1102,6 +1113,8 @@ struct sched_entity {
1102 u64 start_runtime; 1113 u64 start_runtime;
1103 u64 avg_wakeup; 1114 u64 avg_wakeup;
1104 1115
1116 u64 avg_running;
1117
1105#ifdef CONFIG_SCHEDSTATS 1118#ifdef CONFIG_SCHEDSTATS
1106 u64 wait_start; 1119 u64 wait_start;
1107 u64 wait_max; 1120 u64 wait_max;
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 85e8cf7d393c..809b26c07090 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -95,14 +95,12 @@ int arch_update_cpu_topology(void);
95 | 1*SD_BALANCE_NEWIDLE \ 95 | 1*SD_BALANCE_NEWIDLE \
96 | 1*SD_BALANCE_EXEC \ 96 | 1*SD_BALANCE_EXEC \
97 | 1*SD_BALANCE_FORK \ 97 | 1*SD_BALANCE_FORK \
98 | 0*SD_WAKE_IDLE \ 98 | 0*SD_BALANCE_WAKE \
99 | 1*SD_WAKE_AFFINE \ 99 | 1*SD_WAKE_AFFINE \
100 | 1*SD_WAKE_BALANCE \
101 | 1*SD_SHARE_CPUPOWER \ 100 | 1*SD_SHARE_CPUPOWER \
102 | 0*SD_POWERSAVINGS_BALANCE \ 101 | 0*SD_POWERSAVINGS_BALANCE \
103 | 0*SD_SHARE_PKG_RESOURCES \ 102 | 0*SD_SHARE_PKG_RESOURCES \
104 | 0*SD_SERIALIZE \ 103 | 0*SD_SERIALIZE \
105 | 0*SD_WAKE_IDLE_FAR \
106 | 0*SD_PREFER_SIBLING \ 104 | 0*SD_PREFER_SIBLING \
107 , \ 105 , \
108 .last_balance = jiffies, \ 106 .last_balance = jiffies, \
@@ -122,20 +120,19 @@ int arch_update_cpu_topology(void);
122 .imbalance_pct = 125, \ 120 .imbalance_pct = 125, \
123 .cache_nice_tries = 1, \ 121 .cache_nice_tries = 1, \
124 .busy_idx = 2, \ 122 .busy_idx = 2, \
125 .wake_idx = 1, \ 123 .wake_idx = 0, \
126 .forkexec_idx = 1, \ 124 .forkexec_idx = 0, \
127 \ 125 \
128 .flags = 1*SD_LOAD_BALANCE \ 126 .flags = 1*SD_LOAD_BALANCE \
129 | 1*SD_BALANCE_NEWIDLE \ 127 | 1*SD_BALANCE_NEWIDLE \
130 | 1*SD_BALANCE_EXEC \ 128 | 1*SD_BALANCE_EXEC \
131 | 1*SD_BALANCE_FORK \ 129 | 1*SD_BALANCE_FORK \
132 | 1*SD_WAKE_IDLE \ 130 | 0*SD_BALANCE_WAKE \
133 | 1*SD_WAKE_AFFINE \ 131 | 1*SD_WAKE_AFFINE \
134 | 1*SD_WAKE_BALANCE \ 132 | 1*SD_PREFER_LOCAL \
135 | 0*SD_SHARE_CPUPOWER \ 133 | 0*SD_SHARE_CPUPOWER \
136 | 1*SD_SHARE_PKG_RESOURCES \ 134 | 1*SD_SHARE_PKG_RESOURCES \
137 | 0*SD_SERIALIZE \ 135 | 0*SD_SERIALIZE \
138 | 0*SD_WAKE_IDLE_FAR \
139 | sd_balance_for_mc_power() \ 136 | sd_balance_for_mc_power() \
140 | sd_power_saving_flags() \ 137 | sd_power_saving_flags() \
141 , \ 138 , \
@@ -155,21 +152,20 @@ int arch_update_cpu_topology(void);
155 .cache_nice_tries = 1, \ 152 .cache_nice_tries = 1, \
156 .busy_idx = 2, \ 153 .busy_idx = 2, \
157 .idle_idx = 1, \ 154 .idle_idx = 1, \
158 .newidle_idx = 2, \ 155 .newidle_idx = 0, \
159 .wake_idx = 1, \ 156 .wake_idx = 0, \
160 .forkexec_idx = 1, \ 157 .forkexec_idx = 0, \
161 \ 158 \
162 .flags = 1*SD_LOAD_BALANCE \ 159 .flags = 1*SD_LOAD_BALANCE \
163 | 1*SD_BALANCE_NEWIDLE \ 160 | 1*SD_BALANCE_NEWIDLE \
164 | 1*SD_BALANCE_EXEC \ 161 | 1*SD_BALANCE_EXEC \
165 | 1*SD_BALANCE_FORK \ 162 | 1*SD_BALANCE_FORK \
166 | 1*SD_WAKE_IDLE \ 163 | 0*SD_BALANCE_WAKE \
167 | 0*SD_WAKE_AFFINE \ 164 | 1*SD_WAKE_AFFINE \
168 | 1*SD_WAKE_BALANCE \ 165 | 1*SD_PREFER_LOCAL \
169 | 0*SD_SHARE_CPUPOWER \ 166 | 0*SD_SHARE_CPUPOWER \
170 | 0*SD_SHARE_PKG_RESOURCES \ 167 | 0*SD_SHARE_PKG_RESOURCES \
171 | 0*SD_SERIALIZE \ 168 | 0*SD_SERIALIZE \
172 | 0*SD_WAKE_IDLE_FAR \
173 | sd_balance_for_package_power() \ 169 | sd_balance_for_package_power() \
174 | sd_power_saving_flags() \ 170 | sd_power_saving_flags() \
175 , \ 171 , \
@@ -191,14 +187,12 @@ int arch_update_cpu_topology(void);
191 | 1*SD_BALANCE_NEWIDLE \ 187 | 1*SD_BALANCE_NEWIDLE \
192 | 0*SD_BALANCE_EXEC \ 188 | 0*SD_BALANCE_EXEC \
193 | 0*SD_BALANCE_FORK \ 189 | 0*SD_BALANCE_FORK \
194 | 0*SD_WAKE_IDLE \ 190 | 0*SD_BALANCE_WAKE \
195 | 1*SD_WAKE_AFFINE \ 191 | 0*SD_WAKE_AFFINE \
196 | 0*SD_WAKE_BALANCE \
197 | 0*SD_SHARE_CPUPOWER \ 192 | 0*SD_SHARE_CPUPOWER \
198 | 0*SD_POWERSAVINGS_BALANCE \ 193 | 0*SD_POWERSAVINGS_BALANCE \
199 | 0*SD_SHARE_PKG_RESOURCES \ 194 | 0*SD_SHARE_PKG_RESOURCES \
200 | 1*SD_SERIALIZE \ 195 | 1*SD_SERIALIZE \
201 | 1*SD_WAKE_IDLE_FAR \
202 | 0*SD_PREFER_SIBLING \ 196 | 0*SD_PREFER_SIBLING \
203 , \ 197 , \
204 .last_balance = jiffies, \ 198 .last_balance = jiffies, \
diff --git a/include/linux/wait.h b/include/linux/wait.h
index cf3c2f5dba51..a48e16b77d5e 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -26,8 +26,8 @@
26#include <asm/current.h> 26#include <asm/current.h>
27 27
28typedef struct __wait_queue wait_queue_t; 28typedef struct __wait_queue wait_queue_t;
29typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int sync, void *key); 29typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
30int default_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); 30int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key);
31 31
32struct __wait_queue { 32struct __wait_queue {
33 unsigned int flags; 33 unsigned int flags;
diff --git a/kernel/sched.c b/kernel/sched.c
index d9db3fb17573..faf4d463bbff 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -119,8 +119,6 @@
119 */ 119 */
120#define RUNTIME_INF ((u64)~0ULL) 120#define RUNTIME_INF ((u64)~0ULL)
121 121
122static void double_rq_lock(struct rq *rq1, struct rq *rq2);
123
124static inline int rt_policy(int policy) 122static inline int rt_policy(int policy)
125{ 123{
126 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 124 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
@@ -378,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
378 376
379#else 377#else
380 378
381#ifdef CONFIG_SMP
382static int root_task_group_empty(void)
383{
384 return 1;
385}
386#endif
387
388static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 379static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
389static inline struct task_group *task_group(struct task_struct *p) 380static inline struct task_group *task_group(struct task_struct *p)
390{ 381{
@@ -514,14 +505,6 @@ struct root_domain {
514#ifdef CONFIG_SMP 505#ifdef CONFIG_SMP
515 struct cpupri cpupri; 506 struct cpupri cpupri;
516#endif 507#endif
517#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
518 /*
519 * Preferred wake up cpu nominated by sched_mc balance that will be
520 * used when most cpus are idle in the system indicating overall very
521 * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
522 */
523 unsigned int sched_mc_preferred_wakeup_cpu;
524#endif
525}; 508};
526 509
527/* 510/*
@@ -646,9 +629,10 @@ struct rq {
646 629
647static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 630static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
648 631
649static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) 632static inline
633void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
650{ 634{
651 rq->curr->sched_class->check_preempt_curr(rq, p, sync); 635 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
652} 636}
653 637
654static inline int cpu_of(struct rq *rq) 638static inline int cpu_of(struct rq *rq)
@@ -1509,8 +1493,65 @@ static int tg_nop(struct task_group *tg, void *data)
1509#endif 1493#endif
1510 1494
1511#ifdef CONFIG_SMP 1495#ifdef CONFIG_SMP
1512static unsigned long source_load(int cpu, int type); 1496/* Used instead of source_load when we know the type == 0 */
1513static unsigned long target_load(int cpu, int type); 1497static unsigned long weighted_cpuload(const int cpu)
1498{
1499 return cpu_rq(cpu)->load.weight;
1500}
1501
1502/*
1503 * Return a low guess at the load of a migration-source cpu weighted
1504 * according to the scheduling class and "nice" value.
1505 *
1506 * We want to under-estimate the load of migration sources, to
1507 * balance conservatively.
1508 */
1509static unsigned long source_load(int cpu, int type)
1510{
1511 struct rq *rq = cpu_rq(cpu);
1512 unsigned long total = weighted_cpuload(cpu);
1513
1514 if (type == 0 || !sched_feat(LB_BIAS))
1515 return total;
1516
1517 return min(rq->cpu_load[type-1], total);
1518}
1519
1520/*
1521 * Return a high guess at the load of a migration-target cpu weighted
1522 * according to the scheduling class and "nice" value.
1523 */
1524static unsigned long target_load(int cpu, int type)
1525{
1526 struct rq *rq = cpu_rq(cpu);
1527 unsigned long total = weighted_cpuload(cpu);
1528
1529 if (type == 0 || !sched_feat(LB_BIAS))
1530 return total;
1531
1532 return max(rq->cpu_load[type-1], total);
1533}
1534
1535static struct sched_group *group_of(int cpu)
1536{
1537 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
1538
1539 if (!sd)
1540 return NULL;
1541
1542 return sd->groups;
1543}
1544
1545static unsigned long power_of(int cpu)
1546{
1547 struct sched_group *group = group_of(cpu);
1548
1549 if (!group)
1550 return SCHED_LOAD_SCALE;
1551
1552 return group->cpu_power;
1553}
1554
1514static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1555static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1515 1556
1516static unsigned long cpu_avg_load_per_task(int cpu) 1557static unsigned long cpu_avg_load_per_task(int cpu)
@@ -1695,6 +1736,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1695 1736
1696#ifdef CONFIG_PREEMPT 1737#ifdef CONFIG_PREEMPT
1697 1738
1739static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1740
1698/* 1741/*
1699 * fair double_lock_balance: Safely acquires both rq->locks in a fair 1742 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1700 * way at the expense of forcing extra atomic operations in all 1743 * way at the expense of forcing extra atomic operations in all
@@ -1959,13 +2002,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1959} 2002}
1960 2003
1961#ifdef CONFIG_SMP 2004#ifdef CONFIG_SMP
1962
1963/* Used instead of source_load when we know the type == 0 */
1964static unsigned long weighted_cpuload(const int cpu)
1965{
1966 return cpu_rq(cpu)->load.weight;
1967}
1968
1969/* 2005/*
1970 * Is this task likely cache-hot: 2006 * Is this task likely cache-hot:
1971 */ 2007 */
@@ -2239,185 +2275,6 @@ void kick_process(struct task_struct *p)
2239 preempt_enable(); 2275 preempt_enable();
2240} 2276}
2241EXPORT_SYMBOL_GPL(kick_process); 2277EXPORT_SYMBOL_GPL(kick_process);
2242
2243/*
2244 * Return a low guess at the load of a migration-source cpu weighted
2245 * according to the scheduling class and "nice" value.
2246 *
2247 * We want to under-estimate the load of migration sources, to
2248 * balance conservatively.
2249 */
2250static unsigned long source_load(int cpu, int type)
2251{
2252 struct rq *rq = cpu_rq(cpu);
2253 unsigned long total = weighted_cpuload(cpu);
2254
2255 if (type == 0 || !sched_feat(LB_BIAS))
2256 return total;
2257
2258 return min(rq->cpu_load[type-1], total);
2259}
2260
2261/*
2262 * Return a high guess at the load of a migration-target cpu weighted
2263 * according to the scheduling class and "nice" value.
2264 */
2265static unsigned long target_load(int cpu, int type)
2266{
2267 struct rq *rq = cpu_rq(cpu);
2268 unsigned long total = weighted_cpuload(cpu);
2269
2270 if (type == 0 || !sched_feat(LB_BIAS))
2271 return total;
2272
2273 return max(rq->cpu_load[type-1], total);
2274}
2275
2276/*
2277 * find_idlest_group finds and returns the least busy CPU group within the
2278 * domain.
2279 */
2280static struct sched_group *
2281find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2282{
2283 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
2284 unsigned long min_load = ULONG_MAX, this_load = 0;
2285 int load_idx = sd->forkexec_idx;
2286 int imbalance = 100 + (sd->imbalance_pct-100)/2;
2287
2288 do {
2289 unsigned long load, avg_load;
2290 int local_group;
2291 int i;
2292
2293 /* Skip over this group if it has no CPUs allowed */
2294 if (!cpumask_intersects(sched_group_cpus(group),
2295 &p->cpus_allowed))
2296 continue;
2297
2298 local_group = cpumask_test_cpu(this_cpu,
2299 sched_group_cpus(group));
2300
2301 /* Tally up the load of all CPUs in the group */
2302 avg_load = 0;
2303
2304 for_each_cpu(i, sched_group_cpus(group)) {
2305 /* Bias balancing toward cpus of our domain */
2306 if (local_group)
2307 load = source_load(i, load_idx);
2308 else
2309 load = target_load(i, load_idx);
2310
2311 avg_load += load;
2312 }
2313
2314 /* Adjust by relative CPU power of the group */
2315 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
2316
2317 if (local_group) {
2318 this_load = avg_load;
2319 this = group;
2320 } else if (avg_load < min_load) {
2321 min_load = avg_load;
2322 idlest = group;
2323 }
2324 } while (group = group->next, group != sd->groups);
2325
2326 if (!idlest || 100*this_load < imbalance*min_load)
2327 return NULL;
2328 return idlest;
2329}
2330
2331/*
2332 * find_idlest_cpu - find the idlest cpu among the cpus in group.
2333 */
2334static int
2335find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2336{
2337 unsigned long load, min_load = ULONG_MAX;
2338 int idlest = -1;
2339 int i;
2340
2341 /* Traverse only the allowed CPUs */
2342 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
2343 load = weighted_cpuload(i);
2344
2345 if (load < min_load || (load == min_load && i == this_cpu)) {
2346 min_load = load;
2347 idlest = i;
2348 }
2349 }
2350
2351 return idlest;
2352}
2353
2354/*
2355 * sched_balance_self: balance the current task (running on cpu) in domains
2356 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
2357 * SD_BALANCE_EXEC.
2358 *
2359 * Balance, ie. select the least loaded group.
2360 *
2361 * Returns the target CPU number, or the same CPU if no balancing is needed.
2362 *
2363 * preempt must be disabled.
2364 */
2365static int sched_balance_self(int cpu, int flag)
2366{
2367 struct task_struct *t = current;
2368 struct sched_domain *tmp, *sd = NULL;
2369
2370 for_each_domain(cpu, tmp) {
2371 /*
2372 * If power savings logic is enabled for a domain, stop there.
2373 */
2374 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2375 break;
2376 if (tmp->flags & flag)
2377 sd = tmp;
2378 }
2379
2380 if (sd)
2381 update_shares(sd);
2382
2383 while (sd) {
2384 struct sched_group *group;
2385 int new_cpu, weight;
2386
2387 if (!(sd->flags & flag)) {
2388 sd = sd->child;
2389 continue;
2390 }
2391
2392 group = find_idlest_group(sd, t, cpu);
2393 if (!group) {
2394 sd = sd->child;
2395 continue;
2396 }
2397
2398 new_cpu = find_idlest_cpu(group, t, cpu);
2399 if (new_cpu == -1 || new_cpu == cpu) {
2400 /* Now try balancing at a lower domain level of cpu */
2401 sd = sd->child;
2402 continue;
2403 }
2404
2405 /* Now try balancing at a lower domain level of new_cpu */
2406 cpu = new_cpu;
2407 weight = cpumask_weight(sched_domain_span(sd));
2408 sd = NULL;
2409 for_each_domain(cpu, tmp) {
2410 if (weight <= cpumask_weight(sched_domain_span(tmp)))
2411 break;
2412 if (tmp->flags & flag)
2413 sd = tmp;
2414 }
2415 /* while loop will break here if sd == NULL */
2416 }
2417
2418 return cpu;
2419}
2420
2421#endif /* CONFIG_SMP */ 2278#endif /* CONFIG_SMP */
2422 2279
2423/** 2280/**
@@ -2455,37 +2312,22 @@ void task_oncpu_function_call(struct task_struct *p,
2455 * 2312 *
2456 * returns failure only if the task is already active. 2313 * returns failure only if the task is already active.
2457 */ 2314 */
2458static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) 2315static int try_to_wake_up(struct task_struct *p, unsigned int state,
2316 int wake_flags)
2459{ 2317{
2460 int cpu, orig_cpu, this_cpu, success = 0; 2318 int cpu, orig_cpu, this_cpu, success = 0;
2461 unsigned long flags; 2319 unsigned long flags;
2462 long old_state;
2463 struct rq *rq; 2320 struct rq *rq;
2464 2321
2465 if (!sched_feat(SYNC_WAKEUPS)) 2322 if (!sched_feat(SYNC_WAKEUPS))
2466 sync = 0; 2323 wake_flags &= ~WF_SYNC;
2467
2468#ifdef CONFIG_SMP
2469 if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
2470 struct sched_domain *sd;
2471 2324
2472 this_cpu = raw_smp_processor_id(); 2325 this_cpu = get_cpu();
2473 cpu = task_cpu(p);
2474
2475 for_each_domain(this_cpu, sd) {
2476 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2477 update_shares(sd);
2478 break;
2479 }
2480 }
2481 }
2482#endif
2483 2326
2484 smp_wmb(); 2327 smp_wmb();
2485 rq = task_rq_lock(p, &flags); 2328 rq = task_rq_lock(p, &flags);
2486 update_rq_clock(rq); 2329 update_rq_clock(rq);
2487 old_state = p->state; 2330 if (!(p->state & state))
2488 if (!(old_state & state))
2489 goto out; 2331 goto out;
2490 2332
2491 if (p->se.on_rq) 2333 if (p->se.on_rq)
@@ -2493,27 +2335,29 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2493 2335
2494 cpu = task_cpu(p); 2336 cpu = task_cpu(p);
2495 orig_cpu = cpu; 2337 orig_cpu = cpu;
2496 this_cpu = smp_processor_id();
2497 2338
2498#ifdef CONFIG_SMP 2339#ifdef CONFIG_SMP
2499 if (unlikely(task_running(rq, p))) 2340 if (unlikely(task_running(rq, p)))
2500 goto out_activate; 2341 goto out_activate;
2501 2342
2502 cpu = p->sched_class->select_task_rq(p, sync); 2343 /*
2503 if (cpu != orig_cpu) { 2344 * In order to handle concurrent wakeups and release the rq->lock
2345 * we put the task in TASK_WAKING state.
2346 *
2347 * First fix up the nr_uninterruptible count:
2348 */
2349 if (task_contributes_to_load(p))
2350 rq->nr_uninterruptible--;
2351 p->state = TASK_WAKING;
2352 task_rq_unlock(rq, &flags);
2353
2354 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2355 if (cpu != orig_cpu)
2504 set_task_cpu(p, cpu); 2356 set_task_cpu(p, cpu);
2505 task_rq_unlock(rq, &flags);
2506 /* might preempt at this point */
2507 rq = task_rq_lock(p, &flags);
2508 old_state = p->state;
2509 if (!(old_state & state))
2510 goto out;
2511 if (p->se.on_rq)
2512 goto out_running;
2513 2357
2514 this_cpu = smp_processor_id(); 2358 rq = task_rq_lock(p, &flags);
2515 cpu = task_cpu(p); 2359 WARN_ON(p->state != TASK_WAKING);
2516 } 2360 cpu = task_cpu(p);
2517 2361
2518#ifdef CONFIG_SCHEDSTATS 2362#ifdef CONFIG_SCHEDSTATS
2519 schedstat_inc(rq, ttwu_count); 2363 schedstat_inc(rq, ttwu_count);
@@ -2533,7 +2377,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2533out_activate: 2377out_activate:
2534#endif /* CONFIG_SMP */ 2378#endif /* CONFIG_SMP */
2535 schedstat_inc(p, se.nr_wakeups); 2379 schedstat_inc(p, se.nr_wakeups);
2536 if (sync) 2380 if (wake_flags & WF_SYNC)
2537 schedstat_inc(p, se.nr_wakeups_sync); 2381 schedstat_inc(p, se.nr_wakeups_sync);
2538 if (orig_cpu != cpu) 2382 if (orig_cpu != cpu)
2539 schedstat_inc(p, se.nr_wakeups_migrate); 2383 schedstat_inc(p, se.nr_wakeups_migrate);
@@ -2562,7 +2406,7 @@ out_activate:
2562 2406
2563out_running: 2407out_running:
2564 trace_sched_wakeup(rq, p, success); 2408 trace_sched_wakeup(rq, p, success);
2565 check_preempt_curr(rq, p, sync); 2409 check_preempt_curr(rq, p, wake_flags);
2566 2410
2567 p->state = TASK_RUNNING; 2411 p->state = TASK_RUNNING;
2568#ifdef CONFIG_SMP 2412#ifdef CONFIG_SMP
@@ -2571,6 +2415,7 @@ out_running:
2571#endif 2415#endif
2572out: 2416out:
2573 task_rq_unlock(rq, &flags); 2417 task_rq_unlock(rq, &flags);
2418 put_cpu();
2574 2419
2575 return success; 2420 return success;
2576} 2421}
@@ -2613,6 +2458,7 @@ static void __sched_fork(struct task_struct *p)
2613 p->se.avg_overlap = 0; 2458 p->se.avg_overlap = 0;
2614 p->se.start_runtime = 0; 2459 p->se.start_runtime = 0;
2615 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2460 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2461 p->se.avg_running = 0;
2616 2462
2617#ifdef CONFIG_SCHEDSTATS 2463#ifdef CONFIG_SCHEDSTATS
2618 p->se.wait_start = 0; 2464 p->se.wait_start = 0;
@@ -2674,11 +2520,6 @@ void sched_fork(struct task_struct *p, int clone_flags)
2674 2520
2675 __sched_fork(p); 2521 __sched_fork(p);
2676 2522
2677#ifdef CONFIG_SMP
2678 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
2679#endif
2680 set_task_cpu(p, cpu);
2681
2682 /* 2523 /*
2683 * Make sure we do not leak PI boosting priority to the child. 2524 * Make sure we do not leak PI boosting priority to the child.
2684 */ 2525 */
@@ -2709,6 +2550,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
2709 if (!rt_prio(p->prio)) 2550 if (!rt_prio(p->prio))
2710 p->sched_class = &fair_sched_class; 2551 p->sched_class = &fair_sched_class;
2711 2552
2553#ifdef CONFIG_SMP
2554 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
2555#endif
2556 set_task_cpu(p, cpu);
2557
2712#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2558#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2713 if (likely(sched_info_on())) 2559 if (likely(sched_info_on()))
2714 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2560 memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -2754,7 +2600,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2754 inc_nr_running(rq); 2600 inc_nr_running(rq);
2755 } 2601 }
2756 trace_sched_wakeup_new(rq, p, 1); 2602 trace_sched_wakeup_new(rq, p, 1);
2757 check_preempt_curr(rq, p, 0); 2603 check_preempt_curr(rq, p, WF_FORK);
2758#ifdef CONFIG_SMP 2604#ifdef CONFIG_SMP
2759 if (p->sched_class->task_wake_up) 2605 if (p->sched_class->task_wake_up)
2760 p->sched_class->task_wake_up(rq, p); 2606 p->sched_class->task_wake_up(rq, p);
@@ -3263,7 +3109,7 @@ out:
3263void sched_exec(void) 3109void sched_exec(void)
3264{ 3110{
3265 int new_cpu, this_cpu = get_cpu(); 3111 int new_cpu, this_cpu = get_cpu();
3266 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); 3112 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
3267 put_cpu(); 3113 put_cpu();
3268 if (new_cpu != this_cpu) 3114 if (new_cpu != this_cpu)
3269 sched_migrate_task(current, new_cpu); 3115 sched_migrate_task(current, new_cpu);
@@ -3683,11 +3529,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3683 *imbalance = sds->min_load_per_task; 3529 *imbalance = sds->min_load_per_task;
3684 sds->busiest = sds->group_min; 3530 sds->busiest = sds->group_min;
3685 3531
3686 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3687 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3688 group_first_cpu(sds->group_leader);
3689 }
3690
3691 return 1; 3532 return 1;
3692 3533
3693} 3534}
@@ -3711,7 +3552,18 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3711} 3552}
3712#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 3553#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3713 3554
3714unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) 3555
3556unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3557{
3558 return SCHED_LOAD_SCALE;
3559}
3560
3561unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3562{
3563 return default_scale_freq_power(sd, cpu);
3564}
3565
3566unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3715{ 3567{
3716 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 3568 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3717 unsigned long smt_gain = sd->smt_gain; 3569 unsigned long smt_gain = sd->smt_gain;
@@ -3721,6 +3573,11 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3721 return smt_gain; 3573 return smt_gain;
3722} 3574}
3723 3575
3576unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3577{
3578 return default_scale_smt_power(sd, cpu);
3579}
3580
3724unsigned long scale_rt_power(int cpu) 3581unsigned long scale_rt_power(int cpu)
3725{ 3582{
3726 struct rq *rq = cpu_rq(cpu); 3583 struct rq *rq = cpu_rq(cpu);
@@ -3745,10 +3602,19 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
3745 unsigned long power = SCHED_LOAD_SCALE; 3602 unsigned long power = SCHED_LOAD_SCALE;
3746 struct sched_group *sdg = sd->groups; 3603 struct sched_group *sdg = sd->groups;
3747 3604
3748 /* here we could scale based on cpufreq */ 3605 if (sched_feat(ARCH_POWER))
3606 power *= arch_scale_freq_power(sd, cpu);
3607 else
3608 power *= default_scale_freq_power(sd, cpu);
3609
3610 power >>= SCHED_LOAD_SHIFT;
3749 3611
3750 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 3612 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3751 power *= arch_scale_smt_power(sd, cpu); 3613 if (sched_feat(ARCH_POWER))
3614 power *= arch_scale_smt_power(sd, cpu);
3615 else
3616 power *= default_scale_smt_power(sd, cpu);
3617
3752 power >>= SCHED_LOAD_SHIFT; 3618 power >>= SCHED_LOAD_SHIFT;
3753 } 3619 }
3754 3620
@@ -4161,26 +4027,6 @@ ret:
4161 return NULL; 4027 return NULL;
4162} 4028}
4163 4029
4164static struct sched_group *group_of(int cpu)
4165{
4166 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
4167
4168 if (!sd)
4169 return NULL;
4170
4171 return sd->groups;
4172}
4173
4174static unsigned long power_of(int cpu)
4175{
4176 struct sched_group *group = group_of(cpu);
4177
4178 if (!group)
4179 return SCHED_LOAD_SCALE;
4180
4181 return group->cpu_power;
4182}
4183
4184/* 4030/*
4185 * find_busiest_queue - find the busiest runqueue among the cpus in group. 4031 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4186 */ 4032 */
@@ -5465,14 +5311,13 @@ static inline void schedule_debug(struct task_struct *prev)
5465#endif 5311#endif
5466} 5312}
5467 5313
5468static void put_prev_task(struct rq *rq, struct task_struct *prev) 5314static void put_prev_task(struct rq *rq, struct task_struct *p)
5469{ 5315{
5470 if (prev->state == TASK_RUNNING) { 5316 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
5471 u64 runtime = prev->se.sum_exec_runtime;
5472 5317
5473 runtime -= prev->se.prev_sum_exec_runtime; 5318 update_avg(&p->se.avg_running, runtime);
5474 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5475 5319
5320 if (p->state == TASK_RUNNING) {
5476 /* 5321 /*
5477 * In order to avoid avg_overlap growing stale when we are 5322 * In order to avoid avg_overlap growing stale when we are
5478 * indeed overlapping and hence not getting put to sleep, grow 5323 * indeed overlapping and hence not getting put to sleep, grow
@@ -5482,9 +5327,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
5482 * correlates to the amount of cache footprint a task can 5327 * correlates to the amount of cache footprint a task can
5483 * build up. 5328 * build up.
5484 */ 5329 */
5485 update_avg(&prev->se.avg_overlap, runtime); 5330 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5331 update_avg(&p->se.avg_overlap, runtime);
5332 } else {
5333 update_avg(&p->se.avg_running, 0);
5486 } 5334 }
5487 prev->sched_class->put_prev_task(rq, prev); 5335 p->sched_class->put_prev_task(rq, p);
5488} 5336}
5489 5337
5490/* 5338/*
@@ -5716,10 +5564,10 @@ asmlinkage void __sched preempt_schedule_irq(void)
5716 5564
5717#endif /* CONFIG_PREEMPT */ 5565#endif /* CONFIG_PREEMPT */
5718 5566
5719int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, 5567int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
5720 void *key) 5568 void *key)
5721{ 5569{
5722 return try_to_wake_up(curr->private, mode, sync); 5570 return try_to_wake_up(curr->private, mode, wake_flags);
5723} 5571}
5724EXPORT_SYMBOL(default_wake_function); 5572EXPORT_SYMBOL(default_wake_function);
5725 5573
@@ -5733,14 +5581,14 @@ EXPORT_SYMBOL(default_wake_function);
5733 * zero in this (rare) case, and we handle it by continuing to scan the queue. 5581 * zero in this (rare) case, and we handle it by continuing to scan the queue.
5734 */ 5582 */
5735static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 5583static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5736 int nr_exclusive, int sync, void *key) 5584 int nr_exclusive, int wake_flags, void *key)
5737{ 5585{
5738 wait_queue_t *curr, *next; 5586 wait_queue_t *curr, *next;
5739 5587
5740 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 5588 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
5741 unsigned flags = curr->flags; 5589 unsigned flags = curr->flags;
5742 5590
5743 if (curr->func(curr, mode, sync, key) && 5591 if (curr->func(curr, mode, wake_flags, key) &&
5744 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 5592 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
5745 break; 5593 break;
5746 } 5594 }
@@ -5801,16 +5649,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5801 int nr_exclusive, void *key) 5649 int nr_exclusive, void *key)
5802{ 5650{
5803 unsigned long flags; 5651 unsigned long flags;
5804 int sync = 1; 5652 int wake_flags = WF_SYNC;
5805 5653
5806 if (unlikely(!q)) 5654 if (unlikely(!q))
5807 return; 5655 return;
5808 5656
5809 if (unlikely(!nr_exclusive)) 5657 if (unlikely(!nr_exclusive))
5810 sync = 0; 5658 wake_flags = 0;
5811 5659
5812 spin_lock_irqsave(&q->lock, flags); 5660 spin_lock_irqsave(&q->lock, flags);
5813 __wake_up_common(q, mode, nr_exclusive, sync, key); 5661 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
5814 spin_unlock_irqrestore(&q->lock, flags); 5662 spin_unlock_irqrestore(&q->lock, flags);
5815} 5663}
5816EXPORT_SYMBOL_GPL(__wake_up_sync_key); 5664EXPORT_SYMBOL_GPL(__wake_up_sync_key);
@@ -8000,9 +7848,7 @@ static int sd_degenerate(struct sched_domain *sd)
8000 } 7848 }
8001 7849
8002 /* Following flags don't use groups */ 7850 /* Following flags don't use groups */
8003 if (sd->flags & (SD_WAKE_IDLE | 7851 if (sd->flags & (SD_WAKE_AFFINE))
8004 SD_WAKE_AFFINE |
8005 SD_WAKE_BALANCE))
8006 return 0; 7852 return 0;
8007 7853
8008 return 1; 7854 return 1;
@@ -8019,10 +7865,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
8019 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 7865 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
8020 return 0; 7866 return 0;
8021 7867
8022 /* Does parent contain flags not in child? */
8023 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
8024 if (cflags & SD_WAKE_AFFINE)
8025 pflags &= ~SD_WAKE_BALANCE;
8026 /* Flags needing groups don't count if only 1 group in parent */ 7868 /* Flags needing groups don't count if only 1 group in parent */
8027 if (parent->groups == parent->groups->next) { 7869 if (parent->groups == parent->groups->next) {
8028 pflags &= ~(SD_LOAD_BALANCE | 7870 pflags &= ~(SD_LOAD_BALANCE |
@@ -8708,10 +8550,10 @@ static void set_domain_attribute(struct sched_domain *sd,
8708 request = attr->relax_domain_level; 8550 request = attr->relax_domain_level;
8709 if (request < sd->level) { 8551 if (request < sd->level) {
8710 /* turn off idle balance on this domain */ 8552 /* turn off idle balance on this domain */
8711 sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); 8553 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8712 } else { 8554 } else {
8713 /* turn on idle balance on this domain */ 8555 /* turn on idle balance on this domain */
8714 sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); 8556 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8715 } 8557 }
8716} 8558}
8717 8559
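As a rough illustration of how the update_cpu_power() change above combines the new scale factors (the frequency factor from arch/default_scale_freq_power() and, for SD_SHARE_CPUPOWER domains, the SMT factor), here is a standalone sketch of the fixed-point composition. It assumes the usual SCHED_LOAD_SCALE of 1024 (shift of 10), uses invented factor values, and leaves out the later RT-time scaling step.

#include <stdio.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)	/* 1024 */

/* Mirrors the flow of update_cpu_power(): each scale factor is itself a
 * 1024-based fixed-point number, so the running power is multiplied and
 * then shifted back down after every step. */
static unsigned long compose_cpu_power(unsigned long freq_scale,
				       unsigned long smt_scale,
				       int shares_cpupower)
{
	unsigned long power = SCHED_LOAD_SCALE;

	power *= freq_scale;		/* arch/default_scale_freq_power() */
	power >>= SCHED_LOAD_SHIFT;

	if (shares_cpupower) {		/* SD_SHARE_CPUPOWER, i.e. SMT */
		power *= smt_scale;	/* arch/default_scale_smt_power() */
		power >>= SCHED_LOAD_SHIFT;
	}

	return power;
}

int main(void)
{
	/* Hypothetical factors: core running at 1.25x (1280/1024) and an
	 * SMT sibling factor of roughly 0.57 (584/1024). */
	printf("non-SMT domain: cpu_power = %lu\n", compose_cpu_power(1280, 584, 0));
	printf("SMT domain:     cpu_power = %lu\n", compose_cpu_power(1280, 584, 1));
	return 0;
}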
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 5ddbd0891267..efb84409bc43 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -395,6 +395,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
395 PN(se.sum_exec_runtime); 395 PN(se.sum_exec_runtime);
396 PN(se.avg_overlap); 396 PN(se.avg_overlap);
397 PN(se.avg_wakeup); 397 PN(se.avg_wakeup);
398 PN(se.avg_running);
398 399
399 nr_switches = p->nvcsw + p->nivcsw; 400 nr_switches = p->nvcsw + p->nivcsw;
400 401
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index aa7f84121016..10d218ab69f2 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -711,7 +711,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
711 711
712 if (!initial) { 712 if (!initial) {
713 /* sleeps upto a single latency don't count. */ 713 /* sleeps upto a single latency don't count. */
714 if (sched_feat(NEW_FAIR_SLEEPERS)) { 714 if (sched_feat(FAIR_SLEEPERS)) {
715 unsigned long thresh = sysctl_sched_latency; 715 unsigned long thresh = sysctl_sched_latency;
716 716
717 /* 717 /*
@@ -725,6 +725,13 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
725 task_of(se)->policy != SCHED_IDLE)) 725 task_of(se)->policy != SCHED_IDLE))
726 thresh = calc_delta_fair(thresh, se); 726 thresh = calc_delta_fair(thresh, se);
727 727
728 /*
729 * Halve their sleep time's effect, to allow
730 * for a gentler effect of sleepers:
731 */
732 if (sched_feat(GENTLE_FAIR_SLEEPERS))
733 thresh >>= 1;
734
728 vruntime -= thresh; 735 vruntime -= thresh;
729 } 736 }
730 } 737 }
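
The hunk above is the "gentler fair-sleepers" change: a waking sleeper is still credited with up to one scheduling latency of vruntime, but with GENTLE_FAIR_SLEEPERS set that credit is halved, so a herd of sleepers cannot rip the spread apart. A standalone sketch of the placement arithmetic; the feature flags are plain booleans here, and the weight scaling done by calc_delta_fair() is left out:

    #include <stdio.h>

    /* Simplified stand-ins for the tunables used in place_entity(). */
    static const unsigned long sysctl_sched_latency = 20000000UL; /* 20ms in ns */
    static int feat_fair_sleepers = 1;
    static int feat_gentle_fair_sleepers = 1;

    /* How much vruntime credit a waking sleeper receives. */
    static unsigned long sleeper_credit(void)
    {
        unsigned long thresh = 0;

        if (feat_fair_sleepers) {
            thresh = sysctl_sched_latency;

            /* Halve the sleep credit for a gentler effect. */
            if (feat_gentle_fair_sleepers)
                thresh >>= 1;
        }
        return thresh;
    }

    int main(void)
    {
        unsigned long min_vruntime = 100000000UL;
        unsigned long vruntime = min_vruntime - sleeper_credit();

        printf("placed %lu ns ahead of min_vruntime -> vruntime=%lu\n",
               sleeper_credit(), vruntime);
        return 0;
    }
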
@@ -757,10 +764,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
757 764
758static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 765static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
759{ 766{
760 if (cfs_rq->last == se) 767 if (!se || cfs_rq->last == se)
761 cfs_rq->last = NULL; 768 cfs_rq->last = NULL;
762 769
763 if (cfs_rq->next == se) 770 if (!se || cfs_rq->next == se)
764 cfs_rq->next = NULL; 771 cfs_rq->next = NULL;
765} 772}
766 773
@@ -1062,83 +1069,6 @@ static void yield_task_fair(struct rq *rq)
1062 se->vruntime = rightmost->vruntime + 1; 1069 se->vruntime = rightmost->vruntime + 1;
1063} 1070}
1064 1071
1065/*
1066 * wake_idle() will wake a task on an idle cpu if task->cpu is
1067 * not idle and an idle cpu is available. The span of cpus to
1068 * search starts with cpus closest then further out as needed,
1069 * so we always favor a closer, idle cpu.
1070 * Domains may include CPUs that are not usable for migration,
1071 * hence we need to mask them out (rq->rd->online)
1072 *
1073 * Returns the CPU we should wake onto.
1074 */
1075#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1076
1077#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
1078
1079static int wake_idle(int cpu, struct task_struct *p)
1080{
1081 struct sched_domain *sd;
1082 int i;
1083 unsigned int chosen_wakeup_cpu;
1084 int this_cpu;
1085 struct rq *task_rq = task_rq(p);
1086
1087 /*
1088 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
1089 * are idle and this is not a kernel thread and this task's affinity
1090 * allows it to be moved to preferred cpu, then just move!
1091 */
1092
1093 this_cpu = smp_processor_id();
1094 chosen_wakeup_cpu =
1095 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
1096
1097 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
1098 idle_cpu(cpu) && idle_cpu(this_cpu) &&
1099 p->mm && !(p->flags & PF_KTHREAD) &&
1100 cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
1101 return chosen_wakeup_cpu;
1102
1103 /*
1104 * If it is idle, then it is the best cpu to run this task.
1105 *
1106 * This cpu is also the best, if it has more than one task already.
1107 * Siblings must be also busy(in most cases) as they didn't already
1108 * pickup the extra load from this cpu and hence we need not check
1109 * sibling runqueue info. This will avoid the checks and cache miss
1110 * penalities associated with that.
1111 */
1112 if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
1113 return cpu;
1114
1115 for_each_domain(cpu, sd) {
1116 if ((sd->flags & SD_WAKE_IDLE)
1117 || ((sd->flags & SD_WAKE_IDLE_FAR)
1118 && !task_hot(p, task_rq->clock, sd))) {
1119 for_each_cpu_and(i, sched_domain_span(sd),
1120 &p->cpus_allowed) {
1121 if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
1122 if (i != task_cpu(p)) {
1123 schedstat_inc(p,
1124 se.nr_wakeups_idle);
1125 }
1126 return i;
1127 }
1128 }
1129 } else {
1130 break;
1131 }
1132 }
1133 return cpu;
1134}
1135#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
1136static inline int wake_idle(int cpu, struct task_struct *p)
1137{
1138 return cpu;
1139}
1140#endif
1141
1142#ifdef CONFIG_SMP 1072#ifdef CONFIG_SMP
1143 1073
1144#ifdef CONFIG_FAIR_GROUP_SCHED 1074#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1225,25 +1155,34 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1225 1155
1226#endif 1156#endif
1227 1157
1228static int 1158static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1229wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1230 struct task_struct *p, int prev_cpu, int this_cpu, int sync,
1231 int idx, unsigned long load, unsigned long this_load,
1232 unsigned int imbalance)
1233{ 1159{
1234 struct task_struct *curr = this_rq->curr; 1160 struct task_struct *curr = current;
1235 struct task_group *tg; 1161 unsigned long this_load, load;
1236 unsigned long tl = this_load; 1162 int idx, this_cpu, prev_cpu;
1237 unsigned long tl_per_task; 1163 unsigned long tl_per_task;
1164 unsigned int imbalance;
1165 struct task_group *tg;
1238 unsigned long weight; 1166 unsigned long weight;
1239 int balanced; 1167 int balanced;
1240 1168
1241 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) 1169 idx = sd->wake_idx;
1242 return 0; 1170 this_cpu = smp_processor_id();
1171 prev_cpu = task_cpu(p);
1172 load = source_load(prev_cpu, idx);
1173 this_load = target_load(this_cpu, idx);
1243 1174
1244 if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || 1175 if (sync) {
1245 p->se.avg_overlap > sysctl_sched_migration_cost)) 1176 if (sched_feat(SYNC_LESS) &&
1246 sync = 0; 1177 (curr->se.avg_overlap > sysctl_sched_migration_cost ||
1178 p->se.avg_overlap > sysctl_sched_migration_cost))
1179 sync = 0;
1180 } else {
1181 if (sched_feat(SYNC_MORE) &&
1182 (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1183 p->se.avg_overlap < sysctl_sched_migration_cost))
1184 sync = 1;
1185 }
1247 1186
1248 /* 1187 /*
1249 * If sync wakeup then subtract the (maximum possible) 1188 * If sync wakeup then subtract the (maximum possible)
@@ -1254,24 +1193,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1254 tg = task_group(current); 1193 tg = task_group(current);
1255 weight = current->se.load.weight; 1194 weight = current->se.load.weight;
1256 1195
1257 tl += effective_load(tg, this_cpu, -weight, -weight); 1196 this_load += effective_load(tg, this_cpu, -weight, -weight);
1258 load += effective_load(tg, prev_cpu, 0, -weight); 1197 load += effective_load(tg, prev_cpu, 0, -weight);
1259 } 1198 }
1260 1199
1261 tg = task_group(p); 1200 tg = task_group(p);
1262 weight = p->se.load.weight; 1201 weight = p->se.load.weight;
1263 1202
1203 imbalance = 100 + (sd->imbalance_pct - 100) / 2;
1204
1264 /* 1205 /*
1265 * In low-load situations, where prev_cpu is idle and this_cpu is idle 1206 * In low-load situations, where prev_cpu is idle and this_cpu is idle
1266 * due to the sync cause above having dropped tl to 0, we'll always have 1207 * due to the sync cause above having dropped this_load to 0, we'll
1267 * an imbalance, but there's really nothing you can do about that, so 1208 * always have an imbalance, but there's really nothing you can do
1268 * that's good too. 1209 * about that, so that's good too.
1269 * 1210 *
1270 * Otherwise check if either cpus are near enough in load to allow this 1211 * Otherwise check if either cpus are near enough in load to allow this
1271 * task to be woken on this_cpu. 1212 * task to be woken on this_cpu.
1272 */ 1213 */
1273 balanced = !tl || 1214 balanced = !this_load ||
1274 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= 1215 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
1275 imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); 1216 imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
1276 1217
1277 /* 1218 /*
@@ -1285,14 +1226,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1285 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1226 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1286 tl_per_task = cpu_avg_load_per_task(this_cpu); 1227 tl_per_task = cpu_avg_load_per_task(this_cpu);
1287 1228
1288 if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= 1229 if (balanced ||
1289 tl_per_task)) { 1230 (this_load <= load &&
1231 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
1290 /* 1232 /*
1291 * This domain has SD_WAKE_AFFINE and 1233 * This domain has SD_WAKE_AFFINE and
1292 * p is cache cold in this domain, and 1234 * p is cache cold in this domain, and
1293 * there is no bad imbalance. 1235 * there is no bad imbalance.
1294 */ 1236 */
1295 schedstat_inc(this_sd, ttwu_move_affine); 1237 schedstat_inc(sd, ttwu_move_affine);
1296 schedstat_inc(p, se.nr_wakeups_affine); 1238 schedstat_inc(p, se.nr_wakeups_affine);
1297 1239
1298 return 1; 1240 return 1;
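
wake_affine() now derives its own inputs (wake_idx, prev/this CPU load) and keeps the same acceptance test: the wakeup is pulled to this_cpu when the loads are "balanced" within half of the domain's imbalance_pct margin. The worked arithmetic for that test, as a hedged stand-alone sketch that leaves out the effective_load() group weighting:

    #include <stdio.h>

    /*
     * Toy version of the wake_affine() acceptance test.  With imbalance_pct
     * = 125 the margin becomes 100 + (125 - 100) / 2 = 112, i.e. this_cpu
     * may be up to ~12% busier than prev_cpu and still win the wakeup.
     */
    static int toy_wake_affine(unsigned long this_load, unsigned long prev_load,
                               unsigned int imbalance_pct)
    {
        unsigned int imbalance = 100 + (imbalance_pct - 100) / 2;

        /* "balanced": this_cpu idle, or within the allowed margin. */
        return !this_load || 100 * this_load <= imbalance * prev_load;
    }

    int main(void)
    {
        printf("this=110 prev=100 -> %d\n", toy_wake_affine(110, 100, 125)); /* 1 */
        printf("this=130 prev=100 -> %d\n", toy_wake_affine(130, 100, 125)); /* 0 */
        return 0;
    }
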
@@ -1300,65 +1242,215 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1300 return 0; 1242 return 0;
1301} 1243}
1302 1244
1303static int select_task_rq_fair(struct task_struct *p, int sync) 1245/*
1246 * find_idlest_group finds and returns the least busy CPU group within the
1247 * domain.
1248 */
1249static struct sched_group *
1250find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1251 int this_cpu, int load_idx)
1304{ 1252{
1305 struct sched_domain *sd, *this_sd = NULL; 1253 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1306 int prev_cpu, this_cpu, new_cpu; 1254 unsigned long min_load = ULONG_MAX, this_load = 0;
1307 unsigned long load, this_load; 1255 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1308 struct rq *this_rq;
1309 unsigned int imbalance;
1310 int idx;
1311 1256
1312 prev_cpu = task_cpu(p); 1257 do {
1313 this_cpu = smp_processor_id(); 1258 unsigned long load, avg_load;
1314 this_rq = cpu_rq(this_cpu); 1259 int local_group;
1315 new_cpu = prev_cpu; 1260 int i;
1316 1261
1317 /* 1262 /* Skip over this group if it has no CPUs allowed */
1318 * 'this_sd' is the first domain that both 1263 if (!cpumask_intersects(sched_group_cpus(group),
1319 * this_cpu and prev_cpu are present in: 1264 &p->cpus_allowed))
1320 */ 1265 continue;
1321 for_each_domain(this_cpu, sd) { 1266
1322 if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { 1267 local_group = cpumask_test_cpu(this_cpu,
1323 this_sd = sd; 1268 sched_group_cpus(group));
1324 break; 1269
1270 /* Tally up the load of all CPUs in the group */
1271 avg_load = 0;
1272
1273 for_each_cpu(i, sched_group_cpus(group)) {
1274 /* Bias balancing toward cpus of our domain */
1275 if (local_group)
1276 load = source_load(i, load_idx);
1277 else
1278 load = target_load(i, load_idx);
1279
1280 avg_load += load;
1281 }
1282
1283 /* Adjust by relative CPU power of the group */
1284 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1285
1286 if (local_group) {
1287 this_load = avg_load;
1288 this = group;
1289 } else if (avg_load < min_load) {
1290 min_load = avg_load;
1291 idlest = group;
1292 }
1293 } while (group = group->next, group != sd->groups);
1294
1295 if (!idlest || 100*this_load < imbalance*min_load)
1296 return NULL;
1297 return idlest;
1298}
1299
1300/*
1301 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1302 */
1303static int
1304find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1305{
1306 unsigned long load, min_load = ULONG_MAX;
1307 int idlest = -1;
1308 int i;
1309
1310 /* Traverse only the allowed CPUs */
1311 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
1312 load = weighted_cpuload(i);
1313
1314 if (load < min_load || (load == min_load && i == this_cpu)) {
1315 min_load = load;
1316 idlest = i;
1325 } 1317 }
1326 } 1318 }
1327 1319
1328 if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) 1320 return idlest;
1329 goto out; 1321}
1330 1322
1331 /* 1323/*
1332 * Check for affine wakeup and passive balancing possibilities. 1324 * sched_balance_self: balance the current task (running on cpu) in domains
1333 */ 1325 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1334 if (!this_sd) 1326 * SD_BALANCE_EXEC.
1327 *
1328 * Balance, ie. select the least loaded group.
1329 *
1330 * Returns the target CPU number, or the same CPU if no balancing is needed.
1331 *
1332 * preempt must be disabled.
1333 */
1334static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1335{
1336 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1337 int cpu = smp_processor_id();
1338 int prev_cpu = task_cpu(p);
1339 int new_cpu = cpu;
1340 int want_affine = 0;
1341 int want_sd = 1;
1342 int sync = wake_flags & WF_SYNC;
1343
1344 if (sd_flag & SD_BALANCE_WAKE) {
1345 if (sched_feat(AFFINE_WAKEUPS))
1346 want_affine = 1;
1347 new_cpu = prev_cpu;
1348 }
1349
1350 rcu_read_lock();
1351 for_each_domain(cpu, tmp) {
1352 /*
1353 * If power savings logic is enabled for a domain, see if we
1354 * are not overloaded, if so, don't balance wider.
1355 */
1356 if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
1357 unsigned long power = 0;
1358 unsigned long nr_running = 0;
1359 unsigned long capacity;
1360 int i;
1361
1362 for_each_cpu(i, sched_domain_span(tmp)) {
1363 power += power_of(i);
1364 nr_running += cpu_rq(i)->cfs.nr_running;
1365 }
1366
1367 capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
1368
1369 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1370 nr_running /= 2;
1371
1372 if (nr_running < capacity)
1373 want_sd = 0;
1374 }
1375
1376 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
1377 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
1378
1379 affine_sd = tmp;
1380 want_affine = 0;
1381 }
1382
1383 if (!want_sd && !want_affine)
1384 break;
1385
1386 if (!(tmp->flags & sd_flag))
1387 continue;
1388
1389 if (want_sd)
1390 sd = tmp;
1391 }
1392
1393 if (sched_feat(LB_SHARES_UPDATE)) {
1394 /*
1395 * Pick the largest domain to update shares over
1396 */
1397 tmp = sd;
1398 if (affine_sd && (!tmp ||
1399 cpumask_weight(sched_domain_span(affine_sd)) >
1400 cpumask_weight(sched_domain_span(sd))))
1401 tmp = affine_sd;
1402
1403 if (tmp)
1404 update_shares(tmp);
1405 }
1406
1407 if (affine_sd && wake_affine(affine_sd, p, sync)) {
1408 new_cpu = cpu;
1335 goto out; 1409 goto out;
1410 }
1336 1411
1337 idx = this_sd->wake_idx; 1412 while (sd) {
1413 int load_idx = sd->forkexec_idx;
1414 struct sched_group *group;
1415 int weight;
1338 1416
1339 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; 1417 if (!(sd->flags & sd_flag)) {
1418 sd = sd->child;
1419 continue;
1420 }
1340 1421
1341 load = source_load(prev_cpu, idx); 1422 if (sd_flag & SD_BALANCE_WAKE)
1342 this_load = target_load(this_cpu, idx); 1423 load_idx = sd->wake_idx;
1343 1424
1344 if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, 1425 group = find_idlest_group(sd, p, cpu, load_idx);
1345 load, this_load, imbalance)) 1426 if (!group) {
1346 return this_cpu; 1427 sd = sd->child;
1428 continue;
1429 }
1347 1430
1348 /* 1431 new_cpu = find_idlest_cpu(group, p, cpu);
1349 * Start passive balancing when half the imbalance_pct 1432 if (new_cpu == -1 || new_cpu == cpu) {
1350 * limit is reached. 1433 /* Now try balancing at a lower domain level of cpu */
1351 */ 1434 sd = sd->child;
1352 if (this_sd->flags & SD_WAKE_BALANCE) { 1435 continue;
1353 if (imbalance*this_load <= 100*load) {
1354 schedstat_inc(this_sd, ttwu_move_balance);
1355 schedstat_inc(p, se.nr_wakeups_passive);
1356 return this_cpu;
1357 } 1436 }
1437
1438 /* Now try balancing at a lower domain level of new_cpu */
1439 cpu = new_cpu;
1440 weight = cpumask_weight(sched_domain_span(sd));
1441 sd = NULL;
1442 for_each_domain(cpu, tmp) {
1443 if (weight <= cpumask_weight(sched_domain_span(tmp)))
1444 break;
1445 if (tmp->flags & sd_flag)
1446 sd = tmp;
1447 }
1448 /* while loop will break here if sd == NULL */
1358 } 1449 }
1359 1450
1360out: 1451out:
1361 return wake_idle(new_cpu, p); 1452 rcu_read_unlock();
1453 return new_cpu;
1362} 1454}
1363#endif /* CONFIG_SMP */ 1455#endif /* CONFIG_SMP */
1364 1456
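
The rewritten select_task_rq_fair() folds the old sched_balance_self() walk into the fair class: pick the least-loaded group in the widest domain carrying the requested balance flag, then the idlest allowed CPU inside that group, then repeat one level down. A compact user-space sketch of that two-level selection; fixed arrays stand in for sched_group/sched_domain, the cpu_power scaling is omitted, and the load numbers are invented:

    #include <stdio.h>
    #include <limits.h>

    #define NGROUPS 2
    #define NCPUS_PER_GROUP 2

    /* Invented per-CPU load figures; group g owns CPUs g*2 and g*2+1. */
    static const unsigned long cpu_load[NGROUPS * NCPUS_PER_GROUP] = {
        700, 400,   /* group 0 */
        300, 100,   /* group 1 */
    };

    /* Least-loaded group by summed load (the kernel also scales by cpu_power). */
    static int toy_find_idlest_group(void)
    {
        unsigned long best = ULONG_MAX;
        int g, best_g = -1;

        for (g = 0; g < NGROUPS; g++) {
            unsigned long sum = 0;
            int c;

            for (c = 0; c < NCPUS_PER_GROUP; c++)
                sum += cpu_load[g * NCPUS_PER_GROUP + c];
            if (sum < best) {
                best = sum;
                best_g = g;
            }
        }
        return best_g;
    }

    /* Idlest CPU inside a group, by raw load. */
    static int toy_find_idlest_cpu(int group)
    {
        unsigned long best = ULONG_MAX;
        int c, best_cpu = -1;

        for (c = 0; c < NCPUS_PER_GROUP; c++) {
            int cpu = group * NCPUS_PER_GROUP + c;

            if (cpu_load[cpu] < best) {
                best = cpu_load[cpu];
                best_cpu = cpu;
            }
        }
        return best_cpu;
    }

    int main(void)
    {
        int group = toy_find_idlest_group();
        int cpu = toy_find_idlest_cpu(group);

        printf("idlest group %d, idlest cpu %d\n", group, cpu); /* 1, 3 */
        return 0;
    }
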
@@ -1471,11 +1563,12 @@ static void set_next_buddy(struct sched_entity *se)
1471/* 1563/*
1472 * Preempt the current task with a newly woken task if needed: 1564 * Preempt the current task with a newly woken task if needed:
1473 */ 1565 */
1474static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) 1566static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1475{ 1567{
1476 struct task_struct *curr = rq->curr; 1568 struct task_struct *curr = rq->curr;
1477 struct sched_entity *se = &curr->se, *pse = &p->se; 1569 struct sched_entity *se = &curr->se, *pse = &p->se;
1478 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1570 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1571 int sync = wake_flags & WF_SYNC;
1479 1572
1480 update_curr(cfs_rq); 1573 update_curr(cfs_rq);
1481 1574
@@ -1501,7 +1594,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1501 */ 1594 */
1502 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) 1595 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
1503 set_last_buddy(se); 1596 set_last_buddy(se);
1504 set_next_buddy(pse); 1597 if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
1598 set_next_buddy(pse);
1505 1599
1506 /* 1600 /*
1507 * We can come here with TIF_NEED_RESCHED already set from new task 1601 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1523,16 +1617,25 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1523 return; 1617 return;
1524 } 1618 }
1525 1619
1526 if (!sched_feat(WAKEUP_PREEMPT)) 1620 if ((sched_feat(WAKEUP_SYNC) && sync) ||
1527 return; 1621 (sched_feat(WAKEUP_OVERLAP) &&
1528 1622 (se->avg_overlap < sysctl_sched_migration_cost &&
1529 if (sched_feat(WAKEUP_OVERLAP) && (sync || 1623 pse->avg_overlap < sysctl_sched_migration_cost))) {
1530 (se->avg_overlap < sysctl_sched_migration_cost &&
1531 pse->avg_overlap < sysctl_sched_migration_cost))) {
1532 resched_task(curr); 1624 resched_task(curr);
1533 return; 1625 return;
1534 } 1626 }
1535 1627
1628 if (sched_feat(WAKEUP_RUNNING)) {
1629 if (pse->avg_running < se->avg_running) {
1630 set_next_buddy(pse);
1631 resched_task(curr);
1632 return;
1633 }
1634 }
1635
1636 if (!sched_feat(WAKEUP_PREEMPT))
1637 return;
1638
1536 find_matching_se(&se, &pse); 1639 find_matching_se(&se, &pse);
1537 1640
1538 BUG_ON(!pse); 1641 BUG_ON(!pse);
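
check_preempt_wakeup() now evaluates the wakeup-preemption knobs in a fixed order: WAKEUP_SYNC (preempt on a sync hint), WAKEUP_OVERLAP (preempt when both tasks have short overlap), WAKEUP_RUNNING (buddy up and preempt when the woken task has been running for less time), and only then the blanket WAKEUP_PREEMPT gate. A hedged sketch of that decision order, with the feature bits passed in as plain booleans and the final granularity comparison reduced to a comment:

    #include <stdio.h>

    struct toy_task {
        unsigned long avg_overlap;   /* ns */
        unsigned long avg_running;   /* ns */
    };

    static const unsigned long migration_cost = 500000; /* 0.5ms, illustrative */

    /* Returns 1 if the woken task 'pse' should preempt the current task 'se'. */
    static int toy_should_preempt(const struct toy_task *se,
                                  const struct toy_task *pse,
                                  int sync, int feat_sync, int feat_overlap,
                                  int feat_running, int feat_preempt)
    {
        if (feat_sync && sync)
            return 1;

        if (feat_overlap &&
            se->avg_overlap < migration_cost &&
            pse->avg_overlap < migration_cost)
            return 1;

        if (feat_running && pse->avg_running < se->avg_running)
            return 1;

        /* Everything else is gated by the global WAKEUP_PREEMPT knob. */
        if (!feat_preempt)
            return 0;

        return 1;   /* the real code then compares vruntime granularity */
    }

    int main(void)
    {
        struct toy_task curr  = { .avg_overlap = 900000, .avg_running = 4000000 };
        struct toy_task woken = { .avg_overlap = 100000, .avg_running = 1000000 };

        printf("preempt: %d\n",
               toy_should_preempt(&curr, &woken, 0, 0, 0, 1, 1));
        return 0;
    }
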
@@ -1555,8 +1658,13 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1555 /* 1658 /*
1556 * If se was a buddy, clear it so that it will have to earn 1659 * If se was a buddy, clear it so that it will have to earn
1557 * the favour again. 1660 * the favour again.
1661 *
1662 * If se was not a buddy, clear the buddies because neither
1663 * was eligible to run, let them earn it again.
1664 *
1665 * IOW. unconditionally clear buddies.
1558 */ 1666 */
1559 __clear_buddies(cfs_rq, se); 1667 __clear_buddies(cfs_rq, NULL);
1560 set_next_entity(cfs_rq, se); 1668 set_next_entity(cfs_rq, se);
1561 cfs_rq = group_cfs_rq(se); 1669 cfs_rq = group_cfs_rq(se);
1562 } while (cfs_rq); 1670 } while (cfs_rq);
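
Together with the earlier __clear_buddies() hunk, passing NULL now means "clear both buddies unconditionally": a NULL se satisfies the !se test for both cfs_rq->last and cfs_rq->next. A few lines showing that sentinel convention; the struct and field names are abbreviated and purely illustrative:

    #include <stdio.h>
    #include <stddef.h>

    struct toy_entity { int id; };

    struct toy_cfs_rq {
        struct toy_entity *last;
        struct toy_entity *next;
    };

    /* NULL acts as a wildcard: clear a buddy slot if it holds 'se', or
     * clear both slots when se == NULL. */
    static void toy_clear_buddies(struct toy_cfs_rq *rq, struct toy_entity *se)
    {
        if (!se || rq->last == se)
            rq->last = NULL;
        if (!se || rq->next == se)
            rq->next = NULL;
    }

    int main(void)
    {
        struct toy_entity a = { 1 }, b = { 2 };
        struct toy_cfs_rq rq = { .last = &a, .next = &b };

        toy_clear_buddies(&rq, NULL);  /* wipes both, as pick_next_task_fair now does */
        printf("last=%p next=%p\n", (void *)rq.last, (void *)rq.next);
        return 0;
    }
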
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index e2dc63a5815d..0d94083582c7 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,17 +1,123 @@
1SCHED_FEAT(NEW_FAIR_SLEEPERS, 0) 1/*
2 * Disregards a certain amount of sleep time (sched_latency_ns) and
3 * considers the task to be running during that period. This gives it
4 * a service deficit on wakeup, allowing it to run sooner.
5 */
6SCHED_FEAT(FAIR_SLEEPERS, 1)
7
8/*
9 * Only give sleepers 50% of their service deficit. This allows
10 * them to run sooner, but does not allow tons of sleepers to
11 * rip the spread apart.
12 */
13SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
14
15/*
16 * By not normalizing the sleep time, heavy tasks get an effectively
17 * longer period, and lighter tasks an effectively shorter period during
18 * which they are considered running.
19 */
2SCHED_FEAT(NORMALIZED_SLEEPER, 0) 20SCHED_FEAT(NORMALIZED_SLEEPER, 0)
3SCHED_FEAT(ADAPTIVE_GRAN, 1) 21
4SCHED_FEAT(WAKEUP_PREEMPT, 1) 22/*
23 * Place new tasks ahead so that they do not starve already running
24 * tasks
25 */
5SCHED_FEAT(START_DEBIT, 1) 26SCHED_FEAT(START_DEBIT, 1)
27
28/*
29 * Should wakeups try to preempt running tasks.
30 */
31SCHED_FEAT(WAKEUP_PREEMPT, 1)
32
33/*
34 * Compute wakeup_gran based on task behaviour, clipped to
35 * [0, sched_wakeup_gran_ns]
36 */
37SCHED_FEAT(ADAPTIVE_GRAN, 1)
38
39/*
40 * When converting the wakeup granularity to virtual time, do it such
41 * that heavier tasks preempting a lighter task have an edge.
42 */
43SCHED_FEAT(ASYM_GRAN, 1)
44
45/*
46 * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
47 */
48SCHED_FEAT(WAKEUP_SYNC, 0)
49
50/*
51 * Wakeup preempt based on task behaviour. Tasks that do not overlap
52 * don't get preempted.
53 */
54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55
56/*
57 * Wakeup preemption towards tasks that run short
58 */
59SCHED_FEAT(WAKEUP_RUNNING, 0)
60
61/*
62 * Use the SYNC wakeup hint, pipes and the like use this to indicate
63 * the remote end is likely to consume the data we just wrote, and
64 * therefore has cache benefit from being placed on the same cpu, see
65 * also AFFINE_WAKEUPS.
66 */
67SCHED_FEAT(SYNC_WAKEUPS, 1)
68
69/*
70 * Based on load and program behaviour, see if it makes sense to place
71 * a newly woken task on the same cpu as the task that woke it --
72 * improve cache locality. Typically used with SYNC wakeups as
73 * generated by pipes and the like, see also SYNC_WAKEUPS.
74 */
6SCHED_FEAT(AFFINE_WAKEUPS, 1) 75SCHED_FEAT(AFFINE_WAKEUPS, 1)
76
77/*
78 * Weaken SYNC hint based on overlap
79 */
80SCHED_FEAT(SYNC_LESS, 1)
81
82/*
83 * Add SYNC hint based on overlap
84 */
85SCHED_FEAT(SYNC_MORE, 0)
86
87/*
88 * Prefer to schedule the task we woke last (assuming it failed
89 * wakeup-preemption), since it's likely going to consume data we
90 * touched, increases cache locality.
91 */
92SCHED_FEAT(NEXT_BUDDY, 0)
93
94/*
95 * Prefer to schedule the task that ran last (when we did
96 * wake-preempt) as that likely will touch the same data, increases
97 * cache locality.
98 */
99SCHED_FEAT(LAST_BUDDY, 1)
100
101/*
102 * Consider buddies to be cache hot, decreases the likelihood of a
103 * cache buddy being migrated away, increases cache locality.
104 */
7SCHED_FEAT(CACHE_HOT_BUDDY, 1) 105SCHED_FEAT(CACHE_HOT_BUDDY, 1)
8SCHED_FEAT(SYNC_WAKEUPS, 1) 106
107/*
108 * Use arch dependent cpu power functions
109 */
110SCHED_FEAT(ARCH_POWER, 0)
111
9SCHED_FEAT(HRTICK, 0) 112SCHED_FEAT(HRTICK, 0)
10SCHED_FEAT(DOUBLE_TICK, 0) 113SCHED_FEAT(DOUBLE_TICK, 0)
11SCHED_FEAT(ASYM_GRAN, 1)
12SCHED_FEAT(LB_BIAS, 1) 114SCHED_FEAT(LB_BIAS, 1)
13SCHED_FEAT(LB_WAKEUP_UPDATE, 1) 115SCHED_FEAT(LB_SHARES_UPDATE, 1)
14SCHED_FEAT(ASYM_EFF_LOAD, 1) 116SCHED_FEAT(ASYM_EFF_LOAD, 1)
15SCHED_FEAT(WAKEUP_OVERLAP, 0) 117
16SCHED_FEAT(LAST_BUDDY, 1) 118/*
119 * Spin-wait on mutex acquisition when the mutex owner is running on
120 * another cpu -- assumes that when the owner is running, it will soon
121 * release the lock. Decreases scheduling overhead.
122 */
17SCHED_FEAT(OWNER_SPIN, 1) 123SCHED_FEAT(OWNER_SPIN, 1)
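
sched_features.h is now heavily commented; each SCHED_FEAT(name, default) line both names a feature bit and sets its boot-time default, and the scheduler tests the bits with sched_feat(NAME) (toggleable at runtime via /sys/kernel/debug/sched_features when SCHED_DEBUG is enabled). A minimal user-space imitation of that X-macro pattern; the enum/bitmask plumbing below is invented for illustration, not copied from kernel/sched.c:

    #include <stdio.h>

    /* The feature list, written once, in SCHED_FEAT(name, default) form. */
    #define TOY_FEATURES(F)            \
        F(FAIR_SLEEPERS, 1)            \
        F(GENTLE_FAIR_SLEEPERS, 1)     \
        F(NEXT_BUDDY, 0)               \
        F(LAST_BUDDY, 1)

    /* First expansion: one enum bit index per feature. */
    #define MAKE_ENUM(name, enabled) TOY_FEAT_##name,
    enum { TOY_FEATURES(MAKE_ENUM) TOY_FEAT_NR };

    /* Second expansion: pack the defaults into one bitmask. */
    #define MAKE_MASK(name, enabled) ((enabled) << TOY_FEAT_##name) |
    static unsigned int toy_features = TOY_FEATURES(MAKE_MASK) 0;

    #define toy_feat(name) (toy_features & (1U << TOY_FEAT_##name))

    int main(void)
    {
        printf("GENTLE_FAIR_SLEEPERS: %s\n",
               toy_feat(GENTLE_FAIR_SLEEPERS) ? "on" : "off");
        printf("NEXT_BUDDY:           %s\n",
               toy_feat(NEXT_BUDDY) ? "on" : "off");
        return 0;
    }
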
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 499672c10cbd..a8b448af004b 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,7 @@
6 */ 6 */
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int select_task_rq_idle(struct task_struct *p, int sync) 9static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
10{ 10{
11 return task_cpu(p); /* IDLE tasks are never migrated */ 11 return task_cpu(p); /* IDLE tasks are never migrated */
12} 12}
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
14/* 14/*
15 * Idle tasks are unconditionally rescheduled: 15 * Idle tasks are unconditionally rescheduled:
16 */ 16 */
17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) 17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
18{ 18{
19 resched_task(rq->idle); 19 resched_task(rq->idle);
20} 20}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 2eb4bd6a526c..13de7126a6ab 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -938,10 +938,13 @@ static void yield_task_rt(struct rq *rq)
938#ifdef CONFIG_SMP 938#ifdef CONFIG_SMP
939static int find_lowest_rq(struct task_struct *task); 939static int find_lowest_rq(struct task_struct *task);
940 940
941static int select_task_rq_rt(struct task_struct *p, int sync) 941static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
942{ 942{
943 struct rq *rq = task_rq(p); 943 struct rq *rq = task_rq(p);
944 944
945 if (sd_flag != SD_BALANCE_WAKE)
946 return smp_processor_id();
947
945 /* 948 /*
946 * If the current task is an RT task, then 949 * If the current task is an RT task, then
947 * try to see if we can wake this RT task up on another 950 * try to see if we can wake this RT task up on another
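
With the class hook now taking sd_flag, the RT class only does placement work for SD_BALANCE_WAKE; fork/exec balancing requests fall straight back to the current CPU. A two-branch sketch of that early return; the flag value and CPU numbers are invented:

    #include <stdio.h>

    #define SD_BALANCE_WAKE 0x10   /* illustrative value only */

    static int this_cpu = 0, task_cpu = 3;

    /* RT tasks only pick a target CPU for wakeups; other balance requests
     * (fork/exec) simply stay on the calling CPU. */
    static int toy_select_task_rq_rt(int sd_flag)
    {
        if (sd_flag != SD_BALANCE_WAKE)
            return this_cpu;

        return task_cpu;   /* the real code then looks for a lower-priority CPU */
    }

    int main(void)
    {
        printf("fork balance -> cpu %d\n", toy_select_task_rq_rt(0));
        printf("wake balance -> cpu %d\n", toy_select_task_rq_rt(SD_BALANCE_WAKE));
        return 0;
    }
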
@@ -999,7 +1002,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
999/* 1002/*
1000 * Preempt the current task with a newly woken task if needed: 1003 * Preempt the current task with a newly woken task if needed:
1001 */ 1004 */
1002static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) 1005static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1003{ 1006{
1004 if (p->prio < rq->curr->prio) { 1007 if (p->prio < rq->curr->prio) {
1005 resched_task(rq->curr); 1008 resched_task(rq->curr);