-rw-r--r--  Documentation/sysctl/kernel.txt | 10
-rw-r--r--  arch/arm/include/asm/topology.h | 3
-rw-r--r--  arch/arm/kernel/process.c | 16
-rw-r--r--  arch/arm64/kernel/process.c | 7
-rw-r--r--  arch/ia64/include/asm/topology.h | 1
-rw-r--r--  arch/mips/include/asm/topology.h | 4
-rw-r--r--  arch/powerpc/include/asm/topology.h | 1
-rw-r--r--  arch/powerpc/platforms/cell/spufs/sched.c | 1
-rw-r--r--  arch/powerpc/platforms/powernv/setup.c | 13
-rw-r--r--  arch/powerpc/platforms/pseries/setup.c | 34
-rw-r--r--  arch/sh/kernel/idle.c | 4
-rw-r--r--  arch/sparc/include/asm/smp_64.h | 1
-rw-r--r--  arch/sparc/include/asm/topology_64.h | 2
-rw-r--r--  arch/sparc/kernel/mdesc.c | 4
-rw-r--r--  arch/sparc/kernel/prom_64.c | 3
-rw-r--r--  arch/sparc/kernel/smp_64.c | 2
-rw-r--r--  arch/x86/include/asm/topology.h | 6
-rw-r--r--  arch/x86/kernel/process.c | 5
-rw-r--r--  drivers/cpuidle/cpuidle-powernv.c | 5
-rw-r--r--  drivers/cpuidle/cpuidle-pseries.c | 6
-rw-r--r--  include/linux/sched.h | 44
-rw-r--r--  include/linux/sched/prio.h | 44
-rw-r--r--  include/linux/sched/rt.h | 26
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/cpu/Makefile | 1
-rw-r--r--  kernel/fork.c | 1
-rw-r--r--  kernel/locking/rtmutex.c | 12
-rw-r--r--  kernel/rcu/rcutorture.c | 8
-rw-r--r--  kernel/sched/Makefile | 2
-rw-r--r--  kernel/sched/auto_group.c | 2
-rw-r--r--  kernel/sched/core.c | 207
-rw-r--r--  kernel/sched/cputime.c | 4
-rw-r--r--  kernel/sched/deadline.c | 56
-rw-r--r--  kernel/sched/debug.c | 7
-rw-r--r--  kernel/sched/fair.c | 600
-rw-r--r--  kernel/sched/idle.c (renamed from kernel/cpu/idle.c) | 7
-rw-r--r--  kernel/sched/idle_task.c | 25
-rw-r--r--  kernel/sched/rt.c | 102
-rw-r--r--  kernel/sched/sched.h | 65
-rw-r--r--  kernel/sched/stop_task.c | 15
-rw-r--r--  kernel/sys.c | 8
-rw-r--r--  kernel/sysctl.c | 7
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 6
-rw-r--r--  kernel/workqueue.c | 2
-rw-r--r--  mm/mempolicy.c | 74
45 files changed, 913 insertions(+), 541 deletions(-)
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index e1d28fbf7570..ec8be46bf48d 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -442,8 +442,7 @@ feature should be disabled. Otherwise, if the system overhead from the
442feature is too high then the rate the kernel samples for NUMA hinting 442feature is too high then the rate the kernel samples for NUMA hinting
443faults may be controlled by the numa_balancing_scan_period_min_ms, 443faults may be controlled by the numa_balancing_scan_period_min_ms,
444numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, 444numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
445numa_balancing_scan_size_mb, numa_balancing_settle_count sysctls and 445numa_balancing_scan_size_mb, and numa_balancing_settle_count sysctls.
446numa_balancing_migrate_deferred.
447 446
448============================================================== 447==============================================================
449 448
@@ -484,13 +483,6 @@ rate for each task.
484numa_balancing_scan_size_mb is how many megabytes worth of pages are 483numa_balancing_scan_size_mb is how many megabytes worth of pages are
485scanned for a given scan. 484scanned for a given scan.
486 485
487numa_balancing_migrate_deferred is how many page migrations get skipped
488unconditionally, after a page migration is skipped because a page is shared
489with other tasks. This reduces page migration overhead, and determines
490how much stronger the "move task near its memory" policy scheduler becomes,
491versus the "move memory near its task" memory management policy, for workloads
492with shared memory.
493
494============================================================== 486==============================================================
495 487
496osrelease, ostype & version: 488osrelease, ostype & version:
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 58b8b84adcd2..2fe85fff5cca 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -20,9 +20,6 @@ extern struct cputopo_arm cpu_topology[NR_CPUS];
20#define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling) 20#define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling)
21#define topology_thread_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) 21#define topology_thread_cpumask(cpu) (&cpu_topology[cpu].thread_sibling)
22 22
23#define mc_capable() (cpu_topology[0].socket_id != -1)
24#define smt_capable() (cpu_topology[0].thread_id != -1)
25
26void init_cpu_topology(void); 23void init_cpu_topology(void);
27void store_cpu_topology(unsigned int cpuid); 24void store_cpu_topology(unsigned int cpuid);
28const struct cpumask *cpu_coregroup_mask(int cpu); 25const struct cpumask *cpu_coregroup_mask(int cpu);
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 92f7b15dd221..adabeababeb0 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -30,7 +30,6 @@
30#include <linux/uaccess.h> 30#include <linux/uaccess.h>
31#include <linux/random.h> 31#include <linux/random.h>
32#include <linux/hw_breakpoint.h> 32#include <linux/hw_breakpoint.h>
33#include <linux/cpuidle.h>
34#include <linux/leds.h> 33#include <linux/leds.h>
35#include <linux/reboot.h> 34#include <linux/reboot.h>
36 35
@@ -133,7 +132,11 @@ EXPORT_SYMBOL_GPL(arm_pm_restart);
133 132
134void (*arm_pm_idle)(void); 133void (*arm_pm_idle)(void);
135 134
136static void default_idle(void) 135/*
136 * Called from the core idle loop.
137 */
138
139void arch_cpu_idle(void)
137{ 140{
138 if (arm_pm_idle) 141 if (arm_pm_idle)
139 arm_pm_idle(); 142 arm_pm_idle();
@@ -168,15 +171,6 @@ void arch_cpu_idle_dead(void)
168#endif 171#endif
169 172
170/* 173/*
171 * Called from the core idle loop.
172 */
173void arch_cpu_idle(void)
174{
175 if (cpuidle_idle_call())
176 default_idle();
177}
178
179/*
180 * Called by kexec, immediately prior to machine_kexec(). 174 * Called by kexec, immediately prior to machine_kexec().
181 * 175 *
182 * This must completely disable all secondary CPUs; simply causing those CPUs 176 * This must completely disable all secondary CPUs; simply causing those CPUs
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 1c0a9be2ffa8..9cce0098f4cd 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -33,7 +33,6 @@
33#include <linux/kallsyms.h> 33#include <linux/kallsyms.h>
34#include <linux/init.h> 34#include <linux/init.h>
35#include <linux/cpu.h> 35#include <linux/cpu.h>
36#include <linux/cpuidle.h>
37#include <linux/elfcore.h> 36#include <linux/elfcore.h>
38#include <linux/pm.h> 37#include <linux/pm.h>
39#include <linux/tick.h> 38#include <linux/tick.h>
@@ -94,10 +93,8 @@ void arch_cpu_idle(void)
94 * This should do all the clock switching and wait for interrupt 93 * This should do all the clock switching and wait for interrupt
95 * tricks 94 * tricks
96 */ 95 */
97 if (cpuidle_idle_call()) { 96 cpu_do_idle();
98 cpu_do_idle(); 97 local_irq_enable();
99 local_irq_enable();
100 }
101} 98}
102 99
103#ifdef CONFIG_HOTPLUG_CPU 100#ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index a2496e449b75..5cb55a1e606b 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -77,7 +77,6 @@ void build_cpu_to_node_map(void);
77#define topology_core_id(cpu) (cpu_data(cpu)->core_id) 77#define topology_core_id(cpu) (cpu_data(cpu)->core_id)
78#define topology_core_cpumask(cpu) (&cpu_core_map[cpu]) 78#define topology_core_cpumask(cpu) (&cpu_core_map[cpu])
79#define topology_thread_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu)) 79#define topology_thread_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu))
80#define smt_capable() (smp_num_siblings > 1)
81#endif 80#endif
82 81
83extern void arch_fix_phys_package_id(int num, u32 slot); 82extern void arch_fix_phys_package_id(int num, u32 slot);
diff --git a/arch/mips/include/asm/topology.h b/arch/mips/include/asm/topology.h
index 12609a17dc8b..20ea4859c822 100644
--- a/arch/mips/include/asm/topology.h
+++ b/arch/mips/include/asm/topology.h
@@ -10,8 +10,4 @@
10 10
11#include <topology.h> 11#include <topology.h>
12 12
13#ifdef CONFIG_SMP
14#define smt_capable() (smp_num_siblings > 1)
15#endif
16
17#endif /* __ASM_TOPOLOGY_H */ 13#endif /* __ASM_TOPOLOGY_H */
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index d0b5fca6b077..c9202151079f 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -99,7 +99,6 @@ static inline int prrn_is_enabled(void)
99 99
100#ifdef CONFIG_SMP 100#ifdef CONFIG_SMP
101#include <asm/cputable.h> 101#include <asm/cputable.h>
102#define smt_capable() (cpu_has_feature(CPU_FTR_SMT))
103 102
104#ifdef CONFIG_PPC64 103#ifdef CONFIG_PPC64
105#include <asm/smp.h> 104#include <asm/smp.h>
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 49318385d4fa..4a0a64fe25df 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -83,7 +83,6 @@ static struct timer_list spuloadavg_timer;
83#define MIN_SPU_TIMESLICE max(5 * HZ / (1000 * SPUSCHED_TICK), 1) 83#define MIN_SPU_TIMESLICE max(5 * HZ / (1000 * SPUSCHED_TICK), 1)
84#define DEF_SPU_TIMESLICE (100 * HZ / (1000 * SPUSCHED_TICK)) 84#define DEF_SPU_TIMESLICE (100 * HZ / (1000 * SPUSCHED_TICK))
85 85
86#define MAX_USER_PRIO (MAX_PRIO - MAX_RT_PRIO)
87#define SCALE_PRIO(x, prio) \ 86#define SCALE_PRIO(x, prio) \
88 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE) 87 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE)
89 88
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 110f4fbd319f..81a7a0a79be7 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -26,7 +26,6 @@
26#include <linux/of_fdt.h> 26#include <linux/of_fdt.h>
27#include <linux/interrupt.h> 27#include <linux/interrupt.h>
28#include <linux/bug.h> 28#include <linux/bug.h>
29#include <linux/cpuidle.h>
30#include <linux/pci.h> 29#include <linux/pci.h>
31 30
32#include <asm/machdep.h> 31#include <asm/machdep.h>
@@ -225,16 +224,6 @@ static int __init pnv_probe(void)
225 return 1; 224 return 1;
226} 225}
227 226
228void powernv_idle(void)
229{
230 /* Hook to cpuidle framework if available, else
231 * call on default platform idle code
232 */
233 if (cpuidle_idle_call()) {
234 power7_idle();
235 }
236}
237
238define_machine(powernv) { 227define_machine(powernv) {
239 .name = "PowerNV", 228 .name = "PowerNV",
240 .probe = pnv_probe, 229 .probe = pnv_probe,
@@ -244,7 +233,7 @@ define_machine(powernv) {
244 .show_cpuinfo = pnv_show_cpuinfo, 233 .show_cpuinfo = pnv_show_cpuinfo,
245 .progress = pnv_progress, 234 .progress = pnv_progress,
246 .machine_shutdown = pnv_shutdown, 235 .machine_shutdown = pnv_shutdown,
247 .power_save = powernv_idle, 236 .power_save = power7_idle,
248 .calibrate_decr = generic_calibrate_decr, 237 .calibrate_decr = generic_calibrate_decr,
249 .dma_set_mask = pnv_dma_set_mask, 238 .dma_set_mask = pnv_dma_set_mask,
250#ifdef CONFIG_KEXEC 239#ifdef CONFIG_KEXEC
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 972df0ffd4dc..2db8cc691bf4 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -39,7 +39,6 @@
39#include <linux/irq.h> 39#include <linux/irq.h>
40#include <linux/seq_file.h> 40#include <linux/seq_file.h>
41#include <linux/root_dev.h> 41#include <linux/root_dev.h>
42#include <linux/cpuidle.h>
43#include <linux/of.h> 42#include <linux/of.h>
44#include <linux/kexec.h> 43#include <linux/kexec.h>
45 44
@@ -356,29 +355,24 @@ early_initcall(alloc_dispatch_log_kmem_cache);
356 355
357static void pseries_lpar_idle(void) 356static void pseries_lpar_idle(void)
358{ 357{
359 /* This would call on the cpuidle framework, and the back-end pseries 358 /*
360 * driver to go to idle states 359 * Default handler to go into low thread priority and possibly
360 * low power mode by cedeing processor to hypervisor
361 */ 361 */
362 if (cpuidle_idle_call()) {
363 /* On error, execute default handler
364 * to go into low thread priority and possibly
365 * low power mode by cedeing processor to hypervisor
366 */
367 362
368 /* Indicate to hypervisor that we are idle. */ 363 /* Indicate to hypervisor that we are idle. */
369 get_lppaca()->idle = 1; 364 get_lppaca()->idle = 1;
370 365
371 /* 366 /*
372 * Yield the processor to the hypervisor. We return if 367 * Yield the processor to the hypervisor. We return if
373 * an external interrupt occurs (which are driven prior 368 * an external interrupt occurs (which are driven prior
374 * to returning here) or if a prod occurs from another 369 * to returning here) or if a prod occurs from another
375 * processor. When returning here, external interrupts 370 * processor. When returning here, external interrupts
376 * are enabled. 371 * are enabled.
377 */ 372 */
378 cede_processor(); 373 cede_processor();
379 374
380 get_lppaca()->idle = 0; 375 get_lppaca()->idle = 0;
381 }
382} 376}
383 377
384/* 378/*
diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c
index 2ea4483fd722..be616ee0cf87 100644
--- a/arch/sh/kernel/idle.c
+++ b/arch/sh/kernel/idle.c
@@ -16,7 +16,6 @@
16#include <linux/thread_info.h> 16#include <linux/thread_info.h>
17#include <linux/irqflags.h> 17#include <linux/irqflags.h>
18#include <linux/smp.h> 18#include <linux/smp.h>
19#include <linux/cpuidle.h>
20#include <linux/atomic.h> 19#include <linux/atomic.h>
21#include <asm/pgalloc.h> 20#include <asm/pgalloc.h>
22#include <asm/smp.h> 21#include <asm/smp.h>
@@ -40,8 +39,7 @@ void arch_cpu_idle_dead(void)
40 39
41void arch_cpu_idle(void) 40void arch_cpu_idle(void)
42{ 41{
43 if (cpuidle_idle_call()) 42 sh_idle();
44 sh_idle();
45} 43}
46 44
47void __init select_idle_routine(void) 45void __init select_idle_routine(void)
diff --git a/arch/sparc/include/asm/smp_64.h b/arch/sparc/include/asm/smp_64.h
index dd3bef4b9896..05710393959f 100644
--- a/arch/sparc/include/asm/smp_64.h
+++ b/arch/sparc/include/asm/smp_64.h
@@ -32,7 +32,6 @@
32 32
33DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); 33DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
34extern cpumask_t cpu_core_map[NR_CPUS]; 34extern cpumask_t cpu_core_map[NR_CPUS];
35extern int sparc64_multi_core;
36 35
37extern void arch_send_call_function_single_ipi(int cpu); 36extern void arch_send_call_function_single_ipi(int cpu);
38extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); 37extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index 1754390a426f..a2d10fc64faf 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -42,8 +42,6 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
42#define topology_core_id(cpu) (cpu_data(cpu).core_id) 42#define topology_core_id(cpu) (cpu_data(cpu).core_id)
43#define topology_core_cpumask(cpu) (&cpu_core_map[cpu]) 43#define topology_core_cpumask(cpu) (&cpu_core_map[cpu])
44#define topology_thread_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu)) 44#define topology_thread_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu))
45#define mc_capable() (sparc64_multi_core)
46#define smt_capable() (sparc64_multi_core)
47#endif /* CONFIG_SMP */ 45#endif /* CONFIG_SMP */
48 46
49extern cpumask_t cpu_core_map[NR_CPUS]; 47extern cpumask_t cpu_core_map[NR_CPUS];
diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c
index b90bf23e3aab..a1a4400d4025 100644
--- a/arch/sparc/kernel/mdesc.c
+++ b/arch/sparc/kernel/mdesc.c
@@ -896,10 +896,6 @@ void mdesc_fill_in_cpu_data(cpumask_t *mask)
896 896
897 mdesc_iterate_over_cpus(fill_in_one_cpu, NULL, mask); 897 mdesc_iterate_over_cpus(fill_in_one_cpu, NULL, mask);
898 898
899#ifdef CONFIG_SMP
900 sparc64_multi_core = 1;
901#endif
902
903 hp = mdesc_grab(); 899 hp = mdesc_grab();
904 900
905 set_core_ids(hp); 901 set_core_ids(hp);
diff --git a/arch/sparc/kernel/prom_64.c b/arch/sparc/kernel/prom_64.c
index 6b39125eb927..9a690d39c01b 100644
--- a/arch/sparc/kernel/prom_64.c
+++ b/arch/sparc/kernel/prom_64.c
@@ -555,9 +555,6 @@ static void *fill_in_one_cpu(struct device_node *dp, int cpuid, int arg)
555 555
556 cpu_data(cpuid).core_id = portid + 1; 556 cpu_data(cpuid).core_id = portid + 1;
557 cpu_data(cpuid).proc_id = portid; 557 cpu_data(cpuid).proc_id = portid;
558#ifdef CONFIG_SMP
559 sparc64_multi_core = 1;
560#endif
561 } else { 558 } else {
562 cpu_data(cpuid).dcache_size = 559 cpu_data(cpuid).dcache_size =
563 of_getintprop_default(dp, "dcache-size", 16 * 1024); 560 of_getintprop_default(dp, "dcache-size", 16 * 1024);
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index b085311dcd0e..9781048161ab 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -53,8 +53,6 @@
53 53
54#include "cpumap.h" 54#include "cpumap.h"
55 55
56int sparc64_multi_core __read_mostly;
57
58DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE; 56DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE;
59cpumask_t cpu_core_map[NR_CPUS] __read_mostly = 57cpumask_t cpu_core_map[NR_CPUS] __read_mostly =
60 { [0 ... NR_CPUS-1] = CPU_MASK_NONE }; 58 { [0 ... NR_CPUS-1] = CPU_MASK_NONE };
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 1306d117967d..b28097e4c8c3 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -134,12 +134,6 @@ static inline void arch_fix_phys_package_id(int num, u32 slot)
134struct pci_bus; 134struct pci_bus;
135void x86_pci_root_bus_resources(int bus, struct list_head *resources); 135void x86_pci_root_bus_resources(int bus, struct list_head *resources);
136 136
137#ifdef CONFIG_SMP
138#define mc_capable() ((boot_cpu_data.x86_max_cores > 1) && \
139 (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids))
140#define smt_capable() (smp_num_siblings > 1)
141#endif
142
143#ifdef CONFIG_NUMA 137#ifdef CONFIG_NUMA
144extern int get_mp_bus_to_node(int busnum); 138extern int get_mp_bus_to_node(int busnum);
145extern void set_mp_bus_to_node(int busnum, int node); 139extern void set_mp_bus_to_node(int busnum, int node);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3fb8d95ab8b5..4505e2a950d8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -298,10 +298,7 @@ void arch_cpu_idle_dead(void)
298 */ 298 */
299void arch_cpu_idle(void) 299void arch_cpu_idle(void)
300{ 300{
301 if (cpuidle_idle_call()) 301 x86_idle();
302 x86_idle();
303 else
304 local_irq_enable();
305} 302}
306 303
307/* 304/*
diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
index 78fd174c57e8..f48607cd2540 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -14,6 +14,7 @@
14 14
15#include <asm/machdep.h> 15#include <asm/machdep.h>
16#include <asm/firmware.h> 16#include <asm/firmware.h>
17#include <asm/runlatch.h>
17 18
18struct cpuidle_driver powernv_idle_driver = { 19struct cpuidle_driver powernv_idle_driver = {
19 .name = "powernv_idle", 20 .name = "powernv_idle",
@@ -30,12 +31,14 @@ static int snooze_loop(struct cpuidle_device *dev,
30 local_irq_enable(); 31 local_irq_enable();
31 set_thread_flag(TIF_POLLING_NRFLAG); 32 set_thread_flag(TIF_POLLING_NRFLAG);
32 33
34 ppc64_runlatch_off();
33 while (!need_resched()) { 35 while (!need_resched()) {
34 HMT_low(); 36 HMT_low();
35 HMT_very_low(); 37 HMT_very_low();
36 } 38 }
37 39
38 HMT_medium(); 40 HMT_medium();
41 ppc64_runlatch_on();
39 clear_thread_flag(TIF_POLLING_NRFLAG); 42 clear_thread_flag(TIF_POLLING_NRFLAG);
40 smp_mb(); 43 smp_mb();
41 return index; 44 return index;
@@ -45,7 +48,9 @@ static int nap_loop(struct cpuidle_device *dev,
45 struct cpuidle_driver *drv, 48 struct cpuidle_driver *drv,
46 int index) 49 int index)
47{ 50{
51 ppc64_runlatch_off();
48 power7_idle(); 52 power7_idle();
53 ppc64_runlatch_on();
49 return index; 54 return index;
50} 55}
51 56
diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
index 7ab564aa0b1c..6f7b01956885 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -17,6 +17,7 @@
17#include <asm/reg.h> 17#include <asm/reg.h>
18#include <asm/machdep.h> 18#include <asm/machdep.h>
19#include <asm/firmware.h> 19#include <asm/firmware.h>
20#include <asm/runlatch.h>
20#include <asm/plpar_wrappers.h> 21#include <asm/plpar_wrappers.h>
21 22
22struct cpuidle_driver pseries_idle_driver = { 23struct cpuidle_driver pseries_idle_driver = {
@@ -29,6 +30,7 @@ static struct cpuidle_state *cpuidle_state_table;
29 30
30static inline void idle_loop_prolog(unsigned long *in_purr) 31static inline void idle_loop_prolog(unsigned long *in_purr)
31{ 32{
33 ppc64_runlatch_off();
32 *in_purr = mfspr(SPRN_PURR); 34 *in_purr = mfspr(SPRN_PURR);
33 /* 35 /*
34 * Indicate to the HV that we are idle. Now would be 36 * Indicate to the HV that we are idle. Now would be
@@ -45,6 +47,10 @@ static inline void idle_loop_epilog(unsigned long in_purr)
45 wait_cycles += mfspr(SPRN_PURR) - in_purr; 47 wait_cycles += mfspr(SPRN_PURR) - in_purr;
46 get_lppaca()->wait_state_cycles = cpu_to_be64(wait_cycles); 48 get_lppaca()->wait_state_cycles = cpu_to_be64(wait_cycles);
47 get_lppaca()->idle = 0; 49 get_lppaca()->idle = 0;
50
51 if (irqs_disabled())
52 local_irq_enable();
53 ppc64_runlatch_on();
48} 54}
49 55
50static int snooze_loop(struct cpuidle_device *dev, 56static int snooze_loop(struct cpuidle_device *dev,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a781dec1cd0b..825ed838d4b9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3,6 +3,8 @@
3 3
4#include <uapi/linux/sched.h> 4#include <uapi/linux/sched.h>
5 5
6#include <linux/sched/prio.h>
7
6 8
7struct sched_param { 9struct sched_param {
8 int sched_priority; 10 int sched_priority;
@@ -1077,6 +1079,7 @@ struct sched_entity {
1077#endif 1079#endif
1078 1080
1079#ifdef CONFIG_FAIR_GROUP_SCHED 1081#ifdef CONFIG_FAIR_GROUP_SCHED
1082 int depth;
1080 struct sched_entity *parent; 1083 struct sched_entity *parent;
1081 /* rq on which this entity is (to be) queued: */ 1084 /* rq on which this entity is (to be) queued: */
1082 struct cfs_rq *cfs_rq; 1085 struct cfs_rq *cfs_rq;
@@ -1460,6 +1463,9 @@ struct task_struct {
1460 struct mutex perf_event_mutex; 1463 struct mutex perf_event_mutex;
1461 struct list_head perf_event_list; 1464 struct list_head perf_event_list;
1462#endif 1465#endif
1466#ifdef CONFIG_DEBUG_PREEMPT
1467 unsigned long preempt_disable_ip;
1468#endif
1463#ifdef CONFIG_NUMA 1469#ifdef CONFIG_NUMA
1464 struct mempolicy *mempolicy; /* Protected by alloc_lock */ 1470 struct mempolicy *mempolicy; /* Protected by alloc_lock */
1465 short il_next; 1471 short il_next;
@@ -1470,9 +1476,10 @@ struct task_struct {
1470 unsigned int numa_scan_period; 1476 unsigned int numa_scan_period;
1471 unsigned int numa_scan_period_max; 1477 unsigned int numa_scan_period_max;
1472 int numa_preferred_nid; 1478 int numa_preferred_nid;
1473 int numa_migrate_deferred;
1474 unsigned long numa_migrate_retry; 1479 unsigned long numa_migrate_retry;
1475 u64 node_stamp; /* migration stamp */ 1480 u64 node_stamp; /* migration stamp */
1481 u64 last_task_numa_placement;
1482 u64 last_sum_exec_runtime;
1476 struct callback_head numa_work; 1483 struct callback_head numa_work;
1477 1484
1478 struct list_head numa_entry; 1485 struct list_head numa_entry;
@@ -1483,15 +1490,22 @@ struct task_struct {
1483 * Scheduling placement decisions are made based on the these counts. 1490 * Scheduling placement decisions are made based on the these counts.
1484 * The values remain static for the duration of a PTE scan 1491 * The values remain static for the duration of a PTE scan
1485 */ 1492 */
1486 unsigned long *numa_faults; 1493 unsigned long *numa_faults_memory;
1487 unsigned long total_numa_faults; 1494 unsigned long total_numa_faults;
1488 1495
1489 /* 1496 /*
1490 * numa_faults_buffer records faults per node during the current 1497 * numa_faults_buffer records faults per node during the current
1491 * scan window. When the scan completes, the counts in numa_faults 1498 * scan window. When the scan completes, the counts in
1492 * decay and these values are copied. 1499 * numa_faults_memory decay and these values are copied.
1500 */
1501 unsigned long *numa_faults_buffer_memory;
1502
1503 /*
1504 * Track the nodes the process was running on when a NUMA hinting
1505 * fault was incurred.
1493 */ 1506 */
1494 unsigned long *numa_faults_buffer; 1507 unsigned long *numa_faults_cpu;
1508 unsigned long *numa_faults_buffer_cpu;
1495 1509
1496 /* 1510 /*
1497 * numa_faults_locality tracks if faults recorded during the last 1511 * numa_faults_locality tracks if faults recorded during the last
@@ -1596,8 +1610,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags);
1596extern pid_t task_numa_group_id(struct task_struct *p); 1610extern pid_t task_numa_group_id(struct task_struct *p);
1597extern void set_numabalancing_state(bool enabled); 1611extern void set_numabalancing_state(bool enabled);
1598extern void task_numa_free(struct task_struct *p); 1612extern void task_numa_free(struct task_struct *p);
1599 1613extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
1600extern unsigned int sysctl_numa_balancing_migrate_deferred; 1614 int src_nid, int dst_cpu);
1601#else 1615#else
1602static inline void task_numa_fault(int last_node, int node, int pages, 1616static inline void task_numa_fault(int last_node, int node, int pages,
1603 int flags) 1617 int flags)
@@ -1613,6 +1627,11 @@ static inline void set_numabalancing_state(bool enabled)
1613static inline void task_numa_free(struct task_struct *p) 1627static inline void task_numa_free(struct task_struct *p)
1614{ 1628{
1615} 1629}
1630static inline bool should_numa_migrate_memory(struct task_struct *p,
1631 struct page *page, int src_nid, int dst_cpu)
1632{
1633 return true;
1634}
1616#endif 1635#endif
1617 1636
1618static inline struct pid *task_pid(struct task_struct *task) 1637static inline struct pid *task_pid(struct task_struct *task)
@@ -2080,7 +2099,16 @@ static inline void sched_autogroup_exit(struct signal_struct *sig) { }
2080extern bool yield_to(struct task_struct *p, bool preempt); 2099extern bool yield_to(struct task_struct *p, bool preempt);
2081extern void set_user_nice(struct task_struct *p, long nice); 2100extern void set_user_nice(struct task_struct *p, long nice);
2082extern int task_prio(const struct task_struct *p); 2101extern int task_prio(const struct task_struct *p);
2083extern int task_nice(const struct task_struct *p); 2102/**
2103 * task_nice - return the nice value of a given task.
2104 * @p: the task in question.
2105 *
2106 * Return: The nice value [ -20 ... 0 ... 19 ].
2107 */
2108static inline int task_nice(const struct task_struct *p)
2109{
2110 return PRIO_TO_NICE((p)->static_prio);
2111}
2084extern int can_nice(const struct task_struct *p, const int nice); 2112extern int can_nice(const struct task_struct *p, const int nice);
2085extern int task_curr(const struct task_struct *p); 2113extern int task_curr(const struct task_struct *p);
2086extern int idle_cpu(int cpu); 2114extern int idle_cpu(int cpu);
diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
new file mode 100644
index 000000000000..ac322583c820
--- /dev/null
+++ b/include/linux/sched/prio.h
@@ -0,0 +1,44 @@
1#ifndef _SCHED_PRIO_H
2#define _SCHED_PRIO_H
3
4#define MAX_NICE 19
5#define MIN_NICE -20
6#define NICE_WIDTH (MAX_NICE - MIN_NICE + 1)
7
8/*
9 * Priority of a process goes from 0..MAX_PRIO-1, valid RT
10 * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
11 * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
12 * values are inverted: lower p->prio value means higher priority.
13 *
14 * The MAX_USER_RT_PRIO value allows the actual maximum
15 * RT priority to be separate from the value exported to
16 * user-space. This allows kernel threads to set their
17 * priority to a value higher than any user task. Note:
18 * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
19 */
20
21#define MAX_USER_RT_PRIO 100
22#define MAX_RT_PRIO MAX_USER_RT_PRIO
23
24#define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH)
25#define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2)
26
27/*
28 * Convert user-nice values [ -20 ... 0 ... 19 ]
29 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
30 * and back.
31 */
32#define NICE_TO_PRIO(nice) ((nice) + DEFAULT_PRIO)
33#define PRIO_TO_NICE(prio) ((prio) - DEFAULT_PRIO)
34
35/*
36 * 'User priority' is the nice value converted to something we
37 * can work with better when scaling various scheduler parameters,
38 * it's a [ 0 ... 39 ] range.
39 */
40#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
41#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
42#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
43
44#endif /* _SCHED_PRIO_H */
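
The new header collects the nice/priority arithmetic that include/linux/sched/rt.h used to carry (and that spufs open-coded via its own MAX_USER_PRIO, removed above). As a quick sanity check, here is a small standalone userspace program -- illustration only, not part of the patch -- that expands the same macro definitions:

/* Illustration only: the prio.h constants, checked in userspace. */
#include <assert.h>

#define MAX_NICE		19
#define MIN_NICE		-20
#define NICE_WIDTH		(MAX_NICE - MIN_NICE + 1)	/* 40 */
#define MAX_RT_PRIO		100
#define MAX_PRIO		(MAX_RT_PRIO + NICE_WIDTH)	/* 140 */
#define DEFAULT_PRIO		(MAX_RT_PRIO + NICE_WIDTH / 2)	/* 120 */
#define NICE_TO_PRIO(nice)	((nice) + DEFAULT_PRIO)
#define PRIO_TO_NICE(prio)	((prio) - DEFAULT_PRIO)
#define USER_PRIO(p)		((p) - MAX_RT_PRIO)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))		/* 40 */

int main(void)
{
	assert(NICE_TO_PRIO(MIN_NICE) == MAX_RT_PRIO);	/* nice -20 -> prio 100 */
	assert(NICE_TO_PRIO(0) == DEFAULT_PRIO);	/* nice   0 -> prio 120 */
	assert(NICE_TO_PRIO(MAX_NICE) == MAX_PRIO - 1);	/* nice  19 -> prio 139 */
	assert(PRIO_TO_NICE(DEFAULT_PRIO) == 0);
	assert(MAX_USER_PRIO == 40);	/* the value spufs previously open-coded */
	return 0;
}

The same mapping is what the new static inline task_nice() in include/linux/sched.h relies on: PRIO_TO_NICE(p->static_prio).
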
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index 34e4ebea8fce..6341f5be6e24 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -1,24 +1,7 @@
1#ifndef _SCHED_RT_H 1#ifndef _SCHED_RT_H
2#define _SCHED_RT_H 2#define _SCHED_RT_H
3 3
4/* 4#include <linux/sched/prio.h>
5 * Priority of a process goes from 0..MAX_PRIO-1, valid RT
6 * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
7 * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
8 * values are inverted: lower p->prio value means higher priority.
9 *
10 * The MAX_USER_RT_PRIO value allows the actual maximum
11 * RT priority to be separate from the value exported to
12 * user-space. This allows kernel threads to set their
13 * priority to a value higher than any user task. Note:
14 * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
15 */
16
17#define MAX_USER_RT_PRIO 100
18#define MAX_RT_PRIO MAX_USER_RT_PRIO
19
20#define MAX_PRIO (MAX_RT_PRIO + 40)
21#define DEFAULT_PRIO (MAX_RT_PRIO + 20)
22 5
23static inline int rt_prio(int prio) 6static inline int rt_prio(int prio)
24{ 7{
@@ -35,6 +18,7 @@ static inline int rt_task(struct task_struct *p)
35#ifdef CONFIG_RT_MUTEXES 18#ifdef CONFIG_RT_MUTEXES
36extern int rt_mutex_getprio(struct task_struct *p); 19extern int rt_mutex_getprio(struct task_struct *p);
37extern void rt_mutex_setprio(struct task_struct *p, int prio); 20extern void rt_mutex_setprio(struct task_struct *p, int prio);
21extern int rt_mutex_check_prio(struct task_struct *task, int newprio);
38extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task); 22extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
39extern void rt_mutex_adjust_pi(struct task_struct *p); 23extern void rt_mutex_adjust_pi(struct task_struct *p);
40static inline bool tsk_is_pi_blocked(struct task_struct *tsk) 24static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
@@ -46,6 +30,12 @@ static inline int rt_mutex_getprio(struct task_struct *p)
46{ 30{
47 return p->normal_prio; 31 return p->normal_prio;
48} 32}
33
34static inline int rt_mutex_check_prio(struct task_struct *task, int newprio)
35{
36 return 0;
37}
38
49static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task) 39static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
50{ 40{
51 return NULL; 41 return NULL;
diff --git a/kernel/Makefile b/kernel/Makefile
index 5c0e7666811d..4fd847488b76 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -22,7 +22,6 @@ obj-y += sched/
22obj-y += locking/ 22obj-y += locking/
23obj-y += power/ 23obj-y += power/
24obj-y += printk/ 24obj-y += printk/
25obj-y += cpu/
26obj-y += irq/ 25obj-y += irq/
27obj-y += rcu/ 26obj-y += rcu/
28 27
diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile
deleted file mode 100644
index 59ab052ef7a0..000000000000
--- a/kernel/cpu/Makefile
+++ /dev/null
@@ -1 +0,0 @@
1obj-y = idle.o
diff --git a/kernel/fork.c b/kernel/fork.c
index a17621c6cd42..332688e5e7b4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -237,6 +237,7 @@ void __put_task_struct(struct task_struct *tsk)
237 WARN_ON(atomic_read(&tsk->usage)); 237 WARN_ON(atomic_read(&tsk->usage));
238 WARN_ON(tsk == current); 238 WARN_ON(tsk == current);
239 239
240 task_numa_free(tsk);
240 security_task_free(tsk); 241 security_task_free(tsk);
241 exit_creds(tsk); 242 exit_creds(tsk);
242 delayacct_tsk_free(tsk); 243 delayacct_tsk_free(tsk);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 2e960a2bab81..aa4dff04b594 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -213,6 +213,18 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
213} 213}
214 214
215/* 215/*
216 * Called by sched_setscheduler() to check whether the priority change
217 * is overruled by a possible priority boosting.
218 */
219int rt_mutex_check_prio(struct task_struct *task, int newprio)
220{
221 if (!task_has_pi_waiters(task))
222 return 0;
223
224 return task_top_pi_waiter(task)->task->prio <= newprio;
225}
226
227/*
216 * Adjust the priority of a task, after its pi_waiters got modified. 228 * Adjust the priority of a task, after its pi_waiters got modified.
217 * 229 *
218 * This can be both boosting and unboosting. task->pi_lock must be held. 230 * This can be both boosting and unboosting. task->pi_lock must be held.
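
rt_mutex_check_prio() returns nonzero when the task's current PI boost is at least as strong as the requested priority; __sched_setscheduler() (see the kernel/sched/core.c hunk further down) then only stores the new parameters and skips the dequeue/requeue. A worked example with illustrative values, assuming the RT mapping used by that hunk's newprio computation (kernel prio = MAX_RT_PRIO - 1 - sched_priority, lower number = higher priority):

/*
 * Worked example (illustrative values only):
 *
 *   top PI waiter: SCHED_FIFO, user priority 50 -> kernel prio 99 - 50 = 49
 *
 *   sched_setscheduler() requests user priority 30 -> newprio = 99 - 30 = 69
 *     rt_mutex_check_prio(): 49 <= 69 -> nonzero; the request is weaker than
 *     the boost, so only __setscheduler_params() runs and p->prio stays boosted.
 *
 *   sched_setscheduler() requests user priority 80 -> newprio = 99 - 80 = 19
 *     rt_mutex_check_prio(): 49 <= 19 -> 0; the request beats the boost and
 *     the normal dequeue/__setscheduler()/enqueue path is taken.
 */
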
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index f59d48597dde..bd30bc61bc05 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -696,7 +696,7 @@ rcu_torture_writer(void *arg)
696 static DEFINE_TORTURE_RANDOM(rand); 696 static DEFINE_TORTURE_RANDOM(rand);
697 697
698 VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); 698 VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
699 set_user_nice(current, 19); 699 set_user_nice(current, MAX_NICE);
700 700
701 do { 701 do {
702 schedule_timeout_uninterruptible(1); 702 schedule_timeout_uninterruptible(1);
@@ -759,7 +759,7 @@ rcu_torture_fakewriter(void *arg)
759 DEFINE_TORTURE_RANDOM(rand); 759 DEFINE_TORTURE_RANDOM(rand);
760 760
761 VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started"); 761 VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started");
762 set_user_nice(current, 19); 762 set_user_nice(current, MAX_NICE);
763 763
764 do { 764 do {
765 schedule_timeout_uninterruptible(1 + torture_random(&rand)%10); 765 schedule_timeout_uninterruptible(1 + torture_random(&rand)%10);
@@ -872,7 +872,7 @@ rcu_torture_reader(void *arg)
872 unsigned long long ts; 872 unsigned long long ts;
873 873
874 VERBOSE_TOROUT_STRING("rcu_torture_reader task started"); 874 VERBOSE_TOROUT_STRING("rcu_torture_reader task started");
875 set_user_nice(current, 19); 875 set_user_nice(current, MAX_NICE);
876 if (irqreader && cur_ops->irq_capable) 876 if (irqreader && cur_ops->irq_capable)
877 setup_timer_on_stack(&t, rcu_torture_timer, 0); 877 setup_timer_on_stack(&t, rcu_torture_timer, 0);
878 878
@@ -1161,7 +1161,7 @@ static int rcu_torture_barrier_cbs(void *arg)
1161 1161
1162 init_rcu_head_on_stack(&rcu); 1162 init_rcu_head_on_stack(&rcu);
1163 VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task started"); 1163 VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task started");
1164 set_user_nice(current, 19); 1164 set_user_nice(current, MAX_NICE);
1165 do { 1165 do {
1166 wait_event(barrier_cbs_wq[myid], 1166 wait_event(barrier_cbs_wq[myid],
1167 (newphase = 1167 (newphase =
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a95c8c2af2a..ab32b7b0db5c 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -13,7 +13,7 @@ endif
13 13
14obj-y += core.o proc.o clock.o cputime.o 14obj-y += core.o proc.o clock.o cputime.o
15obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o 15obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
16obj-y += wait.o completion.o 16obj-y += wait.o completion.o idle.o
17obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o 17obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
18obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 18obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
19obj-$(CONFIG_SCHEDSTATS) += stats.o 19obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 4a073539c58e..e73efba98301 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -203,7 +203,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
203 struct autogroup *ag; 203 struct autogroup *ag;
204 int err; 204 int err;
205 205
206 if (nice < -20 || nice > 19) 206 if (nice < MIN_NICE || nice > MAX_NICE)
207 return -EINVAL; 207 return -EINVAL;
208 208
209 err = security_task_setnice(current, nice); 209 err = security_task_setnice(current, nice);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f5c6635b806c..ae365aaa8181 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1745,8 +1745,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1745 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1745 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1746 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1746 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1747 p->numa_work.next = &p->numa_work; 1747 p->numa_work.next = &p->numa_work;
1748 p->numa_faults = NULL; 1748 p->numa_faults_memory = NULL;
1749 p->numa_faults_buffer = NULL; 1749 p->numa_faults_buffer_memory = NULL;
1750 p->last_task_numa_placement = 0;
1751 p->last_sum_exec_runtime = 0;
1750 1752
1751 INIT_LIST_HEAD(&p->numa_entry); 1753 INIT_LIST_HEAD(&p->numa_entry);
1752 p->numa_group = NULL; 1754 p->numa_group = NULL;
@@ -2149,8 +2151,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2149 if (mm) 2151 if (mm)
2150 mmdrop(mm); 2152 mmdrop(mm);
2151 if (unlikely(prev_state == TASK_DEAD)) { 2153 if (unlikely(prev_state == TASK_DEAD)) {
2152 task_numa_free(prev);
2153
2154 if (prev->sched_class->task_dead) 2154 if (prev->sched_class->task_dead)
2155 prev->sched_class->task_dead(prev); 2155 prev->sched_class->task_dead(prev);
2156 2156
@@ -2167,13 +2167,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2167 2167
2168#ifdef CONFIG_SMP 2168#ifdef CONFIG_SMP
2169 2169
2170/* assumes rq->lock is held */
2171static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2172{
2173 if (prev->sched_class->pre_schedule)
2174 prev->sched_class->pre_schedule(rq, prev);
2175}
2176
2177/* rq->lock is NOT held, but preemption is disabled */ 2170/* rq->lock is NOT held, but preemption is disabled */
2178static inline void post_schedule(struct rq *rq) 2171static inline void post_schedule(struct rq *rq)
2179{ 2172{
@@ -2191,10 +2184,6 @@ static inline void post_schedule(struct rq *rq)
2191 2184
2192#else 2185#else
2193 2186
2194static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2195{
2196}
2197
2198static inline void post_schedule(struct rq *rq) 2187static inline void post_schedule(struct rq *rq)
2199{ 2188{
2200} 2189}
@@ -2510,8 +2499,13 @@ void __kprobes preempt_count_add(int val)
2510 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 2499 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
2511 PREEMPT_MASK - 10); 2500 PREEMPT_MASK - 10);
2512#endif 2501#endif
2513 if (preempt_count() == val) 2502 if (preempt_count() == val) {
2514 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2503 unsigned long ip = get_parent_ip(CALLER_ADDR1);
2504#ifdef CONFIG_DEBUG_PREEMPT
2505 current->preempt_disable_ip = ip;
2506#endif
2507 trace_preempt_off(CALLER_ADDR0, ip);
2508 }
2515} 2509}
2516EXPORT_SYMBOL(preempt_count_add); 2510EXPORT_SYMBOL(preempt_count_add);
2517 2511
@@ -2554,6 +2548,13 @@ static noinline void __schedule_bug(struct task_struct *prev)
2554 print_modules(); 2548 print_modules();
2555 if (irqs_disabled()) 2549 if (irqs_disabled())
2556 print_irqtrace_events(prev); 2550 print_irqtrace_events(prev);
2551#ifdef CONFIG_DEBUG_PREEMPT
2552 if (in_atomic_preempt_off()) {
2553 pr_err("Preemption disabled at:");
2554 print_ip_sym(current->preempt_disable_ip);
2555 pr_cont("\n");
2556 }
2557#endif
2557 dump_stack(); 2558 dump_stack();
2558 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 2559 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
2559} 2560}
@@ -2577,36 +2578,34 @@ static inline void schedule_debug(struct task_struct *prev)
2577 schedstat_inc(this_rq(), sched_count); 2578 schedstat_inc(this_rq(), sched_count);
2578} 2579}
2579 2580
2580static void put_prev_task(struct rq *rq, struct task_struct *prev)
2581{
2582 if (prev->on_rq || rq->skip_clock_update < 0)
2583 update_rq_clock(rq);
2584 prev->sched_class->put_prev_task(rq, prev);
2585}
2586
2587/* 2581/*
2588 * Pick up the highest-prio task: 2582 * Pick up the highest-prio task:
2589 */ 2583 */
2590static inline struct task_struct * 2584static inline struct task_struct *
2591pick_next_task(struct rq *rq) 2585pick_next_task(struct rq *rq, struct task_struct *prev)
2592{ 2586{
2593 const struct sched_class *class; 2587 const struct sched_class *class = &fair_sched_class;
2594 struct task_struct *p; 2588 struct task_struct *p;
2595 2589
2596 /* 2590 /*
2597 * Optimization: we know that if all tasks are in 2591 * Optimization: we know that if all tasks are in
2598 * the fair class we can call that function directly: 2592 * the fair class we can call that function directly:
2599 */ 2593 */
2600 if (likely(rq->nr_running == rq->cfs.h_nr_running)) { 2594 if (likely(prev->sched_class == class &&
2601 p = fair_sched_class.pick_next_task(rq); 2595 rq->nr_running == rq->cfs.h_nr_running)) {
2602 if (likely(p)) 2596 p = fair_sched_class.pick_next_task(rq, prev);
2597 if (likely(p && p != RETRY_TASK))
2603 return p; 2598 return p;
2604 } 2599 }
2605 2600
2601again:
2606 for_each_class(class) { 2602 for_each_class(class) {
2607 p = class->pick_next_task(rq); 2603 p = class->pick_next_task(rq, prev);
2608 if (p) 2604 if (p) {
2605 if (unlikely(p == RETRY_TASK))
2606 goto again;
2609 return p; 2607 return p;
2608 }
2610 } 2609 }
2611 2610
2612 BUG(); /* the idle class will always have a runnable task */ 2611 BUG(); /* the idle class will always have a runnable task */
@@ -2700,13 +2699,10 @@ need_resched:
2700 switch_count = &prev->nvcsw; 2699 switch_count = &prev->nvcsw;
2701 } 2700 }
2702 2701
2703 pre_schedule(rq, prev); 2702 if (prev->on_rq || rq->skip_clock_update < 0)
2704 2703 update_rq_clock(rq);
2705 if (unlikely(!rq->nr_running))
2706 idle_balance(cpu, rq);
2707 2704
2708 put_prev_task(rq, prev); 2705 next = pick_next_task(rq, prev);
2709 next = pick_next_task(rq);
2710 clear_tsk_need_resched(prev); 2706 clear_tsk_need_resched(prev);
2711 clear_preempt_need_resched(); 2707 clear_preempt_need_resched();
2712 rq->skip_clock_update = 0; 2708 rq->skip_clock_update = 0;
@@ -2908,7 +2904,8 @@ EXPORT_SYMBOL(sleep_on_timeout);
2908 * This function changes the 'effective' priority of a task. It does 2904 * This function changes the 'effective' priority of a task. It does
2909 * not touch ->normal_prio like __setscheduler(). 2905 * not touch ->normal_prio like __setscheduler().
2910 * 2906 *
2911 * Used by the rt_mutex code to implement priority inheritance logic. 2907 * Used by the rt_mutex code to implement priority inheritance
2908 * logic. Call site only calls if the priority of the task changed.
2912 */ 2909 */
2913void rt_mutex_setprio(struct task_struct *p, int prio) 2910void rt_mutex_setprio(struct task_struct *p, int prio)
2914{ 2911{
@@ -2998,7 +2995,7 @@ void set_user_nice(struct task_struct *p, long nice)
2998 unsigned long flags; 2995 unsigned long flags;
2999 struct rq *rq; 2996 struct rq *rq;
3000 2997
3001 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 2998 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
3002 return; 2999 return;
3003 /* 3000 /*
3004 * We have to be careful, if called from sys_setpriority(), 3001 * We have to be careful, if called from sys_setpriority(),
@@ -3076,11 +3073,11 @@ SYSCALL_DEFINE1(nice, int, increment)
3076 if (increment > 40) 3073 if (increment > 40)
3077 increment = 40; 3074 increment = 40;
3078 3075
3079 nice = TASK_NICE(current) + increment; 3076 nice = task_nice(current) + increment;
3080 if (nice < -20) 3077 if (nice < MIN_NICE)
3081 nice = -20; 3078 nice = MIN_NICE;
3082 if (nice > 19) 3079 if (nice > MAX_NICE)
3083 nice = 19; 3080 nice = MAX_NICE;
3084 3081
3085 if (increment < 0 && !can_nice(current, nice)) 3082 if (increment < 0 && !can_nice(current, nice))
3086 return -EPERM; 3083 return -EPERM;
@@ -3109,18 +3106,6 @@ int task_prio(const struct task_struct *p)
3109} 3106}
3110 3107
3111/** 3108/**
3112 * task_nice - return the nice value of a given task.
3113 * @p: the task in question.
3114 *
3115 * Return: The nice value [ -20 ... 0 ... 19 ].
3116 */
3117int task_nice(const struct task_struct *p)
3118{
3119 return TASK_NICE(p);
3120}
3121EXPORT_SYMBOL(task_nice);
3122
3123/**
3124 * idle_cpu - is a given cpu idle currently? 3109 * idle_cpu - is a given cpu idle currently?
3125 * @cpu: the processor in question. 3110 * @cpu: the processor in question.
3126 * 3111 *
@@ -3189,9 +3174,8 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3189 dl_se->dl_new = 1; 3174 dl_se->dl_new = 1;
3190} 3175}
3191 3176
3192/* Actually do priority change: must hold pi & rq lock. */ 3177static void __setscheduler_params(struct task_struct *p,
3193static void __setscheduler(struct rq *rq, struct task_struct *p, 3178 const struct sched_attr *attr)
3194 const struct sched_attr *attr)
3195{ 3179{
3196 int policy = attr->sched_policy; 3180 int policy = attr->sched_policy;
3197 3181
@@ -3211,9 +3195,21 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
3211 * getparam()/getattr() don't report silly values for !rt tasks. 3195 * getparam()/getattr() don't report silly values for !rt tasks.
3212 */ 3196 */
3213 p->rt_priority = attr->sched_priority; 3197 p->rt_priority = attr->sched_priority;
3214
3215 p->normal_prio = normal_prio(p); 3198 p->normal_prio = normal_prio(p);
3216 p->prio = rt_mutex_getprio(p); 3199 set_load_weight(p);
3200}
3201
3202/* Actually do priority change: must hold pi & rq lock. */
3203static void __setscheduler(struct rq *rq, struct task_struct *p,
3204 const struct sched_attr *attr)
3205{
3206 __setscheduler_params(p, attr);
3207
3208 /*
3209 * If we get here, there was no pi waiters boosting the
3210 * task. It is safe to use the normal prio.
3211 */
3212 p->prio = normal_prio(p);
3217 3213
3218 if (dl_prio(p->prio)) 3214 if (dl_prio(p->prio))
3219 p->sched_class = &dl_sched_class; 3215 p->sched_class = &dl_sched_class;
@@ -3221,8 +3217,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
3221 p->sched_class = &rt_sched_class; 3217 p->sched_class = &rt_sched_class;
3222 else 3218 else
3223 p->sched_class = &fair_sched_class; 3219 p->sched_class = &fair_sched_class;
3224
3225 set_load_weight(p);
3226} 3220}
3227 3221
3228static void 3222static void
@@ -3275,6 +3269,8 @@ static int __sched_setscheduler(struct task_struct *p,
3275 const struct sched_attr *attr, 3269 const struct sched_attr *attr,
3276 bool user) 3270 bool user)
3277{ 3271{
3272 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
3273 MAX_RT_PRIO - 1 - attr->sched_priority;
3278 int retval, oldprio, oldpolicy = -1, on_rq, running; 3274 int retval, oldprio, oldpolicy = -1, on_rq, running;
3279 int policy = attr->sched_policy; 3275 int policy = attr->sched_policy;
3280 unsigned long flags; 3276 unsigned long flags;
@@ -3319,7 +3315,7 @@ recheck:
3319 */ 3315 */
3320 if (user && !capable(CAP_SYS_NICE)) { 3316 if (user && !capable(CAP_SYS_NICE)) {
3321 if (fair_policy(policy)) { 3317 if (fair_policy(policy)) {
3322 if (attr->sched_nice < TASK_NICE(p) && 3318 if (attr->sched_nice < task_nice(p) &&
3323 !can_nice(p, attr->sched_nice)) 3319 !can_nice(p, attr->sched_nice))
3324 return -EPERM; 3320 return -EPERM;
3325 } 3321 }
@@ -3352,7 +3348,7 @@ recheck:
3352 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 3348 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
3353 */ 3349 */
3354 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 3350 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
3355 if (!can_nice(p, TASK_NICE(p))) 3351 if (!can_nice(p, task_nice(p)))
3356 return -EPERM; 3352 return -EPERM;
3357 } 3353 }
3358 3354
@@ -3389,16 +3385,18 @@ recheck:
3389 } 3385 }
3390 3386
3391 /* 3387 /*
3392 * If not changing anything there's no need to proceed further: 3388 * If not changing anything there's no need to proceed further,
3389 * but store a possible modification of reset_on_fork.
3393 */ 3390 */
3394 if (unlikely(policy == p->policy)) { 3391 if (unlikely(policy == p->policy)) {
3395 if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p)) 3392 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
3396 goto change; 3393 goto change;
3397 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 3394 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
3398 goto change; 3395 goto change;
3399 if (dl_policy(policy)) 3396 if (dl_policy(policy))
3400 goto change; 3397 goto change;
3401 3398
3399 p->sched_reset_on_fork = reset_on_fork;
3402 task_rq_unlock(rq, p, &flags); 3400 task_rq_unlock(rq, p, &flags);
3403 return 0; 3401 return 0;
3404 } 3402 }
@@ -3452,6 +3450,24 @@ change:
3452 return -EBUSY; 3450 return -EBUSY;
3453 } 3451 }
3454 3452
3453 p->sched_reset_on_fork = reset_on_fork;
3454 oldprio = p->prio;
3455
3456 /*
3457 * Special case for priority boosted tasks.
3458 *
3459 * If the new priority is lower or equal (user space view)
3460 * than the current (boosted) priority, we just store the new
3461 * normal parameters and do not touch the scheduler class and
3462 * the runqueue. This will be done when the task deboost
3463 * itself.
3464 */
3465 if (rt_mutex_check_prio(p, newprio)) {
3466 __setscheduler_params(p, attr);
3467 task_rq_unlock(rq, p, &flags);
3468 return 0;
3469 }
3470
3455 on_rq = p->on_rq; 3471 on_rq = p->on_rq;
3456 running = task_current(rq, p); 3472 running = task_current(rq, p);
3457 if (on_rq) 3473 if (on_rq)
@@ -3459,16 +3475,18 @@ change:
3459 if (running) 3475 if (running)
3460 p->sched_class->put_prev_task(rq, p); 3476 p->sched_class->put_prev_task(rq, p);
3461 3477
3462 p->sched_reset_on_fork = reset_on_fork;
3463
3464 oldprio = p->prio;
3465 prev_class = p->sched_class; 3478 prev_class = p->sched_class;
3466 __setscheduler(rq, p, attr); 3479 __setscheduler(rq, p, attr);
3467 3480
3468 if (running) 3481 if (running)
3469 p->sched_class->set_curr_task(rq); 3482 p->sched_class->set_curr_task(rq);
3470 if (on_rq) 3483 if (on_rq) {
3471 enqueue_task(rq, p, 0); 3484 /*
3485 * We enqueue to tail when the priority of a task is
3486 * increased (user space view).
3487 */
3488 enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
3489 }
3472 3490
3473 check_class_changed(rq, p, prev_class, oldprio); 3491 check_class_changed(rq, p, prev_class, oldprio);
3474 task_rq_unlock(rq, p, &flags); 3492 task_rq_unlock(rq, p, &flags);
@@ -3624,7 +3642,7 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
3624 * XXX: do we want to be lenient like existing syscalls; or do we want 3642 * XXX: do we want to be lenient like existing syscalls; or do we want
3625 * to be strict and return an error on out-of-bounds values? 3643 * to be strict and return an error on out-of-bounds values?
3626 */ 3644 */
3627 attr->sched_nice = clamp(attr->sched_nice, -20, 19); 3645 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
3628 3646
3629out: 3647out:
3630 return ret; 3648 return ret;
@@ -3845,7 +3863,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3845 else if (task_has_rt_policy(p)) 3863 else if (task_has_rt_policy(p))
3846 attr.sched_priority = p->rt_priority; 3864 attr.sched_priority = p->rt_priority;
3847 else 3865 else
3848 attr.sched_nice = TASK_NICE(p); 3866 attr.sched_nice = task_nice(p);
3849 3867
3850 rcu_read_unlock(); 3868 rcu_read_unlock();
3851 3869
@@ -4483,6 +4501,7 @@ void init_idle(struct task_struct *idle, int cpu)
4483 rcu_read_unlock(); 4501 rcu_read_unlock();
4484 4502
4485 rq->curr = rq->idle = idle; 4503 rq->curr = rq->idle = idle;
4504 idle->on_rq = 1;
4486#if defined(CONFIG_SMP) 4505#if defined(CONFIG_SMP)
4487 idle->on_cpu = 1; 4506 idle->on_cpu = 1;
4488#endif 4507#endif
@@ -4721,6 +4740,22 @@ static void calc_load_migrate(struct rq *rq)
4721 atomic_long_add(delta, &calc_load_tasks); 4740 atomic_long_add(delta, &calc_load_tasks);
4722} 4741}
4723 4742
4743static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
4744{
4745}
4746
4747static const struct sched_class fake_sched_class = {
4748 .put_prev_task = put_prev_task_fake,
4749};
4750
4751static struct task_struct fake_task = {
4752 /*
4753 * Avoid pull_{rt,dl}_task()
4754 */
4755 .prio = MAX_PRIO + 1,
4756 .sched_class = &fake_sched_class,
4757};
4758
4724/* 4759/*
4725 * Migrate all tasks from the rq, sleeping tasks will be migrated by 4760 * Migrate all tasks from the rq, sleeping tasks will be migrated by
4726 * try_to_wake_up()->select_task_rq(). 4761 * try_to_wake_up()->select_task_rq().
@@ -4761,7 +4796,7 @@ static void migrate_tasks(unsigned int dead_cpu)
4761 if (rq->nr_running == 1) 4796 if (rq->nr_running == 1)
4762 break; 4797 break;
4763 4798
4764 next = pick_next_task(rq); 4799 next = pick_next_task(rq, &fake_task);
4765 BUG_ON(!next); 4800 BUG_ON(!next);
4766 next->sched_class->put_prev_task(rq, next); 4801 next->sched_class->put_prev_task(rq, next);
4767 4802
@@ -4851,7 +4886,7 @@ set_table_entry(struct ctl_table *entry,
4851static struct ctl_table * 4886static struct ctl_table *
4852sd_alloc_ctl_domain_table(struct sched_domain *sd) 4887sd_alloc_ctl_domain_table(struct sched_domain *sd)
4853{ 4888{
4854 struct ctl_table *table = sd_alloc_ctl_entry(13); 4889 struct ctl_table *table = sd_alloc_ctl_entry(14);
4855 4890
4856 if (table == NULL) 4891 if (table == NULL)
4857 return NULL; 4892 return NULL;
@@ -4879,9 +4914,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
4879 sizeof(int), 0644, proc_dointvec_minmax, false); 4914 sizeof(int), 0644, proc_dointvec_minmax, false);
4880 set_table_entry(&table[10], "flags", &sd->flags, 4915 set_table_entry(&table[10], "flags", &sd->flags,
4881 sizeof(int), 0644, proc_dointvec_minmax, false); 4916 sizeof(int), 0644, proc_dointvec_minmax, false);
4882 set_table_entry(&table[11], "name", sd->name, 4917 set_table_entry(&table[11], "max_newidle_lb_cost",
4918 &sd->max_newidle_lb_cost,
4919 sizeof(long), 0644, proc_doulongvec_minmax, false);
4920 set_table_entry(&table[12], "name", sd->name,
4883 CORENAME_MAX_SIZE, 0444, proc_dostring, false); 4921 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
4884 /* &table[12] is terminator */ 4922 /* &table[13] is terminator */
4885 4923
4886 return table; 4924 return table;
4887} 4925}
@@ -6858,7 +6896,6 @@ void __init sched_init(void)
6858 6896
6859 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 6897 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
6860#ifdef CONFIG_RT_GROUP_SCHED 6898#ifdef CONFIG_RT_GROUP_SCHED
6861 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
6862 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 6899 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6863#endif 6900#endif
6864 6901
@@ -6947,7 +6984,8 @@ void __might_sleep(const char *file, int line, int preempt_offset)
6947 static unsigned long prev_jiffy; /* ratelimiting */ 6984 static unsigned long prev_jiffy; /* ratelimiting */
6948 6985
6949 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ 6986 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
6950 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 6987 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
6988 !is_idle_task(current)) ||
6951 system_state != SYSTEM_RUNNING || oops_in_progress) 6989 system_state != SYSTEM_RUNNING || oops_in_progress)
6952 return; 6990 return;
6953 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6991 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
@@ -6965,6 +7003,13 @@ void __might_sleep(const char *file, int line, int preempt_offset)
6965 debug_show_held_locks(current); 7003 debug_show_held_locks(current);
6966 if (irqs_disabled()) 7004 if (irqs_disabled())
6967 print_irqtrace_events(current); 7005 print_irqtrace_events(current);
7006#ifdef CONFIG_DEBUG_PREEMPT
7007 if (!preempt_count_equals(preempt_offset)) {
7008 pr_err("Preemption disabled at:");
7009 print_ip_sym(current->preempt_disable_ip);
7010 pr_cont("\n");
7011 }
7012#endif
6968 dump_stack(); 7013 dump_stack();
6969} 7014}
6970EXPORT_SYMBOL(__might_sleep); 7015EXPORT_SYMBOL(__might_sleep);
@@ -7018,7 +7063,7 @@ void normalize_rt_tasks(void)
7018 * Renice negative nice level userspace 7063 * Renice negative nice level userspace
7019 * tasks back to 0: 7064 * tasks back to 0:
7020 */ 7065 */
7021 if (TASK_NICE(p) < 0 && p->mm) 7066 if (task_nice(p) < 0 && p->mm)
7022 set_user_nice(p, 0); 7067 set_user_nice(p, 0);
7023 continue; 7068 continue;
7024 } 7069 }
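The TASK_NICE(p) macro uses here, and in cputime.c below, become calls to a task_nice() helper; the underlying nice/priority mapping does not change. A hedged standalone sketch of that mapping, where the constants follow the usual convention of nice -20..19 mapping onto static priorities 100..139 and the helper and macro names are assumptions:

    #include <stdio.h>

    #define MAX_RT_PRIO     100
    #define DEFAULT_PRIO    (MAX_RT_PRIO + 20)     /* nice 0 maps to prio 120 */
    #define NICE_TO_PRIO(n) ((n) + DEFAULT_PRIO)   /* nice -20..19 -> prio 100..139 */
    #define PRIO_TO_NICE(p) ((p) - DEFAULT_PRIO)

    struct task { int static_prio; };

    static int task_nice(const struct task *p)
    {
        return PRIO_TO_NICE(p->static_prio);
    }

    int main(void)
    {
        struct task t = { .static_prio = NICE_TO_PRIO(-5) };

        /* the normalize_rt_tasks() check above: negative nice gets reset to 0 */
        if (task_nice(&t) < 0)
            t.static_prio = NICE_TO_PRIO(0);
        printf("nice after normalize: %d\n", task_nice(&t));   /* 0 */
        return 0;
    }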
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 99947919e30b..58624a65f124 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -142,7 +142,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
142 p->utimescaled += cputime_scaled; 142 p->utimescaled += cputime_scaled;
143 account_group_user_time(p, cputime); 143 account_group_user_time(p, cputime);
144 144
145 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; 145 index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
146 146
147 /* Add user time to cpustat. */ 147 /* Add user time to cpustat. */
148 task_group_account_field(p, index, (__force u64) cputime); 148 task_group_account_field(p, index, (__force u64) cputime);
@@ -169,7 +169,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
169 p->gtime += cputime; 169 p->gtime += cputime;
170 170
171 /* Add guest time to cpustat. */ 171 /* Add guest time to cpustat. */
172 if (TASK_NICE(p) > 0) { 172 if (task_nice(p) > 0) {
173 cpustat[CPUTIME_NICE] += (__force u64) cputime; 173 cpustat[CPUTIME_NICE] += (__force u64) cputime;
174 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; 174 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
175 } else { 175 } else {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6e79b3faa4cd..27ef40925525 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -210,6 +210,16 @@ static inline int has_pushable_dl_tasks(struct rq *rq)
210 210
211static int push_dl_task(struct rq *rq); 211static int push_dl_task(struct rq *rq);
212 212
213static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
214{
215 return dl_task(prev);
216}
217
218static inline void set_post_schedule(struct rq *rq)
219{
220 rq->post_schedule = has_pushable_dl_tasks(rq);
221}
222
213#else 223#else
214 224
215static inline 225static inline
@@ -232,6 +242,19 @@ void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
232{ 242{
233} 243}
234 244
245static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
246{
247 return false;
248}
249
250static inline int pull_dl_task(struct rq *rq)
251{
252 return 0;
253}
254
255static inline void set_post_schedule(struct rq *rq)
256{
257}
235#endif /* CONFIG_SMP */ 258#endif /* CONFIG_SMP */
236 259
237static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); 260static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
@@ -586,8 +609,8 @@ static void update_curr_dl(struct rq *rq)
586 * approach need further study. 609 * approach need further study.
587 */ 610 */
588 delta_exec = rq_clock_task(rq) - curr->se.exec_start; 611 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
589 if (unlikely((s64)delta_exec < 0)) 612 if (unlikely((s64)delta_exec <= 0))
590 delta_exec = 0; 613 return;
591 614
592 schedstat_set(curr->se.statistics.exec_max, 615 schedstat_set(curr->se.statistics.exec_max,
593 max(curr->se.statistics.exec_max, delta_exec)); 616 max(curr->se.statistics.exec_max, delta_exec));
@@ -942,6 +965,8 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
942 resched_task(rq->curr); 965 resched_task(rq->curr);
943} 966}
944 967
968static int pull_dl_task(struct rq *this_rq);
969
945#endif /* CONFIG_SMP */ 970#endif /* CONFIG_SMP */
946 971
947/* 972/*
@@ -988,7 +1013,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
988 return rb_entry(left, struct sched_dl_entity, rb_node); 1013 return rb_entry(left, struct sched_dl_entity, rb_node);
989} 1014}
990 1015
991struct task_struct *pick_next_task_dl(struct rq *rq) 1016struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
992{ 1017{
993 struct sched_dl_entity *dl_se; 1018 struct sched_dl_entity *dl_se;
994 struct task_struct *p; 1019 struct task_struct *p;
@@ -996,9 +1021,20 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
996 1021
997 dl_rq = &rq->dl; 1022 dl_rq = &rq->dl;
998 1023
1024 if (need_pull_dl_task(rq, prev))
1025 pull_dl_task(rq);
1026 /*
1027 * When prev is DL, we may throttle it in put_prev_task().
1028 * So, we update time before we check for dl_nr_running.
1029 */
1030 if (prev->sched_class == &dl_sched_class)
1031 update_curr_dl(rq);
1032
999 if (unlikely(!dl_rq->dl_nr_running)) 1033 if (unlikely(!dl_rq->dl_nr_running))
1000 return NULL; 1034 return NULL;
1001 1035
1036 put_prev_task(rq, prev);
1037
1002 dl_se = pick_next_dl_entity(rq, dl_rq); 1038 dl_se = pick_next_dl_entity(rq, dl_rq);
1003 BUG_ON(!dl_se); 1039 BUG_ON(!dl_se);
1004 1040
@@ -1013,9 +1049,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
1013 start_hrtick_dl(rq, p); 1049 start_hrtick_dl(rq, p);
1014#endif 1050#endif
1015 1051
1016#ifdef CONFIG_SMP 1052 set_post_schedule(rq);
1017 rq->post_schedule = has_pushable_dl_tasks(rq);
1018#endif /* CONFIG_SMP */
1019 1053
1020 return p; 1054 return p;
1021} 1055}
@@ -1424,13 +1458,6 @@ skip:
1424 return ret; 1458 return ret;
1425} 1459}
1426 1460
1427static void pre_schedule_dl(struct rq *rq, struct task_struct *prev)
1428{
1429 /* Try to pull other tasks here */
1430 if (dl_task(prev))
1431 pull_dl_task(rq);
1432}
1433
1434static void post_schedule_dl(struct rq *rq) 1461static void post_schedule_dl(struct rq *rq)
1435{ 1462{
1436 push_dl_tasks(rq); 1463 push_dl_tasks(rq);
@@ -1558,7 +1585,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1558 if (unlikely(p->dl.dl_throttled)) 1585 if (unlikely(p->dl.dl_throttled))
1559 return; 1586 return;
1560 1587
1561 if (p->on_rq || rq->curr != p) { 1588 if (p->on_rq && rq->curr != p) {
1562#ifdef CONFIG_SMP 1589#ifdef CONFIG_SMP
1563 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) 1590 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
1564 /* Only reschedule if pushing failed */ 1591 /* Only reschedule if pushing failed */
@@ -1623,7 +1650,6 @@ const struct sched_class dl_sched_class = {
1623 .set_cpus_allowed = set_cpus_allowed_dl, 1650 .set_cpus_allowed = set_cpus_allowed_dl,
1624 .rq_online = rq_online_dl, 1651 .rq_online = rq_online_dl,
1625 .rq_offline = rq_offline_dl, 1652 .rq_offline = rq_offline_dl,
1626 .pre_schedule = pre_schedule_dl,
1627 .post_schedule = post_schedule_dl, 1653 .post_schedule = post_schedule_dl,
1628 .task_woken = task_woken_dl, 1654 .task_woken = task_woken_dl,
1629#endif 1655#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index dd52e7ffb10e..f3344c31632a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -321,6 +321,7 @@ do { \
321 P(sched_goidle); 321 P(sched_goidle);
322#ifdef CONFIG_SMP 322#ifdef CONFIG_SMP
323 P64(avg_idle); 323 P64(avg_idle);
324 P64(max_idle_balance_cost);
324#endif 325#endif
325 326
326 P(ttwu_count); 327 P(ttwu_count);
@@ -533,15 +534,15 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
533 unsigned long nr_faults = -1; 534 unsigned long nr_faults = -1;
534 int cpu_current, home_node; 535 int cpu_current, home_node;
535 536
536 if (p->numa_faults) 537 if (p->numa_faults_memory)
537 nr_faults = p->numa_faults[2*node + i]; 538 nr_faults = p->numa_faults_memory[2*node + i];
538 539
539 cpu_current = !i ? (task_node(p) == node) : 540 cpu_current = !i ? (task_node(p) == node) :
540 (pol && node_isset(node, pol->v.nodes)); 541 (pol && node_isset(node, pol->v.nodes));
541 542
542 home_node = (p->numa_preferred_nid == node); 543 home_node = (p->numa_preferred_nid == node);
543 544
544 SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n", 545 SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n",
545 i, node, cpu_current, home_node, nr_faults); 546 i, node, cpu_current, home_node, nr_faults);
546 } 547 }
547 } 548 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9b4c4f320130..7e9bd0b1fa9e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -322,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
322 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 322 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
323 323
324/* Do the two (enqueued) entities belong to the same group ? */ 324/* Do the two (enqueued) entities belong to the same group ? */
325static inline int 325static inline struct cfs_rq *
326is_same_group(struct sched_entity *se, struct sched_entity *pse) 326is_same_group(struct sched_entity *se, struct sched_entity *pse)
327{ 327{
328 if (se->cfs_rq == pse->cfs_rq) 328 if (se->cfs_rq == pse->cfs_rq)
329 return 1; 329 return se->cfs_rq;
330 330
331 return 0; 331 return NULL;
332} 332}
333 333
334static inline struct sched_entity *parent_entity(struct sched_entity *se) 334static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -336,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
336 return se->parent; 336 return se->parent;
337} 337}
338 338
339/* return depth at which a sched entity is present in the hierarchy */
340static inline int depth_se(struct sched_entity *se)
341{
342 int depth = 0;
343
344 for_each_sched_entity(se)
345 depth++;
346
347 return depth;
348}
349
350static void 339static void
351find_matching_se(struct sched_entity **se, struct sched_entity **pse) 340find_matching_se(struct sched_entity **se, struct sched_entity **pse)
352{ 341{
@@ -360,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
360 */ 349 */
361 350
362 /* First walk up until both entities are at same depth */ 351 /* First walk up until both entities are at same depth */
363 se_depth = depth_se(*se); 352 se_depth = (*se)->depth;
364 pse_depth = depth_se(*pse); 353 pse_depth = (*pse)->depth;
365 354
366 while (se_depth > pse_depth) { 355 while (se_depth > pse_depth) {
367 se_depth--; 356 se_depth--;
@@ -426,12 +415,6 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
426#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 415#define for_each_leaf_cfs_rq(rq, cfs_rq) \
427 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 416 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
428 417
429static inline int
430is_same_group(struct sched_entity *se, struct sched_entity *pse)
431{
432 return 1;
433}
434
435static inline struct sched_entity *parent_entity(struct sched_entity *se) 418static inline struct sched_entity *parent_entity(struct sched_entity *se)
436{ 419{
437 return NULL; 420 return NULL;
@@ -819,14 +802,6 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
819/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ 802/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
820unsigned int sysctl_numa_balancing_scan_delay = 1000; 803unsigned int sysctl_numa_balancing_scan_delay = 1000;
821 804
822/*
823 * After skipping a page migration on a shared page, skip N more numa page
824 * migrations unconditionally. This reduces the number of NUMA migrations
825 * in shared memory workloads, and has the effect of pulling tasks towards
826 * where their memory lives, over pulling the memory towards the task.
827 */
828unsigned int sysctl_numa_balancing_migrate_deferred = 16;
829
830static unsigned int task_nr_scan_windows(struct task_struct *p) 805static unsigned int task_nr_scan_windows(struct task_struct *p)
831{ 806{
832 unsigned long rss = 0; 807 unsigned long rss = 0;
@@ -893,10 +868,26 @@ struct numa_group {
893 struct list_head task_list; 868 struct list_head task_list;
894 869
895 struct rcu_head rcu; 870 struct rcu_head rcu;
871 nodemask_t active_nodes;
896 unsigned long total_faults; 872 unsigned long total_faults;
873 /*
874 * Faults_cpu is used to decide whether memory should move
875 * towards the CPU. As a consequence, these stats are weighted
876 * more by CPU use than by memory faults.
877 */
878 unsigned long *faults_cpu;
897 unsigned long faults[0]; 879 unsigned long faults[0];
898}; 880};
899 881
882/* Shared or private faults. */
883#define NR_NUMA_HINT_FAULT_TYPES 2
884
885/* Memory and CPU locality */
886#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
887
888/* Averaged statistics, and temporary buffers. */
889#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
890
900pid_t task_numa_group_id(struct task_struct *p) 891pid_t task_numa_group_id(struct task_struct *p)
901{ 892{
902 return p->numa_group ? p->numa_group->gid : 0; 893 return p->numa_group ? p->numa_group->gid : 0;
@@ -904,16 +895,16 @@ pid_t task_numa_group_id(struct task_struct *p)
904 895
905static inline int task_faults_idx(int nid, int priv) 896static inline int task_faults_idx(int nid, int priv)
906{ 897{
907 return 2 * nid + priv; 898 return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
908} 899}
909 900
910static inline unsigned long task_faults(struct task_struct *p, int nid) 901static inline unsigned long task_faults(struct task_struct *p, int nid)
911{ 902{
912 if (!p->numa_faults) 903 if (!p->numa_faults_memory)
913 return 0; 904 return 0;
914 905
915 return p->numa_faults[task_faults_idx(nid, 0)] + 906 return p->numa_faults_memory[task_faults_idx(nid, 0)] +
916 p->numa_faults[task_faults_idx(nid, 1)]; 907 p->numa_faults_memory[task_faults_idx(nid, 1)];
917} 908}
918 909
919static inline unsigned long group_faults(struct task_struct *p, int nid) 910static inline unsigned long group_faults(struct task_struct *p, int nid)
@@ -925,6 +916,12 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
925 p->numa_group->faults[task_faults_idx(nid, 1)]; 916 p->numa_group->faults[task_faults_idx(nid, 1)];
926} 917}
927 918
919static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
920{
921 return group->faults_cpu[task_faults_idx(nid, 0)] +
922 group->faults_cpu[task_faults_idx(nid, 1)];
923}
924
928/* 925/*
929 * These return the fraction of accesses done by a particular task, or 926 * These return the fraction of accesses done by a particular task, or
930 * task group, on a particular numa node. The group weight is given a 927 * task group, on a particular numa node. The group weight is given a
@@ -935,7 +932,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
935{ 932{
936 unsigned long total_faults; 933 unsigned long total_faults;
937 934
938 if (!p->numa_faults) 935 if (!p->numa_faults_memory)
939 return 0; 936 return 0;
940 937
941 total_faults = p->total_numa_faults; 938 total_faults = p->total_numa_faults;
@@ -954,6 +951,69 @@ static inline unsigned long group_weight(struct task_struct *p, int nid)
954 return 1000 * group_faults(p, nid) / p->numa_group->total_faults; 951 return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
955} 952}
956 953
954bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
955 int src_nid, int dst_cpu)
956{
957 struct numa_group *ng = p->numa_group;
958 int dst_nid = cpu_to_node(dst_cpu);
959 int last_cpupid, this_cpupid;
960
961 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
962
963 /*
964 * Multi-stage node selection is used in conjunction with a periodic
965 * migration fault to build a temporal task<->page relation. By using
966 * a two-stage filter we remove short/unlikely relations.
967 *
968 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
969 * a task's usage of a particular page (n_p) per total usage of this
970 * page (n_t) (in a given time-span) to a probability.
971 *
 972 * Our periodic faults will sample this probability; getting the
 973 * same result twice in a row, given these samples are fully
 974 * independent, then has probability P(n)^2, provided our sample period
975 * is sufficiently short compared to the usage pattern.
976 *
 977 * This quadratic squishes small probabilities, making it less likely we
978 * act on an unlikely task<->page relation.
979 */
980 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
981 if (!cpupid_pid_unset(last_cpupid) &&
982 cpupid_to_nid(last_cpupid) != dst_nid)
983 return false;
984
985 /* Always allow migrate on private faults */
986 if (cpupid_match_pid(p, last_cpupid))
987 return true;
988
989 /* A shared fault, but p->numa_group has not been set up yet. */
990 if (!ng)
991 return true;
992
993 /*
994 * Do not migrate if the destination is not a node that
995 * is actively used by this numa group.
996 */
997 if (!node_isset(dst_nid, ng->active_nodes))
998 return false;
999
1000 /*
1001 * Source is a node that is not actively used by this
1002 * numa group, while the destination is. Migrate.
1003 */
1004 if (!node_isset(src_nid, ng->active_nodes))
1005 return true;
1006
1007 /*
1008 * Both source and destination are nodes in active
1009 * use by this numa group. Maximize memory bandwidth
1010 * by migrating from more heavily used groups, to less
1011 * heavily used ones, spreading the load around.
1012 * Use a 1/4 hysteresis to avoid spurious page movement.
1013 */
1014 return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
1015}
1016
957static unsigned long weighted_cpuload(const int cpu); 1017static unsigned long weighted_cpuload(const int cpu);
958static unsigned long source_load(int cpu, int type); 1018static unsigned long source_load(int cpu, int type);
959static unsigned long target_load(int cpu, int type); 1019static unsigned long target_load(int cpu, int type);
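The comment in should_numa_migrate_memory() leans on a simple probability argument: if a task accounts for a fraction p of accesses to a page, requiring two consecutive hinting faults to name the same destination acts roughly like p^2, so weak task<->page relations rarely trigger a migration. A toy calculation with made-up probabilities:

    #include <stdio.h>

    int main(void)
    {
        /* fraction of accesses to a page made by one task */
        double p[] = { 0.9, 0.5, 0.1 };

        for (int i = 0; i < 3; i++) {
            /*
             * Two independent samples must agree before we migrate,
             * so the chance of acting is ~p^2: strong relations survive,
             * marginal ones are squashed.
             */
            printf("p = %.2f  ->  p^2 = %.4f\n", p[i], p[i] * p[i]);
        }
        return 0;
    }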
@@ -1267,7 +1327,7 @@ static int task_numa_migrate(struct task_struct *p)
1267static void numa_migrate_preferred(struct task_struct *p) 1327static void numa_migrate_preferred(struct task_struct *p)
1268{ 1328{
1269 /* This task has no NUMA fault statistics yet */ 1329 /* This task has no NUMA fault statistics yet */
1270 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) 1330 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
1271 return; 1331 return;
1272 1332
1273 /* Periodically retry migrating the task to the preferred node */ 1333 /* Periodically retry migrating the task to the preferred node */
@@ -1282,6 +1342,38 @@ static void numa_migrate_preferred(struct task_struct *p)
1282} 1342}
1283 1343
1284/* 1344/*
1345 * Find the nodes on which the workload is actively running. We do this by
1346 * tracking the nodes from which NUMA hinting faults are triggered. This can
1347 * be different from the set of nodes where the workload's memory is currently
1348 * located.
1349 *
1350 * The bitmask is used to make smarter decisions on when to do NUMA page
 1351 * migrations. To prevent flip-flopping and excessive page migrations, nodes
1352 * are added when they cause over 6/16 of the maximum number of faults, but
1353 * only removed when they drop below 3/16.
1354 */
1355static void update_numa_active_node_mask(struct numa_group *numa_group)
1356{
1357 unsigned long faults, max_faults = 0;
1358 int nid;
1359
1360 for_each_online_node(nid) {
1361 faults = group_faults_cpu(numa_group, nid);
1362 if (faults > max_faults)
1363 max_faults = faults;
1364 }
1365
1366 for_each_online_node(nid) {
1367 faults = group_faults_cpu(numa_group, nid);
1368 if (!node_isset(nid, numa_group->active_nodes)) {
1369 if (faults > max_faults * 6 / 16)
1370 node_set(nid, numa_group->active_nodes);
1371 } else if (faults < max_faults * 3 / 16)
1372 node_clear(nid, numa_group->active_nodes);
1373 }
1374}
1375
1376/*
1285 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS 1377 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1286 * increments. The more local the fault statistics are, the higher the scan 1378 * increments. The more local the fault statistics are, the higher the scan
1287 * period will be for the next scan window. If local/remote ratio is below 1379 * period will be for the next scan window. If local/remote ratio is below
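update_numa_active_node_mask() only admits a node once its CPU-fault count climbs above 6/16 of the busiest node, and only evicts it again below 3/16, so a node hovering near a single threshold cannot flap in and out of the mask. A small userspace model of that hysteresis with invented fault counts:

    #include <stdio.h>
    #include <stdbool.h>

    #define NR_NODES 4

    int main(void)
    {
        unsigned long faults[NR_NODES] = { 800, 350, 260, 40 };
        bool active[NR_NODES] = { false };
        unsigned long max_faults = 0;

        for (int n = 0; n < NR_NODES; n++)
            if (faults[n] > max_faults)
                max_faults = faults[n];

        for (int n = 0; n < NR_NODES; n++) {
            if (!active[n]) {
                if (faults[n] > max_faults * 6 / 16)        /* join above 6/16 of max */
                    active[n] = true;
            } else if (faults[n] < max_faults * 3 / 16) {   /* leave below 3/16 */
                active[n] = false;
            }
            printf("node %d: faults=%4lu active=%d\n", n, faults[n], active[n]);
        }
        return 0;
    }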
@@ -1355,11 +1447,41 @@ static void update_task_scan_period(struct task_struct *p,
1355 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 1447 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1356} 1448}
1357 1449
1450/*
1451 * Get the fraction of time the task has been running since the last
1452 * NUMA placement cycle. The scheduler keeps similar statistics, but
1453 * decays those on a 32ms period, which is orders of magnitude off
1454 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1455 * stats only if the task is so new there are no NUMA statistics yet.
1456 */
1457static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1458{
1459 u64 runtime, delta, now;
1460 /* Use the start of this time slice to avoid calculations. */
1461 now = p->se.exec_start;
1462 runtime = p->se.sum_exec_runtime;
1463
1464 if (p->last_task_numa_placement) {
1465 delta = runtime - p->last_sum_exec_runtime;
1466 *period = now - p->last_task_numa_placement;
1467 } else {
1468 delta = p->se.avg.runnable_avg_sum;
1469 *period = p->se.avg.runnable_avg_period;
1470 }
1471
1472 p->last_sum_exec_runtime = runtime;
1473 p->last_task_numa_placement = now;
1474
1475 return delta;
1476}
1477
1358static void task_numa_placement(struct task_struct *p) 1478static void task_numa_placement(struct task_struct *p)
1359{ 1479{
1360 int seq, nid, max_nid = -1, max_group_nid = -1; 1480 int seq, nid, max_nid = -1, max_group_nid = -1;
1361 unsigned long max_faults = 0, max_group_faults = 0; 1481 unsigned long max_faults = 0, max_group_faults = 0;
1362 unsigned long fault_types[2] = { 0, 0 }; 1482 unsigned long fault_types[2] = { 0, 0 };
1483 unsigned long total_faults;
1484 u64 runtime, period;
1363 spinlock_t *group_lock = NULL; 1485 spinlock_t *group_lock = NULL;
1364 1486
1365 seq = ACCESS_ONCE(p->mm->numa_scan_seq); 1487 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
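numa_get_avg_runtime() hands back the CPU time the task consumed since the previous placement pass and, through *period, the wall-clock length of that pass; both feed the fault weighting further down. A simplified standalone model, with field names shortened and the brand-new-task branch reduced to a placeholder where the kernel instead falls back to the per-entity runnable averages:

    #include <stdio.h>
    #include <stdint.h>

    struct task {
        uint64_t sum_exec_runtime;       /* total CPU time consumed so far */
        uint64_t exec_start;             /* start of the current slice ("now") */
        uint64_t last_sum_exec_runtime;  /* snapshot taken at the last placement */
        uint64_t last_placement;         /* timestamp of the last placement */
    };

    static uint64_t avg_runtime(struct task *p, uint64_t *period)
    {
        uint64_t now = p->exec_start;
        uint64_t runtime = p->sum_exec_runtime;
        uint64_t delta;

        if (p->last_placement) {
            delta = runtime - p->last_sum_exec_runtime;
            *period = now - p->last_placement;
        } else {
            /* no history yet: pretend the task ran for the whole window */
            delta = runtime;
            *period = runtime ? runtime : 1;
        }
        p->last_sum_exec_runtime = runtime;
        p->last_placement = now;
        return delta;
    }

    int main(void)
    {
        struct task t = { .sum_exec_runtime = 400, .exec_start = 1000,
                          .last_sum_exec_runtime = 100, .last_placement = 200 };
        uint64_t period, delta = avg_runtime(&t, &period);

        printf("ran %llu of the last %llu time units\n",
               (unsigned long long)delta, (unsigned long long)period);  /* 300 of 800 */
        return 0;
    }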
@@ -1368,6 +1490,10 @@ static void task_numa_placement(struct task_struct *p)
1368 p->numa_scan_seq = seq; 1490 p->numa_scan_seq = seq;
1369 p->numa_scan_period_max = task_scan_max(p); 1491 p->numa_scan_period_max = task_scan_max(p);
1370 1492
1493 total_faults = p->numa_faults_locality[0] +
1494 p->numa_faults_locality[1];
1495 runtime = numa_get_avg_runtime(p, &period);
1496
1371 /* If the task is part of a group prevent parallel updates to group stats */ 1497 /* If the task is part of a group prevent parallel updates to group stats */
1372 if (p->numa_group) { 1498 if (p->numa_group) {
1373 group_lock = &p->numa_group->lock; 1499 group_lock = &p->numa_group->lock;
@@ -1379,24 +1505,37 @@ static void task_numa_placement(struct task_struct *p)
1379 unsigned long faults = 0, group_faults = 0; 1505 unsigned long faults = 0, group_faults = 0;
1380 int priv, i; 1506 int priv, i;
1381 1507
1382 for (priv = 0; priv < 2; priv++) { 1508 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
1383 long diff; 1509 long diff, f_diff, f_weight;
1384 1510
1385 i = task_faults_idx(nid, priv); 1511 i = task_faults_idx(nid, priv);
1386 diff = -p->numa_faults[i];
1387 1512
1388 /* Decay existing window, copy faults since last scan */ 1513 /* Decay existing window, copy faults since last scan */
1389 p->numa_faults[i] >>= 1; 1514 diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
1390 p->numa_faults[i] += p->numa_faults_buffer[i]; 1515 fault_types[priv] += p->numa_faults_buffer_memory[i];
1391 fault_types[priv] += p->numa_faults_buffer[i]; 1516 p->numa_faults_buffer_memory[i] = 0;
1392 p->numa_faults_buffer[i] = 0;
1393 1517
1394 faults += p->numa_faults[i]; 1518 /*
1395 diff += p->numa_faults[i]; 1519 * Normalize the faults_from, so all tasks in a group
1520 * count according to CPU use, instead of by the raw
1521 * number of faults. Tasks with little runtime have
1522 * little over-all impact on throughput, and thus their
1523 * faults are less important.
1524 */
1525 f_weight = div64_u64(runtime << 16, period + 1);
1526 f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
1527 (total_faults + 1);
1528 f_diff = f_weight - p->numa_faults_cpu[i] / 2;
1529 p->numa_faults_buffer_cpu[i] = 0;
1530
1531 p->numa_faults_memory[i] += diff;
1532 p->numa_faults_cpu[i] += f_diff;
1533 faults += p->numa_faults_memory[i];
1396 p->total_numa_faults += diff; 1534 p->total_numa_faults += diff;
1397 if (p->numa_group) { 1535 if (p->numa_group) {
1398 /* safe because we can only change our own group */ 1536 /* safe because we can only change our own group */
1399 p->numa_group->faults[i] += diff; 1537 p->numa_group->faults[i] += diff;
1538 p->numa_group->faults_cpu[i] += f_diff;
1400 p->numa_group->total_faults += diff; 1539 p->numa_group->total_faults += diff;
1401 group_faults += p->numa_group->faults[i]; 1540 group_faults += p->numa_group->faults[i];
1402 } 1541 }
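The per-node CPU-fault statistic is scaled by how much the task actually ran: runtime/period is computed in 16.16 fixed point, multiplied by the raw buffered fault count and divided by the task's total faults, so a mostly idle task contributes little even if it faulted often. A worked example of that arithmetic with made-up numbers (plain division stands in for div64_u64):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t runtime = 300, period = 800;   /* task ran 3/8 of the cycle */
        uint64_t buf_cpu_faults = 64;           /* buffered CPU faults on one node */
        uint64_t total_faults = 128;            /* all hinting faults this cycle */
        int64_t  cpu_stat = 40;                 /* decaying per-node statistic */

        /* runtime/period as a 16.16 fixed-point fraction */
        uint64_t f_weight = (runtime << 16) / (period + 1);
        f_weight = (f_weight * buf_cpu_faults) / (total_faults + 1);

        /* halve the old value and fold in the new weighted sample */
        int64_t f_diff = (int64_t)f_weight - cpu_stat / 2;
        cpu_stat += f_diff;

        /* the 16.16 scale is never shifted back out; only ratios between nodes matter */
        printf("f_weight=%llu f_diff=%lld new stat=%lld\n",
               (unsigned long long)f_weight, (long long)f_diff, (long long)cpu_stat);
        return 0;
    }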
@@ -1416,6 +1555,7 @@ static void task_numa_placement(struct task_struct *p)
1416 update_task_scan_period(p, fault_types[0], fault_types[1]); 1555 update_task_scan_period(p, fault_types[0], fault_types[1]);
1417 1556
1418 if (p->numa_group) { 1557 if (p->numa_group) {
1558 update_numa_active_node_mask(p->numa_group);
1419 /* 1559 /*
1420 * If the preferred task and group nids are different, 1560 * If the preferred task and group nids are different,
1421 * iterate over the nodes again to find the best place. 1561 * iterate over the nodes again to find the best place.
@@ -1465,7 +1605,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1465 1605
1466 if (unlikely(!p->numa_group)) { 1606 if (unlikely(!p->numa_group)) {
1467 unsigned int size = sizeof(struct numa_group) + 1607 unsigned int size = sizeof(struct numa_group) +
1468 2*nr_node_ids*sizeof(unsigned long); 1608 4*nr_node_ids*sizeof(unsigned long);
1469 1609
1470 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); 1610 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1471 if (!grp) 1611 if (!grp)
@@ -1475,9 +1615,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1475 spin_lock_init(&grp->lock); 1615 spin_lock_init(&grp->lock);
1476 INIT_LIST_HEAD(&grp->task_list); 1616 INIT_LIST_HEAD(&grp->task_list);
1477 grp->gid = p->pid; 1617 grp->gid = p->pid;
1618 /* Second half of the array tracks nids where faults happen */
1619 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
1620 nr_node_ids;
1621
1622 node_set(task_node(current), grp->active_nodes);
1478 1623
1479 for (i = 0; i < 2*nr_node_ids; i++) 1624 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1480 grp->faults[i] = p->numa_faults[i]; 1625 grp->faults[i] = p->numa_faults_memory[i];
1481 1626
1482 grp->total_faults = p->total_numa_faults; 1627 grp->total_faults = p->total_numa_faults;
1483 1628
@@ -1534,9 +1679,9 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1534 1679
1535 double_lock(&my_grp->lock, &grp->lock); 1680 double_lock(&my_grp->lock, &grp->lock);
1536 1681
1537 for (i = 0; i < 2*nr_node_ids; i++) { 1682 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
1538 my_grp->faults[i] -= p->numa_faults[i]; 1683 my_grp->faults[i] -= p->numa_faults_memory[i];
1539 grp->faults[i] += p->numa_faults[i]; 1684 grp->faults[i] += p->numa_faults_memory[i];
1540 } 1685 }
1541 my_grp->total_faults -= p->total_numa_faults; 1686 my_grp->total_faults -= p->total_numa_faults;
1542 grp->total_faults += p->total_numa_faults; 1687 grp->total_faults += p->total_numa_faults;
@@ -1562,12 +1707,12 @@ void task_numa_free(struct task_struct *p)
1562{ 1707{
1563 struct numa_group *grp = p->numa_group; 1708 struct numa_group *grp = p->numa_group;
1564 int i; 1709 int i;
1565 void *numa_faults = p->numa_faults; 1710 void *numa_faults = p->numa_faults_memory;
1566 1711
1567 if (grp) { 1712 if (grp) {
1568 spin_lock(&grp->lock); 1713 spin_lock(&grp->lock);
1569 for (i = 0; i < 2*nr_node_ids; i++) 1714 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1570 grp->faults[i] -= p->numa_faults[i]; 1715 grp->faults[i] -= p->numa_faults_memory[i];
1571 grp->total_faults -= p->total_numa_faults; 1716 grp->total_faults -= p->total_numa_faults;
1572 1717
1573 list_del(&p->numa_entry); 1718 list_del(&p->numa_entry);
@@ -1577,18 +1722,21 @@ void task_numa_free(struct task_struct *p)
1577 put_numa_group(grp); 1722 put_numa_group(grp);
1578 } 1723 }
1579 1724
1580 p->numa_faults = NULL; 1725 p->numa_faults_memory = NULL;
1581 p->numa_faults_buffer = NULL; 1726 p->numa_faults_buffer_memory = NULL;
 1727 p->numa_faults_cpu = NULL;
1728 p->numa_faults_buffer_cpu = NULL;
1582 kfree(numa_faults); 1729 kfree(numa_faults);
1583} 1730}
1584 1731
1585/* 1732/*
1586 * Got a PROT_NONE fault for a page on @node. 1733 * Got a PROT_NONE fault for a page on @node.
1587 */ 1734 */
1588void task_numa_fault(int last_cpupid, int node, int pages, int flags) 1735void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1589{ 1736{
1590 struct task_struct *p = current; 1737 struct task_struct *p = current;
1591 bool migrated = flags & TNF_MIGRATED; 1738 bool migrated = flags & TNF_MIGRATED;
1739 int cpu_node = task_node(current);
1592 int priv; 1740 int priv;
1593 1741
1594 if (!numabalancing_enabled) 1742 if (!numabalancing_enabled)
@@ -1603,16 +1751,24 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
1603 return; 1751 return;
1604 1752
1605 /* Allocate buffer to track faults on a per-node basis */ 1753 /* Allocate buffer to track faults on a per-node basis */
1606 if (unlikely(!p->numa_faults)) { 1754 if (unlikely(!p->numa_faults_memory)) {
1607 int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; 1755 int size = sizeof(*p->numa_faults_memory) *
1756 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
1608 1757
1609 /* numa_faults and numa_faults_buffer share the allocation */ 1758 p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
1610 p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); 1759 if (!p->numa_faults_memory)
1611 if (!p->numa_faults)
1612 return; 1760 return;
1613 1761
1614 BUG_ON(p->numa_faults_buffer); 1762 BUG_ON(p->numa_faults_buffer_memory);
1615 p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); 1763 /*
1764 * The averaged statistics, shared & private, memory & cpu,
1765 * occupy the first half of the array. The second half of the
1766 * array is for current counters, which are averaged into the
1767 * first set by task_numa_placement.
1768 */
1769 p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
1770 p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
1771 p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
1616 p->total_numa_faults = 0; 1772 p->total_numa_faults = 0;
1617 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 1773 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1618 } 1774 }
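All four per-node fault arrays now live in one allocation of NR_NUMA_HINT_FAULT_BUCKETS (8) counters per node: averaged memory stats first, then averaged CPU stats, then the two per-scan buffers. A sketch of the same carve-up of a single zeroed block (names shortened, plain C):

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        int nr_node_ids = 4;
        /* 2 fault types (priv/shared) x 2 kinds (mem/cpu) x 2 (averaged/buffer) */
        int buckets = 8;
        unsigned long *base = calloc(buckets * nr_node_ids, sizeof(*base));

        unsigned long *faults_memory        = base;
        unsigned long *faults_cpu           = base + 2 * nr_node_ids;
        unsigned long *faults_buffer_memory = base + 4 * nr_node_ids;
        unsigned long *faults_buffer_cpu    = base + 6 * nr_node_ids;

        /* task_faults_idx(nid, priv) == 2 * nid + priv inside each region */
        int nid = 1, priv = 0;
        faults_buffer_memory[2 * nid + priv] += 1;
        faults_buffer_cpu[2 * nid + priv] += 1;

        printf("buffer_memory at offset %td, buffer_cpu at offset %td\n",
               faults_buffer_memory - base, faults_buffer_cpu - base);  /* 16, 24 */
        (void)faults_memory; (void)faults_cpu;
        free(base);
        return 0;
    }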
@@ -1641,7 +1797,8 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
1641 if (migrated) 1797 if (migrated)
1642 p->numa_pages_migrated += pages; 1798 p->numa_pages_migrated += pages;
1643 1799
1644 p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; 1800 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
1801 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
1645 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; 1802 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
1646} 1803}
1647 1804
@@ -2219,13 +2376,20 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
2219 se->avg.load_avg_contrib >>= NICE_0_SHIFT; 2376 se->avg.load_avg_contrib >>= NICE_0_SHIFT;
2220 } 2377 }
2221} 2378}
2222#else 2379
2380static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
2381{
2382 __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
2383 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
2384}
2385#else /* CONFIG_FAIR_GROUP_SCHED */
2223static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, 2386static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2224 int force_update) {} 2387 int force_update) {}
2225static inline void __update_tg_runnable_avg(struct sched_avg *sa, 2388static inline void __update_tg_runnable_avg(struct sched_avg *sa,
2226 struct cfs_rq *cfs_rq) {} 2389 struct cfs_rq *cfs_rq) {}
2227static inline void __update_group_entity_contrib(struct sched_entity *se) {} 2390static inline void __update_group_entity_contrib(struct sched_entity *se) {}
2228#endif 2391static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
2392#endif /* CONFIG_FAIR_GROUP_SCHED */
2229 2393
2230static inline void __update_task_entity_contrib(struct sched_entity *se) 2394static inline void __update_task_entity_contrib(struct sched_entity *se)
2231{ 2395{
@@ -2323,12 +2487,6 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
2323 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); 2487 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
2324} 2488}
2325 2489
2326static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
2327{
2328 __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
2329 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
2330}
2331
2332/* Add the load generated by se into cfs_rq's child load-average */ 2490/* Add the load generated by se into cfs_rq's child load-average */
2333static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, 2491static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2334 struct sched_entity *se, 2492 struct sched_entity *se,
@@ -2416,7 +2574,10 @@ void idle_exit_fair(struct rq *this_rq)
2416 update_rq_runnable_avg(this_rq, 0); 2574 update_rq_runnable_avg(this_rq, 0);
2417} 2575}
2418 2576
2419#else 2577static int idle_balance(struct rq *this_rq);
2578
2579#else /* CONFIG_SMP */
2580
2420static inline void update_entity_load_avg(struct sched_entity *se, 2581static inline void update_entity_load_avg(struct sched_entity *se,
2421 int update_cfs_rq) {} 2582 int update_cfs_rq) {}
2422static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} 2583static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
@@ -2428,7 +2589,13 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2428 int sleep) {} 2589 int sleep) {}
2429static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, 2590static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
2430 int force_update) {} 2591 int force_update) {}
2431#endif 2592
2593static inline int idle_balance(struct rq *rq)
2594{
2595 return 0;
2596}
2597
2598#endif /* CONFIG_SMP */
2432 2599
2433static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 2600static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
2434{ 2601{
@@ -2578,10 +2745,10 @@ static void __clear_buddies_last(struct sched_entity *se)
2578{ 2745{
2579 for_each_sched_entity(se) { 2746 for_each_sched_entity(se) {
2580 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2747 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2581 if (cfs_rq->last == se) 2748 if (cfs_rq->last != se)
2582 cfs_rq->last = NULL;
2583 else
2584 break; 2749 break;
2750
2751 cfs_rq->last = NULL;
2585 } 2752 }
2586} 2753}
2587 2754
@@ -2589,10 +2756,10 @@ static void __clear_buddies_next(struct sched_entity *se)
2589{ 2756{
2590 for_each_sched_entity(se) { 2757 for_each_sched_entity(se) {
2591 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2758 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2592 if (cfs_rq->next == se) 2759 if (cfs_rq->next != se)
2593 cfs_rq->next = NULL;
2594 else
2595 break; 2760 break;
2761
2762 cfs_rq->next = NULL;
2596 } 2763 }
2597} 2764}
2598 2765
@@ -2600,10 +2767,10 @@ static void __clear_buddies_skip(struct sched_entity *se)
2600{ 2767{
2601 for_each_sched_entity(se) { 2768 for_each_sched_entity(se) {
2602 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2769 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2603 if (cfs_rq->skip == se) 2770 if (cfs_rq->skip != se)
2604 cfs_rq->skip = NULL;
2605 else
2606 break; 2771 break;
2772
2773 cfs_rq->skip = NULL;
2607 } 2774 }
2608} 2775}
2609 2776
@@ -2746,17 +2913,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
2746 * 3) pick the "last" process, for cache locality 2913 * 3) pick the "last" process, for cache locality
2747 * 4) do not run the "skip" process, if something else is available 2914 * 4) do not run the "skip" process, if something else is available
2748 */ 2915 */
2749static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 2916static struct sched_entity *
2917pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2750{ 2918{
2751 struct sched_entity *se = __pick_first_entity(cfs_rq); 2919 struct sched_entity *left = __pick_first_entity(cfs_rq);
2752 struct sched_entity *left = se; 2920 struct sched_entity *se;
2921
2922 /*
 2923 * If curr is set we have to see if it's left of the leftmost entity
2924 * still in the tree, provided there was anything in the tree at all.
2925 */
2926 if (!left || (curr && entity_before(curr, left)))
2927 left = curr;
2928
2929 se = left; /* ideally we run the leftmost entity */
2753 2930
2754 /* 2931 /*
2755 * Avoid running the skip buddy, if running something else can 2932 * Avoid running the skip buddy, if running something else can
2756 * be done without getting too unfair. 2933 * be done without getting too unfair.
2757 */ 2934 */
2758 if (cfs_rq->skip == se) { 2935 if (cfs_rq->skip == se) {
2759 struct sched_entity *second = __pick_next_entity(se); 2936 struct sched_entity *second;
2937
2938 if (se == curr) {
2939 second = __pick_first_entity(cfs_rq);
2940 } else {
2941 second = __pick_next_entity(se);
2942 if (!second || (curr && entity_before(curr, second)))
2943 second = curr;
2944 }
2945
2760 if (second && wakeup_preempt_entity(second, left) < 1) 2946 if (second && wakeup_preempt_entity(second, left) < 1)
2761 se = second; 2947 se = second;
2762 } 2948 }
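pick_next_entity() now receives curr because, on the new fast path, the previous task was never put back into the rbtree; the leftmost queued entity therefore has to be compared against curr by vruntime before either can be chosen. A condensed standalone model of that comparison, with buddy handling omitted and entity_before() standing for the usual vruntime ordering:

    #include <stdio.h>
    #include <stddef.h>
    #include <stdint.h>

    struct entity { int64_t vruntime; const char *name; };

    static int entity_before(const struct entity *a, const struct entity *b)
    {
        return (a->vruntime - b->vruntime) < 0;
    }

    static const struct entity *pick(const struct entity *leftmost,
                                     const struct entity *curr)
    {
        const struct entity *left = leftmost;

        /* curr is still running and not in the tree, so consider it too */
        if (!left || (curr && entity_before(curr, left)))
            left = curr;
        return left;
    }

    int main(void)
    {
        struct entity queued = { .vruntime = 1000, .name = "queued" };
        struct entity curr   = { .vruntime =  400, .name = "curr" };

        printf("picked: %s\n", pick(&queued, &curr)->name);   /* curr */
        printf("picked: %s\n", pick(NULL, &curr)->name);      /* curr */
        return 0;
    }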
@@ -2778,7 +2964,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
2778 return se; 2964 return se;
2779} 2965}
2780 2966
2781static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); 2967static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
2782 2968
2783static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) 2969static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
2784{ 2970{
@@ -3433,22 +3619,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
3433} 3619}
3434 3620
3435/* conditionally throttle active cfs_rq's from put_prev_entity() */ 3621/* conditionally throttle active cfs_rq's from put_prev_entity() */
3436static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) 3622static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3437{ 3623{
3438 if (!cfs_bandwidth_used()) 3624 if (!cfs_bandwidth_used())
3439 return; 3625 return false;
3440 3626
3441 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) 3627 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
3442 return; 3628 return false;
3443 3629
3444 /* 3630 /*
3445 * it's possible for a throttled entity to be forced into a running 3631 * it's possible for a throttled entity to be forced into a running
3446 * state (e.g. set_curr_task), in this case we're finished. 3632 * state (e.g. set_curr_task), in this case we're finished.
3447 */ 3633 */
3448 if (cfs_rq_throttled(cfs_rq)) 3634 if (cfs_rq_throttled(cfs_rq))
3449 return; 3635 return true;
3450 3636
3451 throttle_cfs_rq(cfs_rq); 3637 throttle_cfs_rq(cfs_rq);
3638 return true;
3452} 3639}
3453 3640
3454static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) 3641static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -3558,7 +3745,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3558} 3745}
3559 3746
3560static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} 3747static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
3561static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3748static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
3562static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 3749static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
3563static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3750static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3564 3751
@@ -4213,13 +4400,14 @@ done:
4213} 4400}
4214 4401
4215/* 4402/*
4216 * sched_balance_self: balance the current task (running on cpu) in domains 4403 * select_task_rq_fair: Select target runqueue for the waking task in domains
4217 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and 4404 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
4218 * SD_BALANCE_EXEC. 4405 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
4219 * 4406 *
4220 * Balance, ie. select the least loaded group. 4407 * Balances load by selecting the idlest cpu in the idlest group, or under
4408 * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
4221 * 4409 *
4222 * Returns the target CPU number, or the same CPU if no balancing is needed. 4410 * Returns the target cpu number.
4223 * 4411 *
4224 * preempt must be disabled. 4412 * preempt must be disabled.
4225 */ 4413 */
@@ -4494,26 +4682,124 @@ preempt:
4494 set_last_buddy(se); 4682 set_last_buddy(se);
4495} 4683}
4496 4684
4497static struct task_struct *pick_next_task_fair(struct rq *rq) 4685static struct task_struct *
4686pick_next_task_fair(struct rq *rq, struct task_struct *prev)
4498{ 4687{
4499 struct task_struct *p;
4500 struct cfs_rq *cfs_rq = &rq->cfs; 4688 struct cfs_rq *cfs_rq = &rq->cfs;
4501 struct sched_entity *se; 4689 struct sched_entity *se;
4690 struct task_struct *p;
4691 int new_tasks;
4502 4692
4693again:
4694#ifdef CONFIG_FAIR_GROUP_SCHED
4503 if (!cfs_rq->nr_running) 4695 if (!cfs_rq->nr_running)
4504 return NULL; 4696 goto idle;
4697
4698 if (prev->sched_class != &fair_sched_class)
4699 goto simple;
4700
4701 /*
4702 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
4703 * likely that a next task is from the same cgroup as the current.
4704 *
4705 * Therefore attempt to avoid putting and setting the entire cgroup
4706 * hierarchy, only change the part that actually changes.
4707 */
4708
4709 do {
4710 struct sched_entity *curr = cfs_rq->curr;
4711
4712 /*
4713 * Since we got here without doing put_prev_entity() we also
4714 * have to consider cfs_rq->curr. If it is still a runnable
4715 * entity, update_curr() will update its vruntime, otherwise
4716 * forget we've ever seen it.
4717 */
4718 if (curr && curr->on_rq)
4719 update_curr(cfs_rq);
4720 else
4721 curr = NULL;
4722
4723 /*
4724 * This call to check_cfs_rq_runtime() will do the throttle and
4725 * dequeue its entity in the parent(s). Therefore the 'simple'
4726 * nr_running test will indeed be correct.
4727 */
4728 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
4729 goto simple;
4730
4731 se = pick_next_entity(cfs_rq, curr);
4732 cfs_rq = group_cfs_rq(se);
4733 } while (cfs_rq);
4734
4735 p = task_of(se);
4736
4737 /*
 4738 * Since we haven't yet done put_prev_entity, and if the selected task
 4739 * is a different task than we started out with, try to touch the
 4740 * least amount of cfs_rqs.
4741 */
4742 if (prev != p) {
4743 struct sched_entity *pse = &prev->se;
4744
4745 while (!(cfs_rq = is_same_group(se, pse))) {
4746 int se_depth = se->depth;
4747 int pse_depth = pse->depth;
4748
4749 if (se_depth <= pse_depth) {
4750 put_prev_entity(cfs_rq_of(pse), pse);
4751 pse = parent_entity(pse);
4752 }
4753 if (se_depth >= pse_depth) {
4754 set_next_entity(cfs_rq_of(se), se);
4755 se = parent_entity(se);
4756 }
4757 }
4758
4759 put_prev_entity(cfs_rq, pse);
4760 set_next_entity(cfs_rq, se);
4761 }
4762
4763 if (hrtick_enabled(rq))
4764 hrtick_start_fair(rq, p);
4765
4766 return p;
4767simple:
4768 cfs_rq = &rq->cfs;
4769#endif
4770
4771 if (!cfs_rq->nr_running)
4772 goto idle;
4773
4774 put_prev_task(rq, prev);
4505 4775
4506 do { 4776 do {
4507 se = pick_next_entity(cfs_rq); 4777 se = pick_next_entity(cfs_rq, NULL);
4508 set_next_entity(cfs_rq, se); 4778 set_next_entity(cfs_rq, se);
4509 cfs_rq = group_cfs_rq(se); 4779 cfs_rq = group_cfs_rq(se);
4510 } while (cfs_rq); 4780 } while (cfs_rq);
4511 4781
4512 p = task_of(se); 4782 p = task_of(se);
4783
4513 if (hrtick_enabled(rq)) 4784 if (hrtick_enabled(rq))
4514 hrtick_start_fair(rq, p); 4785 hrtick_start_fair(rq, p);
4515 4786
4516 return p; 4787 return p;
4788
4789idle:
4790 new_tasks = idle_balance(rq);
4791 /*
4792 * Because idle_balance() releases (and re-acquires) rq->lock, it is
4793 * possible for any higher priority task to appear. In that case we
4794 * must re-start the pick_next_entity() loop.
4795 */
4796 if (new_tasks < 0)
4797 return RETRY_TASK;
4798
4799 if (new_tasks > 0)
4800 goto again;
4801
4802 return NULL;
4517} 4803}
4518 4804
4519/* 4805/*
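When the fast path lands on a task from a different cgroup than prev, the two entity chains are unwound only as far as their first shared cfs_rq: the new se->depth field lets the deeper side be walked up first until is_same_group() matches, so only the differing part of the hierarchy is put and set. A standalone model of that walk over a parent-linked structure (types and helpers are illustrative, not the scheduler's):

    #include <stdio.h>
    #include <stddef.h>

    struct se {
        int depth;
        struct se *parent;
        const char *name;
    };

    static void put_prev(struct se *se) { printf("put  %s\n", se->name); }
    static void set_next(struct se *se) { printf("set  %s\n", se->name); }
    static int same_group(struct se *a, struct se *b) { return a->parent == b->parent; }

    static void switch_entities(struct se *se, struct se *pse)
    {
        while (!same_group(se, pse)) {
            int se_depth = se->depth;
            int pse_depth = pse->depth;

            if (se_depth <= pse_depth) {
                put_prev(pse);
                pse = pse->parent;
            }
            if (se_depth >= pse_depth) {
                set_next(se);
                se = se->parent;
            }
        }
        put_prev(pse);
        set_next(se);
    }

    int main(void)
    {
        struct se grpA = { .depth = 0, .parent = NULL, .name = "groupA" };
        struct se grpB = { .depth = 0, .parent = NULL, .name = "groupB" };
        struct se prev = { .depth = 1, .parent = &grpA, .name = "prev-task" };
        struct se next = { .depth = 1, .parent = &grpB, .name = "next-task" };

        /* prev ran in groupA, the pick landed in groupB: only those levels change */
        switch_entities(&next, &prev);
        return 0;
    }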
@@ -4751,7 +5037,7 @@ static void move_task(struct task_struct *p, struct lb_env *env)
4751 * Is this task likely cache-hot: 5037 * Is this task likely cache-hot:
4752 */ 5038 */
4753static int 5039static int
4754task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 5040task_hot(struct task_struct *p, u64 now)
4755{ 5041{
4756 s64 delta; 5042 s64 delta;
4757 5043
@@ -4785,7 +5071,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
4785{ 5071{
4786 int src_nid, dst_nid; 5072 int src_nid, dst_nid;
4787 5073
4788 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || 5074 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
4789 !(env->sd->flags & SD_NUMA)) { 5075 !(env->sd->flags & SD_NUMA)) {
4790 return false; 5076 return false;
4791 } 5077 }
@@ -4816,7 +5102,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
4816 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) 5102 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
4817 return false; 5103 return false;
4818 5104
4819 if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) 5105 if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
4820 return false; 5106 return false;
4821 5107
4822 src_nid = cpu_to_node(env->src_cpu); 5108 src_nid = cpu_to_node(env->src_cpu);
@@ -4912,7 +5198,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
4912 * 2) task is cache cold, or 5198 * 2) task is cache cold, or
4913 * 3) too many balance attempts have failed. 5199 * 3) too many balance attempts have failed.
4914 */ 5200 */
4915 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); 5201 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq));
4916 if (!tsk_cache_hot) 5202 if (!tsk_cache_hot)
4917 tsk_cache_hot = migrate_degrades_locality(p, env); 5203 tsk_cache_hot = migrate_degrades_locality(p, env);
4918 5204
@@ -5775,12 +6061,10 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
5775 pwr_now /= SCHED_POWER_SCALE; 6061 pwr_now /= SCHED_POWER_SCALE;
5776 6062
5777 /* Amount of load we'd subtract */ 6063 /* Amount of load we'd subtract */
5778 tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / 6064 if (busiest->avg_load > scaled_busy_load_per_task) {
5779 busiest->group_power;
5780 if (busiest->avg_load > tmp) {
5781 pwr_move += busiest->group_power * 6065 pwr_move += busiest->group_power *
5782 min(busiest->load_per_task, 6066 min(busiest->load_per_task,
5783 busiest->avg_load - tmp); 6067 busiest->avg_load - scaled_busy_load_per_task);
5784 } 6068 }
5785 6069
5786 /* Amount of load we'd add */ 6070 /* Amount of load we'd add */
@@ -6359,17 +6643,23 @@ out:
6359 * idle_balance is called by schedule() if this_cpu is about to become 6643 * idle_balance is called by schedule() if this_cpu is about to become
6360 * idle. Attempts to pull tasks from other CPUs. 6644 * idle. Attempts to pull tasks from other CPUs.
6361 */ 6645 */
6362void idle_balance(int this_cpu, struct rq *this_rq) 6646static int idle_balance(struct rq *this_rq)
6363{ 6647{
6364 struct sched_domain *sd; 6648 struct sched_domain *sd;
6365 int pulled_task = 0; 6649 int pulled_task = 0;
6366 unsigned long next_balance = jiffies + HZ; 6650 unsigned long next_balance = jiffies + HZ;
6367 u64 curr_cost = 0; 6651 u64 curr_cost = 0;
6652 int this_cpu = this_rq->cpu;
6368 6653
6654 idle_enter_fair(this_rq);
6655 /*
6656 * We must set idle_stamp _before_ calling idle_balance(), such that we
6657 * measure the duration of idle_balance() as idle time.
6658 */
6369 this_rq->idle_stamp = rq_clock(this_rq); 6659 this_rq->idle_stamp = rq_clock(this_rq);
6370 6660
6371 if (this_rq->avg_idle < sysctl_sched_migration_cost) 6661 if (this_rq->avg_idle < sysctl_sched_migration_cost)
6372 return; 6662 goto out;
6373 6663
6374 /* 6664 /*
6375 * Drop the rq->lock, but keep IRQ/preempt disabled. 6665 * Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6407,15 +6697,22 @@ void idle_balance(int this_cpu, struct rq *this_rq)
6407 interval = msecs_to_jiffies(sd->balance_interval); 6697 interval = msecs_to_jiffies(sd->balance_interval);
6408 if (time_after(next_balance, sd->last_balance + interval)) 6698 if (time_after(next_balance, sd->last_balance + interval))
6409 next_balance = sd->last_balance + interval; 6699 next_balance = sd->last_balance + interval;
6410 if (pulled_task) { 6700 if (pulled_task)
6411 this_rq->idle_stamp = 0;
6412 break; 6701 break;
6413 }
6414 } 6702 }
6415 rcu_read_unlock(); 6703 rcu_read_unlock();
6416 6704
6417 raw_spin_lock(&this_rq->lock); 6705 raw_spin_lock(&this_rq->lock);
6418 6706
6707 /*
6708 * While browsing the domains, we released the rq lock.
 6709 * A task could have been enqueued in the meantime
6710 */
6711 if (this_rq->cfs.h_nr_running && !pulled_task) {
6712 pulled_task = 1;
6713 goto out;
6714 }
6715
6419 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 6716 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
6420 /* 6717 /*
6421 * We are going idle. next_balance may be set based on 6718 * We are going idle. next_balance may be set based on
@@ -6426,6 +6723,20 @@ void idle_balance(int this_cpu, struct rq *this_rq)
6426 6723
6427 if (curr_cost > this_rq->max_idle_balance_cost) 6724 if (curr_cost > this_rq->max_idle_balance_cost)
6428 this_rq->max_idle_balance_cost = curr_cost; 6725 this_rq->max_idle_balance_cost = curr_cost;
6726
6727out:
6728 /* Is there a task of a high priority class? */
6729 if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
6730 (this_rq->dl.dl_nr_running ||
6731 (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
6732 pulled_task = -1;
6733
6734 if (pulled_task) {
6735 idle_exit_fair(this_rq);
6736 this_rq->idle_stamp = 0;
6737 }
6738
6739 return pulled_task;
6429} 6740}
6430 6741
6431/* 6742/*
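idle_balance() now returns a tri-state: positive when fair tasks were pulled, zero when nothing was found, and negative when, after the lock was dropped, a deadline or unthrottled RT task appeared; the fair pick turns the negative case into RETRY_TASK so the class loop restarts. A small sketch of how the caller interprets it (names illustrative):

    #include <stdio.h>

    static const char *interpret(int new_tasks)
    {
        if (new_tasks < 0)
            return "RETRY_TASK: a DL/RT task appeared, redo the whole pick";
        if (new_tasks > 0)
            return "goto again: fair tasks were pulled, pick again";
        return "NULL: really going idle";
    }

    int main(void)
    {
        for (int v = -1; v <= 1; v++)
            printf("idle_balance() == %2d -> %s\n", v, interpret(v));
        return 0;
    }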
@@ -6496,6 +6807,11 @@ out_unlock:
6496 return 0; 6807 return 0;
6497} 6808}
6498 6809
6810static inline int on_null_domain(struct rq *rq)
6811{
6812 return unlikely(!rcu_dereference_sched(rq->sd));
6813}
6814
6499#ifdef CONFIG_NO_HZ_COMMON 6815#ifdef CONFIG_NO_HZ_COMMON
6500/* 6816/*
6501 * idle load balancing details 6817 * idle load balancing details
@@ -6550,8 +6866,13 @@ static void nohz_balancer_kick(void)
6550static inline void nohz_balance_exit_idle(int cpu) 6866static inline void nohz_balance_exit_idle(int cpu)
6551{ 6867{
6552 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 6868 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
6553 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 6869 /*
6554 atomic_dec(&nohz.nr_cpus); 6870 * Completely isolated CPUs don't ever set, so we must test.
6871 */
6872 if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
6873 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
6874 atomic_dec(&nohz.nr_cpus);
6875 }
6555 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 6876 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
6556 } 6877 }
6557} 6878}
@@ -6605,6 +6926,12 @@ void nohz_balance_enter_idle(int cpu)
6605 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) 6926 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
6606 return; 6927 return;
6607 6928
6929 /*
6930 * If we're a completely isolated CPU, we don't play.
6931 */
6932 if (on_null_domain(cpu_rq(cpu)))
6933 return;
6934
6608 cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 6935 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
6609 atomic_inc(&nohz.nr_cpus); 6936 atomic_inc(&nohz.nr_cpus);
6610 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 6937 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
@@ -6867,11 +7194,6 @@ static void run_rebalance_domains(struct softirq_action *h)
6867 nohz_idle_balance(this_rq, idle); 7194 nohz_idle_balance(this_rq, idle);
6868} 7195}
6869 7196
6870static inline int on_null_domain(struct rq *rq)
6871{
6872 return !rcu_dereference_sched(rq->sd);
6873}
6874
6875/* 7197/*
6876 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 7198 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
6877 */ 7199 */
@@ -7036,7 +7358,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
7036 */ 7358 */
7037static void switched_to_fair(struct rq *rq, struct task_struct *p) 7359static void switched_to_fair(struct rq *rq, struct task_struct *p)
7038{ 7360{
7039 if (!p->se.on_rq) 7361 struct sched_entity *se = &p->se;
7362#ifdef CONFIG_FAIR_GROUP_SCHED
7363 /*
7364 * Since the real-depth could have been changed (only FAIR
7365 * class maintain depth value), reset depth properly.
7366 */
7367 se->depth = se->parent ? se->parent->depth + 1 : 0;
7368#endif
7369 if (!se->on_rq)
7040 return; 7370 return;
7041 7371
7042 /* 7372 /*
@@ -7084,7 +7414,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
7084#ifdef CONFIG_FAIR_GROUP_SCHED 7414#ifdef CONFIG_FAIR_GROUP_SCHED
7085static void task_move_group_fair(struct task_struct *p, int on_rq) 7415static void task_move_group_fair(struct task_struct *p, int on_rq)
7086{ 7416{
7417 struct sched_entity *se = &p->se;
7087 struct cfs_rq *cfs_rq; 7418 struct cfs_rq *cfs_rq;
7419
7088 /* 7420 /*
7089 * If the task was not on the rq at the time of this cgroup movement 7421 * If the task was not on the rq at the time of this cgroup movement
7090 * it must have been asleep, sleeping tasks keep their ->vruntime 7422 * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -7110,23 +7442,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
7110 * To prevent boost or penalty in the new cfs_rq caused by delta 7442 * To prevent boost or penalty in the new cfs_rq caused by delta
7111 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. 7443 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
7112 */ 7444 */
7113 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) 7445 if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
7114 on_rq = 1; 7446 on_rq = 1;
7115 7447
7116 if (!on_rq) 7448 if (!on_rq)
7117 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; 7449 se->vruntime -= cfs_rq_of(se)->min_vruntime;
7118 set_task_rq(p, task_cpu(p)); 7450 set_task_rq(p, task_cpu(p));
7451 se->depth = se->parent ? se->parent->depth + 1 : 0;
7119 if (!on_rq) { 7452 if (!on_rq) {
7120 cfs_rq = cfs_rq_of(&p->se); 7453 cfs_rq = cfs_rq_of(se);
7121 p->se.vruntime += cfs_rq->min_vruntime; 7454 se->vruntime += cfs_rq->min_vruntime;
7122#ifdef CONFIG_SMP 7455#ifdef CONFIG_SMP
7123 /* 7456 /*
7124 * migrate_task_rq_fair() will have removed our previous 7457 * migrate_task_rq_fair() will have removed our previous
7125 * contribution, but we must synchronize for ongoing future 7458 * contribution, but we must synchronize for ongoing future
7126 * decay. 7459 * decay.
7127 */ 7460 */
7128 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); 7461 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
7129 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; 7462 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
7130#endif 7463#endif
7131 } 7464 }
7132} 7465}
@@ -7222,10 +7555,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7222 if (!se) 7555 if (!se)
7223 return; 7556 return;
7224 7557
7225 if (!parent) 7558 if (!parent) {
7226 se->cfs_rq = &rq->cfs; 7559 se->cfs_rq = &rq->cfs;
7227 else 7560 se->depth = 0;
7561 } else {
7228 se->cfs_rq = parent->my_q; 7562 se->cfs_rq = parent->my_q;
7563 se->depth = parent->depth + 1;
7564 }
7229 7565
7230 se->my_q = cfs_rq; 7566 se->my_q = cfs_rq;
7231 /* guarantee group entities always have weight */ 7567 /* guarantee group entities always have weight */
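With depth_se() removed, every sched_entity caches its depth, and the invariant is re-established wherever the parent link can change: root entities get 0 and children parent->depth + 1, as in init_tg_cfs_entry(), task_move_group_fair() and switched_to_fair() above. A tiny model of keeping that cached depth consistent:

    #include <stdio.h>
    #include <stddef.h>

    struct se { int depth; struct se *parent; };

    static void set_parent(struct se *se, struct se *parent)
    {
        se->parent = parent;
        se->depth  = parent ? parent->depth + 1 : 0;   /* same rule as above */
    }

    int main(void)
    {
        struct se root, group, task;

        set_parent(&root, NULL);
        set_parent(&group, &root);
        set_parent(&task, &group);
        printf("depths: %d %d %d\n", root.depth, group.depth, task.depth); /* 0 1 2 */

        /* moving the task directly under the root re-derives its depth */
        set_parent(&task, &root);
        printf("task depth after move: %d\n", task.depth);                 /* 1 */
        return 0;
    }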
diff --git a/kernel/cpu/idle.c b/kernel/sched/idle.c
index 277f494c2a9a..b7976a127178 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/sched/idle.c
@@ -3,6 +3,7 @@
3 */ 3 */
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/cpu.h> 5#include <linux/cpu.h>
6#include <linux/cpuidle.h>
6#include <linux/tick.h> 7#include <linux/tick.h>
7#include <linux/mm.h> 8#include <linux/mm.h>
8#include <linux/stackprotector.h> 9#include <linux/stackprotector.h>
@@ -95,8 +96,10 @@ static void cpu_idle_loop(void)
95 if (!current_clr_polling_and_test()) { 96 if (!current_clr_polling_and_test()) {
96 stop_critical_timings(); 97 stop_critical_timings();
97 rcu_idle_enter(); 98 rcu_idle_enter();
98 arch_cpu_idle(); 99 if (cpuidle_idle_call())
99 WARN_ON_ONCE(irqs_disabled()); 100 arch_cpu_idle();
101 if (WARN_ON_ONCE(irqs_disabled()))
102 local_irq_enable();
100 rcu_idle_exit(); 103 rcu_idle_exit();
101 start_critical_timings(); 104 start_critical_timings();
102 } else { 105 } else {
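
In the renamed idle loop the cpuidle framework is tried first and the architecture's default idle routine only runs when cpuidle_idle_call() reports failure, which also covers the case where no cpuidle driver is registered; the hunk additionally re-enables interrupts if a broken idle routine returns with them disabled, instead of merely warning. A simplified, self-contained sketch of that try-then-fallback shape, with both idle routines stubbed out:

#include <stdio.h>

/* stand-in: non-zero means no cpuidle driver/governor handled the request */
static int cpuidle_idle_call_stub(void)
{
        return -1;      /* pretend cpuidle is unavailable */
}

/* stand-in for the architecture's default idle instruction (hlt, wfi, ...) */
static void arch_cpu_idle_stub(void)
{
        puts("default arch idle");
}

static void idle_once(void)
{
        /* prefer the cpuidle framework; fall back only when it fails */
        if (cpuidle_idle_call_stub())
                arch_cpu_idle_stub();
}

int main(void) { idle_once(); return 0; }
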
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 516c3d9ceea1..879f2b75266a 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,18 +13,8 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
13{ 13{
 14 return task_cpu(p); /* IDLE tasks are never migrated */ 14 return task_cpu(p); /* IDLE tasks are never migrated */
15} 15}
16
17static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
18{
19 idle_exit_fair(rq);
20 rq_last_tick_reset(rq);
21}
22
23static void post_schedule_idle(struct rq *rq)
24{
25 idle_enter_fair(rq);
26}
27#endif /* CONFIG_SMP */ 16#endif /* CONFIG_SMP */
17
28/* 18/*
29 * Idle tasks are unconditionally rescheduled: 19 * Idle tasks are unconditionally rescheduled:
30 */ 20 */
@@ -33,13 +23,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
33 resched_task(rq->idle); 23 resched_task(rq->idle);
34} 24}
35 25
36static struct task_struct *pick_next_task_idle(struct rq *rq) 26static struct task_struct *
27pick_next_task_idle(struct rq *rq, struct task_struct *prev)
37{ 28{
29 put_prev_task(rq, prev);
30
38 schedstat_inc(rq, sched_goidle); 31 schedstat_inc(rq, sched_goidle);
39#ifdef CONFIG_SMP
40 /* Trigger the post schedule to do an idle_enter for CFS */
41 rq->post_schedule = 1;
42#endif
43 return rq->idle; 32 return rq->idle;
44} 33}
45 34
@@ -58,6 +47,8 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
58 47
59static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) 48static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
60{ 49{
50 idle_exit_fair(rq);
51 rq_last_tick_reset(rq);
61} 52}
62 53
63static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 54static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
@@ -101,8 +92,6 @@ const struct sched_class idle_sched_class = {
101 92
102#ifdef CONFIG_SMP 93#ifdef CONFIG_SMP
103 .select_task_rq = select_task_rq_idle, 94 .select_task_rq = select_task_rq_idle,
104 .pre_schedule = pre_schedule_idle,
105 .post_schedule = post_schedule_idle,
106#endif 95#endif
107 96
108 .set_curr_task = set_curr_task_idle, 97 .set_curr_task = set_curr_task_idle,
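
Under the reworked interface the idle class retires the previous task itself: pick_next_task_idle() calls put_prev_task() on prev and simply returns rq->idle, while the bookkeeping that used to run from the removed pre_schedule_idle() (idle_exit_fair(), rq_last_tick_reset()) now runs from put_prev_task_idle(). A stripped-down sketch of that shape; every type and helper below is a stand-in:

struct task_struct;
struct rq { struct task_struct *idle; };

static void put_prev_task_stub(struct rq *rq, struct task_struct *prev) { (void)rq; (void)prev; }
static void idle_exit_fair_stub(struct rq *rq) { (void)rq; }

/* the class retiring prev itself is what keeps the core pick loop generic */
static struct task_struct *pick_next_idle(struct rq *rq, struct task_struct *prev)
{
        put_prev_task_stub(rq, prev);
        return rq->idle;
}

/* the per-class put_prev hook is where the idle-specific cleanup moved */
static void put_prev_idle(struct rq *rq, struct task_struct *prev)
{
        (void)prev;
        idle_exit_fair_stub(rq);
}
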
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 1999021042c7..d8cdf1618551 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -229,6 +229,14 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
229 229
230#ifdef CONFIG_SMP 230#ifdef CONFIG_SMP
231 231
232static int pull_rt_task(struct rq *this_rq);
233
234static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
235{
236 /* Try to pull RT tasks here if we lower this rq's prio */
237 return rq->rt.highest_prio.curr > prev->prio;
238}
239
232static inline int rt_overloaded(struct rq *rq) 240static inline int rt_overloaded(struct rq *rq)
233{ 241{
234 return atomic_read(&rq->rd->rto_count); 242 return atomic_read(&rq->rd->rto_count);
@@ -315,6 +323,15 @@ static inline int has_pushable_tasks(struct rq *rq)
315 return !plist_head_empty(&rq->rt.pushable_tasks); 323 return !plist_head_empty(&rq->rt.pushable_tasks);
316} 324}
317 325
326static inline void set_post_schedule(struct rq *rq)
327{
328 /*
329 * We detect this state here so that we can avoid taking the RQ
330 * lock again later if there is no need to push
331 */
332 rq->post_schedule = has_pushable_tasks(rq);
333}
334
318static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 335static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
319{ 336{
320 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 337 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
@@ -359,6 +376,19 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
359{ 376{
360} 377}
361 378
379static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
380{
381 return false;
382}
383
384static inline int pull_rt_task(struct rq *this_rq)
385{
386 return 0;
387}
388
389static inline void set_post_schedule(struct rq *rq)
390{
391}
362#endif /* CONFIG_SMP */ 392#endif /* CONFIG_SMP */
363 393
364static inline int on_rt_rq(struct sched_rt_entity *rt_se) 394static inline int on_rt_rq(struct sched_rt_entity *rt_se)
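
The uniprocessor build gains matching no-op stubs (need_pull_rt_task() returning false, pull_rt_task() returning 0, an empty set_post_schedule()) so the callers added later in this file stay free of #ifdef CONFIG_SMP. A minimal sketch of the pattern with invented names, assuming CONFIG_SMP is the usual build-time switch:

#ifdef CONFIG_SMP
static int pull_work(void)
{
        /* real balancing work would go here */
        return 1;
}
#else
static inline int pull_work(void)
{
        return 0;       /* nothing to pull on UP; the call optimises away */
}
#endif

static void caller(void)
{
        if (pull_work()) {
                /* only reachable on SMP builds */
        }
}
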
@@ -440,11 +470,6 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
440 dequeue_rt_entity(rt_se); 470 dequeue_rt_entity(rt_se);
441} 471}
442 472
443static inline int rt_rq_throttled(struct rt_rq *rt_rq)
444{
445 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
446}
447
448static int rt_se_boosted(struct sched_rt_entity *rt_se) 473static int rt_se_boosted(struct sched_rt_entity *rt_se)
449{ 474{
450 struct rt_rq *rt_rq = group_rt_rq(rt_se); 475 struct rt_rq *rt_rq = group_rt_rq(rt_se);
@@ -515,11 +540,6 @@ static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
515{ 540{
516} 541}
517 542
518static inline int rt_rq_throttled(struct rt_rq *rt_rq)
519{
520 return rt_rq->rt_throttled;
521}
522
523static inline const struct cpumask *sched_rt_period_mask(void) 543static inline const struct cpumask *sched_rt_period_mask(void)
524{ 544{
525 return cpu_online_mask; 545 return cpu_online_mask;
@@ -1318,15 +1338,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1318{ 1338{
1319 struct sched_rt_entity *rt_se; 1339 struct sched_rt_entity *rt_se;
1320 struct task_struct *p; 1340 struct task_struct *p;
1321 struct rt_rq *rt_rq; 1341 struct rt_rq *rt_rq = &rq->rt;
1322
1323 rt_rq = &rq->rt;
1324
1325 if (!rt_rq->rt_nr_running)
1326 return NULL;
1327
1328 if (rt_rq_throttled(rt_rq))
1329 return NULL;
1330 1342
1331 do { 1343 do {
1332 rt_se = pick_next_rt_entity(rq, rt_rq); 1344 rt_se = pick_next_rt_entity(rq, rt_rq);
@@ -1340,21 +1352,45 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1340 return p; 1352 return p;
1341} 1353}
1342 1354
1343static struct task_struct *pick_next_task_rt(struct rq *rq) 1355static struct task_struct *
1356pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1344{ 1357{
1345 struct task_struct *p = _pick_next_task_rt(rq); 1358 struct task_struct *p;
1359 struct rt_rq *rt_rq = &rq->rt;
1360
1361 if (need_pull_rt_task(rq, prev)) {
1362 pull_rt_task(rq);
1363 /*
1364 * pull_rt_task() can drop (and re-acquire) rq->lock; this
1365 * means a dl task can slip in, in which case we need to
1366 * re-start task selection.
1367 */
1368 if (unlikely(rq->dl.dl_nr_running))
1369 return RETRY_TASK;
1370 }
1371
1372 /*
1373 * We may dequeue prev's rt_rq in put_prev_task().
 1374 * So, update the runtime before the rt_nr_running check.
1375 */
1376 if (prev->sched_class == &rt_sched_class)
1377 update_curr_rt(rq);
1378
1379 if (!rt_rq->rt_nr_running)
1380 return NULL;
1381
1382 if (rt_rq_throttled(rt_rq))
1383 return NULL;
1384
1385 put_prev_task(rq, prev);
1386
1387 p = _pick_next_task_rt(rq);
1346 1388
1347 /* The running task is never eligible for pushing */ 1389 /* The running task is never eligible for pushing */
1348 if (p) 1390 if (p)
1349 dequeue_pushable_task(rq, p); 1391 dequeue_pushable_task(rq, p);
1350 1392
1351#ifdef CONFIG_SMP 1393 set_post_schedule(rq);
1352 /*
1353 * We detect this state here so that we can avoid taking the RQ
1354 * lock again later if there is no need to push
1355 */
1356 rq->post_schedule = has_pushable_tasks(rq);
1357#endif
1358 1394
1359 return p; 1395 return p;
1360} 1396}
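
Folding the old pre_schedule_rt() pull into the pick has two ordering consequences spelled out above: pull_rt_task() can release rq->lock, so if a deadline task slipped in the function returns RETRY_TASK rather than an RT task that would run below it, and prev is only retired via put_prev_task() once a task will actually be returned (an empty or throttled rt_rq returns NULL with prev untouched, leaving it to the next class). A compressed sketch of that decision order; every helper and field below is a trivialised stand-in:

#include <stddef.h>

#define RETRY_TASK ((void *)-1UL)

struct task_struct;
struct rq { int dl_nr_running; int rt_nr_running; int rt_throttled; };

static int need_pull(struct rq *rq) { (void)rq; return 0; }
static void pull(struct rq *rq) { (void)rq; }   /* may drop and re-take rq->lock */
static void put_prev(struct rq *rq, struct task_struct *p) { (void)rq; (void)p; }
static struct task_struct *do_pick(struct rq *rq) { (void)rq; return NULL; }

static struct task_struct *pick_rt_sketch(struct rq *rq, struct task_struct *prev)
{
        if (need_pull(rq)) {
                pull(rq);
                /* the lock may have been dropped: a dl task can now be runnable */
                if (rq->dl_nr_running)
                        return RETRY_TASK;
        }

        if (!rq->rt_nr_running || rq->rt_throttled)
                return NULL;            /* nothing to run; prev stays un-put */

        put_prev(rq, prev);             /* retire prev only on a real pick */
        return do_pick(rq);
}
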
@@ -1724,13 +1760,6 @@ skip:
1724 return ret; 1760 return ret;
1725} 1761}
1726 1762
1727static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1728{
1729 /* Try to pull RT tasks here if we lower this rq's prio */
1730 if (rq->rt.highest_prio.curr > prev->prio)
1731 pull_rt_task(rq);
1732}
1733
1734static void post_schedule_rt(struct rq *rq) 1763static void post_schedule_rt(struct rq *rq)
1735{ 1764{
1736 push_rt_tasks(rq); 1765 push_rt_tasks(rq);
@@ -1833,7 +1862,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1833 resched_task(rq->curr); 1862 resched_task(rq->curr);
1834} 1863}
1835 1864
1836void init_sched_rt_class(void) 1865void __init init_sched_rt_class(void)
1837{ 1866{
1838 unsigned int i; 1867 unsigned int i;
1839 1868
@@ -2007,7 +2036,6 @@ const struct sched_class rt_sched_class = {
2007 .set_cpus_allowed = set_cpus_allowed_rt, 2036 .set_cpus_allowed = set_cpus_allowed_rt,
2008 .rq_online = rq_online_rt, 2037 .rq_online = rq_online_rt,
2009 .rq_offline = rq_offline_rt, 2038 .rq_offline = rq_offline_rt,
2010 .pre_schedule = pre_schedule_rt,
2011 .post_schedule = post_schedule_rt, 2039 .post_schedule = post_schedule_rt,
2012 .task_woken = task_woken_rt, 2040 .task_woken = task_woken_rt,
2013 .switched_from = switched_from_rt, 2041 .switched_from = switched_from_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f964add50f38..f2de7a175620 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -24,24 +24,6 @@ extern long calc_load_fold_active(struct rq *this_rq);
24extern void update_cpu_load_active(struct rq *this_rq); 24extern void update_cpu_load_active(struct rq *this_rq);
25 25
26/* 26/*
27 * Convert user-nice values [ -20 ... 0 ... 19 ]
28 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
29 * and back.
30 */
31#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
32#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
33#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
34
35/*
36 * 'User priority' is the nice value converted to something we
37 * can work with better when scaling various scheduler parameters,
38 * it's a [ 0 ... 39 ] range.
39 */
40#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
41#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
42#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
43
44/*
45 * Helpers for converting nanosecond timing to jiffy resolution 27 * Helpers for converting nanosecond timing to jiffy resolution
46 */ 28 */
47#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 29#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
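
The nice/priority conversion macros leave this header (the series relocates them into the new include/linux/sched/prio.h), but the mapping itself is unchanged: with MAX_RT_PRIO at 100, nice values -20..19 map onto static priorities 100..139. A small worked check of that arithmetic, restating the removed macros verbatim:

#include <assert.h>

#define MAX_RT_PRIO             100     /* top of the RT priority range */
#define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)

int main(void)
{
        assert(NICE_TO_PRIO(-20) == 100);       /* most favourable nice value */
        assert(NICE_TO_PRIO(0)   == 120);       /* default nice */
        assert(NICE_TO_PRIO(19)  == 139);       /* least favourable nice value */
        assert(PRIO_TO_NICE(NICE_TO_PRIO(5)) == 5);     /* conversions round-trip */
        return 0;
}
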
@@ -441,6 +423,18 @@ struct rt_rq {
441#endif 423#endif
442}; 424};
443 425
426#ifdef CONFIG_RT_GROUP_SCHED
427static inline int rt_rq_throttled(struct rt_rq *rt_rq)
428{
429 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
430}
431#else
432static inline int rt_rq_throttled(struct rt_rq *rt_rq)
433{
434 return rt_rq->rt_throttled;
435}
436#endif
437
444/* Deadline class' related fields in a runqueue */ 438/* Deadline class' related fields in a runqueue */
445struct dl_rq { 439struct dl_rq {
446 /* runqueue is an rbtree, ordered by deadline */ 440 /* runqueue is an rbtree, ordered by deadline */
@@ -558,11 +552,9 @@ struct rq {
558#ifdef CONFIG_FAIR_GROUP_SCHED 552#ifdef CONFIG_FAIR_GROUP_SCHED
559 /* list of leaf cfs_rq on this cpu: */ 553 /* list of leaf cfs_rq on this cpu: */
560 struct list_head leaf_cfs_rq_list; 554 struct list_head leaf_cfs_rq_list;
561#endif /* CONFIG_FAIR_GROUP_SCHED */
562 555
563#ifdef CONFIG_RT_GROUP_SCHED 556 struct sched_avg avg;
564 struct list_head leaf_rt_rq_list; 557#endif /* CONFIG_FAIR_GROUP_SCHED */
565#endif
566 558
567 /* 559 /*
568 * This is part of a global counter where only the total sum 560 * This is part of a global counter where only the total sum
@@ -651,8 +643,6 @@ struct rq {
651#ifdef CONFIG_SMP 643#ifdef CONFIG_SMP
652 struct llist_head wake_list; 644 struct llist_head wake_list;
653#endif 645#endif
654
655 struct sched_avg avg;
656}; 646};
657 647
658static inline int cpu_of(struct rq *rq) 648static inline int cpu_of(struct rq *rq)
@@ -1112,6 +1102,8 @@ static const u32 prio_to_wmult[40] = {
1112 1102
1113#define DEQUEUE_SLEEP 1 1103#define DEQUEUE_SLEEP 1
1114 1104
1105#define RETRY_TASK ((void *)-1UL)
1106
1115struct sched_class { 1107struct sched_class {
1116 const struct sched_class *next; 1108 const struct sched_class *next;
1117 1109
@@ -1122,14 +1114,22 @@ struct sched_class {
1122 1114
1123 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); 1115 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
1124 1116
1125 struct task_struct * (*pick_next_task) (struct rq *rq); 1117 /*
 1118 * The pick_next_task() method that returns the next task is
 1119 * responsible for calling put_prev_task() on the @prev task (or
 1120 * doing something equivalent).
1121 *
1122 * May return RETRY_TASK when it finds a higher prio class has runnable
1123 * tasks.
1124 */
1125 struct task_struct * (*pick_next_task) (struct rq *rq,
1126 struct task_struct *prev);
1126 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1127 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
1127 1128
1128#ifdef CONFIG_SMP 1129#ifdef CONFIG_SMP
1129 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); 1130 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
1130 void (*migrate_task_rq)(struct task_struct *p, int next_cpu); 1131 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
1131 1132
1132 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
1133 void (*post_schedule) (struct rq *this_rq); 1133 void (*post_schedule) (struct rq *this_rq);
1134 void (*task_waking) (struct task_struct *task); 1134 void (*task_waking) (struct task_struct *task);
1135 void (*task_woken) (struct rq *this_rq, struct task_struct *task); 1135 void (*task_woken) (struct rq *this_rq, struct task_struct *task);
@@ -1159,6 +1159,11 @@ struct sched_class {
1159#endif 1159#endif
1160}; 1160};
1161 1161
1162static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
1163{
1164 prev->sched_class->put_prev_task(rq, prev);
1165}
1166
1162#define sched_class_highest (&stop_sched_class) 1167#define sched_class_highest (&stop_sched_class)
1163#define for_each_class(class) \ 1168#define for_each_class(class) \
1164 for (class = sched_class_highest; class; class = class->next) 1169 for (class = sched_class_highest; class; class = class->next)
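
With put_prev_task() folded into pick_next_task() and RETRY_TASK as a possible return, the caller has to be prepared to restart class selection from the top whenever a lower class pulls work that makes a higher class runnable. The real loop lives in kernel/sched/core.c and is not shown in this part of the diff; the following is a self-contained toy model of the protocol, with every class, field and task invented for illustration:

#include <stdio.h>

#define RETRY_TASK ((void *)-1UL)

struct rq { int dl_runnable; int rt_runnable; };
struct task_struct { const char *comm; };

static struct task_struct dl_task        = { "dl" };
static struct task_struct idle_task_stub = { "idle" };

struct sched_class {
        const struct sched_class *next;
        struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev);
};

/* toy deadline class: runs whenever a dl task is runnable */
static struct task_struct *pick_dl(struct rq *rq, struct task_struct *prev)
{
        (void)prev;
        return rq->dl_runnable ? &dl_task : NULL;
}

/* toy RT class: pretend a pull dropped rq->lock and a dl task slipped in */
static struct task_struct *pick_rt(struct rq *rq, struct task_struct *prev)
{
        (void)prev;
        if (rq->rt_runnable) {
                rq->dl_runnable = 1;    /* side effect of the simulated pull */
                return RETRY_TASK;      /* ask the caller to start over */
        }
        return NULL;
}

static struct task_struct *pick_idle(struct rq *rq, struct task_struct *prev)
{
        (void)rq; (void)prev;
        return &idle_task_stub;         /* the idle class always succeeds */
}

static const struct sched_class idle_class = { NULL,        pick_idle };
static const struct sched_class rt_class   = { &idle_class, pick_rt   };
static const struct sched_class dl_class   = { &rt_class,   pick_dl   };

static struct task_struct *pick_next(struct rq *rq, struct task_struct *prev)
{
        const struct sched_class *class;
        struct task_struct *p;
again:
        for (class = &dl_class; class; class = class->next) {
                p = class->pick_next_task(rq, prev);
                if (p == (struct task_struct *)RETRY_TASK)
                        goto again;     /* restart from the highest class */
                if (p)
                        return p;
        }
        return NULL;                    /* unreachable while an idle class exists */
}

int main(void)
{
        struct rq rq = { .dl_runnable = 0, .rt_runnable = 1 };
        printf("%s\n", pick_next(&rq, NULL)->comm);     /* prints "dl" */
        return 0;
}
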
@@ -1175,16 +1180,14 @@ extern const struct sched_class idle_sched_class;
1175extern void update_group_power(struct sched_domain *sd, int cpu); 1180extern void update_group_power(struct sched_domain *sd, int cpu);
1176 1181
1177extern void trigger_load_balance(struct rq *rq); 1182extern void trigger_load_balance(struct rq *rq);
1178extern void idle_balance(int this_cpu, struct rq *this_rq);
1179 1183
1180extern void idle_enter_fair(struct rq *this_rq); 1184extern void idle_enter_fair(struct rq *this_rq);
1181extern void idle_exit_fair(struct rq *this_rq); 1185extern void idle_exit_fair(struct rq *this_rq);
1182 1186
1183#else /* CONFIG_SMP */ 1187#else
1184 1188
1185static inline void idle_balance(int cpu, struct rq *rq) 1189static inline void idle_enter_fair(struct rq *rq) { }
1186{ 1190static inline void idle_exit_fair(struct rq *rq) { }
1187}
1188 1191
1189#endif 1192#endif
1190 1193
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index fdb6bb0b3356..d6ce65dde541 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,16 +23,19 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
23 /* we're never preempted */ 23 /* we're never preempted */
24} 24}
25 25
26static struct task_struct *pick_next_task_stop(struct rq *rq) 26static struct task_struct *
27pick_next_task_stop(struct rq *rq, struct task_struct *prev)
27{ 28{
28 struct task_struct *stop = rq->stop; 29 struct task_struct *stop = rq->stop;
29 30
30 if (stop && stop->on_rq) { 31 if (!stop || !stop->on_rq)
31 stop->se.exec_start = rq_clock_task(rq); 32 return NULL;
32 return stop;
33 }
34 33
35 return NULL; 34 put_prev_task(rq, prev);
35
36 stop->se.exec_start = rq_clock_task(rq);
37
38 return stop;
36} 39}
37 40
38static void 41static void
diff --git a/kernel/sys.c b/kernel/sys.c
index c0a58be780a4..adaeab6f7a87 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -174,10 +174,10 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
174 174
175 /* normalize: avoid signed division (rounding problems) */ 175 /* normalize: avoid signed division (rounding problems) */
176 error = -ESRCH; 176 error = -ESRCH;
177 if (niceval < -20) 177 if (niceval < MIN_NICE)
178 niceval = -20; 178 niceval = MIN_NICE;
179 if (niceval > 19) 179 if (niceval > MAX_NICE)
180 niceval = 19; 180 niceval = MAX_NICE;
181 181
182 rcu_read_lock(); 182 rcu_read_lock();
183 read_lock(&tasklist_lock); 183 read_lock(&tasklist_lock);
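
The remaining hunks in this series swap the literal -20/19 nice bounds for MIN_NICE and MAX_NICE, here in setpriority() and further down in the ring-buffer benchmark and workqueue code; the named constants (defined as -20 and 19 in the new sched/prio.h header) make the clamping self-describing. A tiny worked check, restating those assumed values locally:

#include <assert.h>

#define MIN_NICE        (-20)
#define MAX_NICE        19

/* same clamping shape as the setpriority() hunk above */
static int clamp_nice(int niceval)
{
        if (niceval < MIN_NICE)
                niceval = MIN_NICE;
        if (niceval > MAX_NICE)
                niceval = MAX_NICE;
        return niceval;
}

int main(void)
{
        assert(clamp_nice(-100) == -20);
        assert(clamp_nice(40)   ==  19);
        assert(clamp_nice(5)    ==   5);
        return 0;
}
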
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 49e13e1f8fe6..7754ff16f334 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -386,13 +386,6 @@ static struct ctl_table kern_table[] = {
386 .proc_handler = proc_dointvec, 386 .proc_handler = proc_dointvec,
387 }, 387 },
388 { 388 {
389 .procname = "numa_balancing_migrate_deferred",
390 .data = &sysctl_numa_balancing_migrate_deferred,
391 .maxlen = sizeof(unsigned int),
392 .mode = 0644,
393 .proc_handler = proc_dointvec,
394 },
395 {
396 .procname = "numa_balancing", 389 .procname = "numa_balancing",
397 .data = NULL, /* filled in by handler */ 390 .data = NULL, /* filled in by handler */
398 .maxlen = sizeof(unsigned int), 391 .maxlen = sizeof(unsigned int),
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index a5457d577b98..0434ff1b808e 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -40,8 +40,8 @@ static int write_iteration = 50;
40module_param(write_iteration, uint, 0644); 40module_param(write_iteration, uint, 0644);
41MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); 41MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
42 42
43static int producer_nice = 19; 43static int producer_nice = MAX_NICE;
44static int consumer_nice = 19; 44static int consumer_nice = MAX_NICE;
45 45
46static int producer_fifo = -1; 46static int producer_fifo = -1;
47static int consumer_fifo = -1; 47static int consumer_fifo = -1;
@@ -308,7 +308,7 @@ static void ring_buffer_producer(void)
308 308
309 /* Let the user know that the test is running at low priority */ 309 /* Let the user know that the test is running at low priority */
310 if (producer_fifo < 0 && consumer_fifo < 0 && 310 if (producer_fifo < 0 && consumer_fifo < 0 &&
311 producer_nice == 19 && consumer_nice == 19) 311 producer_nice == MAX_NICE && consumer_nice == MAX_NICE)
312 trace_printk("WARNING!!! This test is running at lowest priority.\n"); 312 trace_printk("WARNING!!! This test is running at lowest priority.\n");
313 313
314 trace_printk("Time: %lld (usecs)\n", time); 314 trace_printk("Time: %lld (usecs)\n", time);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 193e977a10ea..3fa5b8f3aae3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3225,7 +3225,7 @@ static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
3225 return -ENOMEM; 3225 return -ENOMEM;
3226 3226
3227 if (sscanf(buf, "%d", &attrs->nice) == 1 && 3227 if (sscanf(buf, "%d", &attrs->nice) == 1 &&
3228 attrs->nice >= -20 && attrs->nice <= 19) 3228 attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
3229 ret = apply_workqueue_attrs(wq, attrs); 3229 ret = apply_workqueue_attrs(wq, attrs);
3230 else 3230 else
3231 ret = -EINVAL; 3231 ret = -EINVAL;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index ae3c8f3595d4..f520b9da9c1f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2301,35 +2301,6 @@ static void sp_free(struct sp_node *n)
2301 kmem_cache_free(sn_cache, n); 2301 kmem_cache_free(sn_cache, n);
2302} 2302}
2303 2303
2304#ifdef CONFIG_NUMA_BALANCING
2305static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
2306{
2307 /* Never defer a private fault */
2308 if (cpupid_match_pid(p, last_cpupid))
2309 return false;
2310
2311 if (p->numa_migrate_deferred) {
2312 p->numa_migrate_deferred--;
2313 return true;
2314 }
2315 return false;
2316}
2317
2318static inline void defer_numa_migrate(struct task_struct *p)
2319{
2320 p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
2321}
2322#else
2323static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
2324{
2325 return false;
2326}
2327
2328static inline void defer_numa_migrate(struct task_struct *p)
2329{
2330}
2331#endif /* CONFIG_NUMA_BALANCING */
2332
2333/** 2304/**
2334 * mpol_misplaced - check whether current page node is valid in policy 2305 * mpol_misplaced - check whether current page node is valid in policy
2335 * 2306 *
@@ -2403,52 +2374,9 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
2403 2374
2404 /* Migrate the page towards the node whose CPU is referencing it */ 2375 /* Migrate the page towards the node whose CPU is referencing it */
2405 if (pol->flags & MPOL_F_MORON) { 2376 if (pol->flags & MPOL_F_MORON) {
2406 int last_cpupid;
2407 int this_cpupid;
2408
2409 polnid = thisnid; 2377 polnid = thisnid;
2410 this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid);
2411
2412 /*
2413 * Multi-stage node selection is used in conjunction
2414 * with a periodic migration fault to build a temporal
2415 * task<->page relation. By using a two-stage filter we
2416 * remove short/unlikely relations.
2417 *
2418 * Using P(p) ~ n_p / n_t as per frequentist
2419 * probability, we can equate a task's usage of a
2420 * particular page (n_p) per total usage of this
2421 * page (n_t) (in a given time-span) to a probability.
2422 *
2423 * Our periodic faults will sample this probability and
2424 * getting the same result twice in a row, given these
2425 * samples are fully independent, is then given by
2426 * P(n)^2, provided our sample period is sufficiently
2427 * short compared to the usage pattern.
2428 *
2429 * This quadric squishes small probabilities, making
2430 * it less likely we act on an unlikely task<->page
2431 * relation.
2432 */
2433 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
2434 if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
2435 2378
2436 /* See sysctl_numa_balancing_migrate_deferred comment */ 2379 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2437 if (!cpupid_match_pid(current, last_cpupid))
2438 defer_numa_migrate(current);
2439
2440 goto out;
2441 }
2442
2443 /*
2444 * The quadratic filter above reduces extraneous migration
2445 * of shared pages somewhat. This code reduces it even more,
2446 * reducing the overhead of page migrations of shared pages.
2447 * This makes workloads with shared pages rely more on
2448 * "move task near its memory", and less on "move memory
2449 * towards its task", which is exactly what we want.
2450 */
2451 if (numa_migrate_deferred(current, last_cpupid))
2452 goto out; 2380 goto out;
2453 } 2381 }
2454 2382
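
The block removed from mpol_misplaced() implemented a two-stage filter: a page only migrated toward the faulting CPU's node when two consecutive NUMA-hinting faults were attributed to that node, which under independent sampling turns a task's share p of a page's accesses into a migration probability of roughly p squared (a 0.3 share becomes about 0.09), damping migration of weakly related shared pages. The extra numa_migrate_deferred back-off layered on top is dropped outright, and the whole decision now sits behind should_numa_migrate_memory(), whose definition is not part of this section. A hedged user-space model of the two-stage part alone, with the field and helper names invented:

#include <stdbool.h>

/* toy model: remember which node the previous hinting fault came from */
struct page_stub {
        int last_fault_nid;     /* -1 means "no sample recorded yet" */
};

/*
 * Migrate only when the current fault repeats the node recorded by the
 * previous one; a single fault from a new node merely updates the record.
 * For a task owning a fraction p of the page's accesses, two independent
 * samples agreeing happens with probability about p * p.
 */
static bool should_migrate_two_stage(struct page_stub *page, int this_nid)
{
        int last_nid = page->last_fault_nid;

        page->last_fault_nid = this_nid;        /* record the current sample */
        return last_nid == this_nid;
}
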