-rw-r--r--  Documentation/sysctl/kernel.txt            |  10
-rw-r--r--  arch/arm/kernel/process.c                  |  16
-rw-r--r--  arch/powerpc/platforms/cell/spufs/sched.c  |   1
-rw-r--r--  arch/powerpc/platforms/pseries/setup.c     |  34
-rw-r--r--  arch/sh/kernel/idle.c                      |   4
-rw-r--r--  arch/x86/kernel/process.c                  |   5
-rw-r--r--  drivers/cpuidle/cpuidle-pseries.c          |   6
-rw-r--r--  include/linux/sched.h                      |  41
-rw-r--r--  include/linux/sched/prio.h                 |  40
-rw-r--r--  include/linux/sched/rt.h                   |  19
-rw-r--r--  kernel/Makefile                            |   1
-rw-r--r--  kernel/cpu/Makefile                        |   1
-rw-r--r--  kernel/cpu/idle.c                          |   7
-rw-r--r--  kernel/sched/Makefile                      |   2
-rw-r--r--  kernel/sched/core.c                        |  79
-rw-r--r--  kernel/sched/cputime.c                     |   4
-rw-r--r--  kernel/sched/deadline.c                    |  22
-rw-r--r--  kernel/sched/debug.c                       |   7
-rw-r--r--  kernel/sched/fair.c                        | 503
-rw-r--r--  kernel/sched/idle.c                        | 144
-rw-r--r--  kernel/sched/idle_task.c                   |  27
-rw-r--r--  kernel/sched/rt.c                          |  43
-rw-r--r--  kernel/sched/sched.h                       |  29
-rw-r--r--  kernel/sched/stop_task.c                   |  16
-rw-r--r--  kernel/sysctl.c                            |   7
-rw-r--r--  mm/mempolicy.c                             |  74
26 files changed, 746 insertions(+), 396 deletions(-)
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index e55124e7c40c..04bf16ad8561 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -441,8 +441,7 @@ feature should be disabled. Otherwise, if the system overhead from the
 feature is too high then the rate the kernel samples for NUMA hinting
 faults may be controlled by the numa_balancing_scan_period_min_ms,
 numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
-numa_balancing_scan_size_mb, numa_balancing_settle_count sysctls and
-numa_balancing_migrate_deferred.
+numa_balancing_scan_size_mb, and numa_balancing_settle_count sysctls.
 
 ==============================================================
 
@@ -483,13 +482,6 @@ rate for each task.
 numa_balancing_scan_size_mb is how many megabytes worth of pages are
 scanned for a given scan.
 
-numa_balancing_migrate_deferred is how many page migrations get skipped
-unconditionally, after a page migration is skipped because a page is shared
-with other tasks. This reduces page migration overhead, and determines
-how much stronger the "move task near its memory" policy scheduler becomes,
-versus the "move memory near its task" memory management policy, for workloads
-with shared memory.
-
 ==============================================================
 
 osrelease, ostype & version:
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 92f7b15dd221..adabeababeb0 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -30,7 +30,6 @@
 #include <linux/uaccess.h>
 #include <linux/random.h>
 #include <linux/hw_breakpoint.h>
-#include <linux/cpuidle.h>
 #include <linux/leds.h>
 #include <linux/reboot.h>
 
@@ -133,7 +132,11 @@ EXPORT_SYMBOL_GPL(arm_pm_restart);
 
 void (*arm_pm_idle)(void);
 
-static void default_idle(void)
+/*
+ * Called from the core idle loop.
+ */
+
+void arch_cpu_idle(void)
 {
 	if (arm_pm_idle)
 		arm_pm_idle();
@@ -168,15 +171,6 @@ void arch_cpu_idle_dead(void)
 #endif
 
 /*
- * Called from the core idle loop.
- */
-void arch_cpu_idle(void)
-{
-	if (cpuidle_idle_call())
-		default_idle();
-}
-
-/*
  * Called by kexec, immediately prior to machine_kexec().
  *
  * This must completely disable all secondary CPUs; simply causing those CPUs
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 49318385d4fa..4a0a64fe25df 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -83,7 +83,6 @@ static struct timer_list spuloadavg_timer;
 #define MIN_SPU_TIMESLICE	max(5 * HZ / (1000 * SPUSCHED_TICK), 1)
 #define DEF_SPU_TIMESLICE	(100 * HZ / (1000 * SPUSCHED_TICK))
 
-#define MAX_USER_PRIO		(MAX_PRIO - MAX_RT_PRIO)
 #define SCALE_PRIO(x, prio) \
 	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE)
 
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 972df0ffd4dc..2db8cc691bf4 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -39,7 +39,6 @@
 #include <linux/irq.h>
 #include <linux/seq_file.h>
 #include <linux/root_dev.h>
-#include <linux/cpuidle.h>
 #include <linux/of.h>
 #include <linux/kexec.h>
 
@@ -356,29 +355,24 @@ early_initcall(alloc_dispatch_log_kmem_cache);
 
 static void pseries_lpar_idle(void)
 {
-	/* This would call on the cpuidle framework, and the back-end pseries
-	 * driver to go to idle states
+	/*
+	 * Default handler to go into low thread priority and possibly
+	 * low power mode by cedeing processor to hypervisor
 	 */
-	if (cpuidle_idle_call()) {
-		/* On error, execute default handler
-		 * to go into low thread priority and possibly
-		 * low power mode by cedeing processor to hypervisor
-		 */
 
 	/* Indicate to hypervisor that we are idle. */
 	get_lppaca()->idle = 1;
 
 	/*
 	 * Yield the processor to the hypervisor.  We return if
 	 * an external interrupt occurs (which are driven prior
	 * to returning here) or if a prod occurs from another
 	 * processor. When returning here, external interrupts
 	 * are enabled.
 	 */
 	cede_processor();
 
 	get_lppaca()->idle = 0;
-	}
 }
 
 /*
diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c
index 2ea4483fd722..be616ee0cf87 100644
--- a/arch/sh/kernel/idle.c
+++ b/arch/sh/kernel/idle.c
@@ -16,7 +16,6 @@
 #include <linux/thread_info.h>
 #include <linux/irqflags.h>
 #include <linux/smp.h>
-#include <linux/cpuidle.h>
 #include <linux/atomic.h>
 #include <asm/pgalloc.h>
 #include <asm/smp.h>
@@ -40,8 +39,7 @@ void arch_cpu_idle_dead(void)
 
 void arch_cpu_idle(void)
 {
-	if (cpuidle_idle_call())
-		sh_idle();
+	sh_idle();
 }
 
 void __init select_idle_routine(void)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3fb8d95ab8b5..4505e2a950d8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -298,10 +298,7 @@ void arch_cpu_idle_dead(void)
  */
 void arch_cpu_idle(void)
 {
-	if (cpuidle_idle_call())
-		x86_idle();
-	else
-		local_irq_enable();
+	x86_idle();
 }
 
 /*
diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
index 7ab564aa0b1c..6f7b01956885 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -17,6 +17,7 @@
 #include <asm/reg.h>
 #include <asm/machdep.h>
 #include <asm/firmware.h>
+#include <asm/runlatch.h>
 #include <asm/plpar_wrappers.h>
 
 struct cpuidle_driver pseries_idle_driver = {
@@ -29,6 +30,7 @@ static struct cpuidle_state *cpuidle_state_table;
 
 static inline void idle_loop_prolog(unsigned long *in_purr)
 {
+	ppc64_runlatch_off();
 	*in_purr = mfspr(SPRN_PURR);
 	/*
 	 * Indicate to the HV that we are idle. Now would be
@@ -45,6 +47,10 @@ static inline void idle_loop_epilog(unsigned long in_purr)
 	wait_cycles += mfspr(SPRN_PURR) - in_purr;
 	get_lppaca()->wait_state_cycles = cpu_to_be64(wait_cycles);
 	get_lppaca()->idle = 0;
+
+	if (irqs_disabled())
+		local_irq_enable();
+	ppc64_runlatch_on();
 }
 
 static int snooze_loop(struct cpuidle_device *dev,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a781dec1cd0b..c49a2585ff7d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3,6 +3,8 @@
 
 #include <uapi/linux/sched.h>
 
+#include <linux/sched/prio.h>
+
 
 struct sched_param {
 	int sched_priority;
@@ -1077,6 +1079,7 @@ struct sched_entity {
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	int depth;
 	struct sched_entity *parent;
 	/* rq on which this entity is (to be) queued: */
 	struct cfs_rq *cfs_rq;
@@ -1470,9 +1473,10 @@ struct task_struct {
 	unsigned int numa_scan_period;
 	unsigned int numa_scan_period_max;
 	int numa_preferred_nid;
-	int numa_migrate_deferred;
 	unsigned long numa_migrate_retry;
 	u64 node_stamp;			/* migration stamp */
+	u64 last_task_numa_placement;
+	u64 last_sum_exec_runtime;
 	struct callback_head numa_work;
 
 	struct list_head numa_entry;
@@ -1483,15 +1487,22 @@ struct task_struct {
 	 * Scheduling placement decisions are made based on the these counts.
 	 * The values remain static for the duration of a PTE scan
 	 */
-	unsigned long *numa_faults;
+	unsigned long *numa_faults_memory;
 	unsigned long total_numa_faults;
 
 	/*
 	 * numa_faults_buffer records faults per node during the current
-	 * scan window. When the scan completes, the counts in numa_faults
-	 * decay and these values are copied.
+	 * scan window. When the scan completes, the counts in
+	 * numa_faults_memory decay and these values are copied.
+	 */
+	unsigned long *numa_faults_buffer_memory;
+
+	/*
+	 * Track the nodes the process was running on when a NUMA hinting
+	 * fault was incurred.
 	 */
-	unsigned long *numa_faults_buffer;
+	unsigned long *numa_faults_cpu;
+	unsigned long *numa_faults_buffer_cpu;
 
 	/*
	 * numa_faults_locality tracks if faults recorded during the last
@@ -1596,8 +1607,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);
 extern void set_numabalancing_state(bool enabled);
 extern void task_numa_free(struct task_struct *p);
-
-extern unsigned int sysctl_numa_balancing_migrate_deferred;
+extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
+					int src_nid, int dst_cpu);
 #else
 static inline void task_numa_fault(int last_node, int node, int pages,
 				   int flags)
@@ -1613,6 +1624,11 @@ static inline void set_numabalancing_state(bool enabled)
 static inline void task_numa_free(struct task_struct *p)
 {
 }
+static inline bool should_numa_migrate_memory(struct task_struct *p,
+				struct page *page, int src_nid, int dst_cpu)
+{
+	return true;
+}
 #endif
 
 static inline struct pid *task_pid(struct task_struct *task)
@@ -2080,7 +2096,16 @@ static inline void sched_autogroup_exit(struct signal_struct *sig) { }
 extern bool yield_to(struct task_struct *p, bool preempt);
 extern void set_user_nice(struct task_struct *p, long nice);
 extern int task_prio(const struct task_struct *p);
-extern int task_nice(const struct task_struct *p);
+/**
+ * task_nice - return the nice value of a given task.
+ * @p: the task in question.
+ *
+ * Return: The nice value [ -20 ... 0 ... 19 ].
+ */
+static inline int task_nice(const struct task_struct *p)
+{
+	return PRIO_TO_NICE((p)->static_prio);
+}
 extern int can_nice(const struct task_struct *p, const int nice);
 extern int task_curr(const struct task_struct *p);
 extern int idle_cpu(int cpu);
diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
new file mode 100644
index 000000000000..410ccb74c9e6
--- /dev/null
+++ b/include/linux/sched/prio.h
@@ -0,0 +1,40 @@
+#ifndef _SCHED_PRIO_H
+#define _SCHED_PRIO_H
+
+/*
+ * Priority of a process goes from 0..MAX_PRIO-1, valid RT
+ * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
+ * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
+ * values are inverted: lower p->prio value means higher priority.
+ *
+ * The MAX_USER_RT_PRIO value allows the actual maximum
+ * RT priority to be separate from the value exported to
+ * user-space.  This allows kernel threads to set their
+ * priority to a value higher than any user task. Note:
+ * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
+ */
+
+#define MAX_USER_RT_PRIO	100
+#define MAX_RT_PRIO		MAX_USER_RT_PRIO
+
+#define MAX_PRIO		(MAX_RT_PRIO + 40)
+#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)
+
+/*
+ * Convert user-nice values [ -20 ... 0 ... 19 ]
+ * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
+ * and back.
+ */
+#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
+#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
+
+/*
+ * 'User priority' is the nice value converted to something we
+ * can work with better when scaling various scheduler parameters,
+ * it's a [ 0 ... 39 ] range.
+ */
+#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
+#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
+#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
+
+#endif /* _SCHED_PRIO_H */
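
As a quick illustration of the conversion macros introduced by the new header above (a minimal standalone sketch, not part of the patch; it copies the relevant definitions so it can be compiled on its own):

#include <stdio.h>

/* Copied from the new <linux/sched/prio.h> above, for a standalone demo. */
#define MAX_USER_RT_PRIO	100
#define MAX_RT_PRIO		MAX_USER_RT_PRIO
#define MAX_PRIO		(MAX_RT_PRIO + 40)
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))

int main(void)
{
	/* nice -20..19 maps onto static priorities 100..139; nice 0 -> 120. */
	printf("NICE_TO_PRIO(-20) = %d\n", NICE_TO_PRIO(-20));	/* 100 */
	printf("NICE_TO_PRIO(0)   = %d\n", NICE_TO_PRIO(0));	/* 120 */
	printf("NICE_TO_PRIO(19)  = %d\n", NICE_TO_PRIO(19));	/* 139 */
	/* ...and back again. */
	printf("PRIO_TO_NICE(139) = %d\n", PRIO_TO_NICE(139));	/* 19 */
	/* MAX_USER_PRIO is the width of the nice range: 40. */
	printf("MAX_USER_PRIO     = %d\n", MAX_USER_PRIO);	/* 40 */
	return 0;
}
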
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index 34e4ebea8fce..f7453d4c5613 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -1,24 +1,7 @@
 #ifndef _SCHED_RT_H
 #define _SCHED_RT_H
 
-/*
- * Priority of a process goes from 0..MAX_PRIO-1, valid RT
- * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
- * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
- * values are inverted: lower p->prio value means higher priority.
- *
- * The MAX_USER_RT_PRIO value allows the actual maximum
- * RT priority to be separate from the value exported to
- * user-space.  This allows kernel threads to set their
- * priority to a value higher than any user task. Note:
- * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
- */
-
-#define MAX_USER_RT_PRIO	100
-#define MAX_RT_PRIO		MAX_USER_RT_PRIO
-
-#define MAX_PRIO		(MAX_RT_PRIO + 40)
-#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)
+#include <linux/sched/prio.h>
 
 static inline int rt_prio(int prio)
 {
diff --git a/kernel/Makefile b/kernel/Makefile
index bc010ee272b6..6f1c7e5cfca1 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -22,7 +22,6 @@ obj-y += sched/
 obj-y += locking/
 obj-y += power/
 obj-y += printk/
-obj-y += cpu/
 obj-y += irq/
 obj-y += rcu/
 
diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile
deleted file mode 100644
index 59ab052ef7a0..000000000000
--- a/kernel/cpu/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-obj-y = idle.o
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index 277f494c2a9a..b7976a127178 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -3,6 +3,7 @@
  */
 #include <linux/sched.h>
 #include <linux/cpu.h>
+#include <linux/cpuidle.h>
 #include <linux/tick.h>
 #include <linux/mm.h>
 #include <linux/stackprotector.h>
@@ -95,8 +96,10 @@ static void cpu_idle_loop(void)
 		if (!current_clr_polling_and_test()) {
 			stop_critical_timings();
 			rcu_idle_enter();
-			arch_cpu_idle();
-			WARN_ON_ONCE(irqs_disabled());
+			if (cpuidle_idle_call())
+				arch_cpu_idle();
+			if (WARN_ON_ONCE(irqs_disabled()))
+				local_irq_enable();
 			rcu_idle_exit();
 			start_critical_timings();
 		} else {
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a95c8c2af2a..ab32b7b0db5c 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -13,7 +13,7 @@ endif
 
 obj-y += core.o proc.o clock.o cputime.o
 obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-y += wait.o completion.o
+obj-y += wait.o completion.o idle.o
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b46131ef6aab..fb9764fbc537 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1745,8 +1745,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
 	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
 	p->numa_work.next = &p->numa_work;
-	p->numa_faults = NULL;
-	p->numa_faults_buffer = NULL;
+	p->numa_faults_memory = NULL;
+	p->numa_faults_buffer_memory = NULL;
+	p->last_task_numa_placement = 0;
+	p->last_sum_exec_runtime = 0;
 
 	INIT_LIST_HEAD(&p->numa_entry);
 	p->numa_group = NULL;
@@ -2167,13 +2169,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 
 #ifdef CONFIG_SMP
 
-/* assumes rq->lock is held */
-static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
-{
-	if (prev->sched_class->pre_schedule)
-		prev->sched_class->pre_schedule(rq, prev);
-}
-
 /* rq->lock is NOT held, but preemption is disabled */
 static inline void post_schedule(struct rq *rq)
 {
@@ -2191,10 +2186,6 @@ static inline void post_schedule(struct rq *rq)
 
 #else
 
-static inline void pre_schedule(struct rq *rq, struct task_struct *p)
-{
-}
-
 static inline void post_schedule(struct rq *rq)
 {
 }
@@ -2577,18 +2568,11 @@ static inline void schedule_debug(struct task_struct *prev)
 	schedstat_inc(this_rq(), sched_count);
 }
 
-static void put_prev_task(struct rq *rq, struct task_struct *prev)
-{
-	if (prev->on_rq || rq->skip_clock_update < 0)
-		update_rq_clock(rq);
-	prev->sched_class->put_prev_task(rq, prev);
-}
-
 /*
 * Pick up the highest-prio task:
 */
 static inline struct task_struct *
-pick_next_task(struct rq *rq)
+pick_next_task(struct rq *rq, struct task_struct *prev)
 {
 	const struct sched_class *class;
 	struct task_struct *p;
@@ -2597,14 +2581,15 @@ pick_next_task(struct rq *rq)
 	 * Optimization: we know that if all tasks are in
 	 * the fair class we can call that function directly:
 	 */
-	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
-		p = fair_sched_class.pick_next_task(rq);
+	if (likely(prev->sched_class == &fair_sched_class &&
+		   rq->nr_running == rq->cfs.h_nr_running)) {
+		p = fair_sched_class.pick_next_task(rq, prev);
 		if (likely(p))
 			return p;
 	}
 
 	for_each_class(class) {
-		p = class->pick_next_task(rq);
+		p = class->pick_next_task(rq, prev);
 		if (p)
 			return p;
 	}
@@ -2700,13 +2685,10 @@ need_resched:
 		switch_count = &prev->nvcsw;
 	}
 
-	pre_schedule(rq, prev);
-
-	if (unlikely(!rq->nr_running))
-		idle_balance(cpu, rq);
+	if (prev->on_rq || rq->skip_clock_update < 0)
+		update_rq_clock(rq);
 
-	put_prev_task(rq, prev);
-	next = pick_next_task(rq);
+	next = pick_next_task(rq, prev);
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
 	rq->skip_clock_update = 0;
@@ -2998,7 +2980,7 @@ void set_user_nice(struct task_struct *p, long nice)
 	unsigned long flags;
 	struct rq *rq;
 
-	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
+	if (task_nice(p) == nice || nice < -20 || nice > 19)
 		return;
 	/*
 	 * We have to be careful, if called from sys_setpriority(),
@@ -3076,7 +3058,7 @@ SYSCALL_DEFINE1(nice, int, increment)
 	if (increment > 40)
 		increment = 40;
 
-	nice = TASK_NICE(current) + increment;
+	nice = task_nice(current) + increment;
 	if (nice < -20)
 		nice = -20;
 	if (nice > 19)
@@ -3109,18 +3091,6 @@ int task_prio(const struct task_struct *p)
 }
 
 /**
- * task_nice - return the nice value of a given task.
- * @p: the task in question.
- *
- * Return: The nice value [ -20 ... 0 ... 19 ].
- */
-int task_nice(const struct task_struct *p)
-{
-	return TASK_NICE(p);
-}
-EXPORT_SYMBOL(task_nice);
-
-/**
  * idle_cpu - is a given cpu idle currently?
  * @cpu: the processor in question.
 *
@@ -3319,7 +3289,7 @@ recheck:
 	 */
 	if (user && !capable(CAP_SYS_NICE)) {
 		if (fair_policy(policy)) {
-			if (attr->sched_nice < TASK_NICE(p) &&
+			if (attr->sched_nice < task_nice(p) &&
 			    !can_nice(p, attr->sched_nice))
 				return -EPERM;
 		}
@@ -3343,7 +3313,7 @@ recheck:
 		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
 		 */
 		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
-			if (!can_nice(p, TASK_NICE(p)))
+			if (!can_nice(p, task_nice(p)))
 				return -EPERM;
 		}
 
@@ -3383,7 +3353,7 @@ recheck:
 	 * If not changing anything there's no need to proceed further:
 	 */
 	if (unlikely(policy == p->policy)) {
-		if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p))
+		if (fair_policy(policy) && attr->sched_nice != task_nice(p))
 			goto change;
 		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
 			goto change;
@@ -3835,7 +3805,7 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 	else if (task_has_rt_policy(p))
 		attr.sched_priority = p->rt_priority;
 	else
-		attr.sched_nice = TASK_NICE(p);
+		attr.sched_nice = task_nice(p);
 
 	rcu_read_unlock();
 
@@ -4751,7 +4721,7 @@ static void migrate_tasks(unsigned int dead_cpu)
 		if (rq->nr_running == 1)
 			break;
 
-		next = pick_next_task(rq);
+		next = pick_next_task(rq, NULL);
 		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
 
@@ -4841,7 +4811,7 @@ set_table_entry(struct ctl_table *entry,
 static struct ctl_table *
 sd_alloc_ctl_domain_table(struct sched_domain *sd)
 {
-	struct ctl_table *table = sd_alloc_ctl_entry(13);
+	struct ctl_table *table = sd_alloc_ctl_entry(14);
 
 	if (table == NULL)
 		return NULL;
@@ -4869,9 +4839,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
 		sizeof(int), 0644, proc_dointvec_minmax, false);
 	set_table_entry(&table[10], "flags", &sd->flags,
 		sizeof(int), 0644, proc_dointvec_minmax, false);
-	set_table_entry(&table[11], "name", sd->name,
+	set_table_entry(&table[11], "max_newidle_lb_cost",
+		&sd->max_newidle_lb_cost,
+		sizeof(long), 0644, proc_doulongvec_minmax, false);
+	set_table_entry(&table[12], "name", sd->name,
 		CORENAME_MAX_SIZE, 0444, proc_dostring, false);
-	/* &table[12] is terminator */
+	/* &table[13] is terminator */
 
 	return table;
 }
@@ -7008,7 +6981,7 @@ void normalize_rt_tasks(void)
 			 * Renice negative nice level userspace
 			 * tasks back to 0:
 			 */
-			if (TASK_NICE(p) < 0 && p->mm)
+			if (task_nice(p) < 0 && p->mm)
 				set_user_nice(p, 0);
 			continue;
 		}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 99947919e30b..58624a65f124 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -142,7 +142,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
 	p->utimescaled += cputime_scaled;
 	account_group_user_time(p, cputime);
 
-	index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+	index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
 
 	/* Add user time to cpustat. */
 	task_group_account_field(p, index, (__force u64) cputime);
@@ -169,7 +169,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
 	p->gtime += cputime;
 
 	/* Add guest time to cpustat. */
-	if (TASK_NICE(p) > 0) {
+	if (task_nice(p) > 0) {
 		cpustat[CPUTIME_NICE] += (__force u64) cputime;
 		cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
 	} else {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0dd5e0971a07..ed31ef66ab9d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -944,6 +944,8 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
 		resched_task(rq->curr);
 }
 
+static int pull_dl_task(struct rq *this_rq);
+
 #endif /* CONFIG_SMP */
 
 /*
@@ -990,7 +992,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
 	return rb_entry(left, struct sched_dl_entity, rb_node);
 }
 
-struct task_struct *pick_next_task_dl(struct rq *rq)
+struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
 {
 	struct sched_dl_entity *dl_se;
 	struct task_struct *p;
@@ -998,9 +1000,17 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
 
 	dl_rq = &rq->dl;
 
+#ifdef CONFIG_SMP
+	if (dl_task(prev))
+		pull_dl_task(rq);
+#endif
+
 	if (unlikely(!dl_rq->dl_nr_running))
 		return NULL;
 
+	if (prev)
+		prev->sched_class->put_prev_task(rq, prev);
+
 	dl_se = pick_next_dl_entity(rq, dl_rq);
 	BUG_ON(!dl_se);
 
@@ -1426,13 +1436,6 @@ skip:
 	return ret;
 }
 
-static void pre_schedule_dl(struct rq *rq, struct task_struct *prev)
-{
-	/* Try to pull other tasks here */
-	if (dl_task(prev))
-		pull_dl_task(rq);
-}
-
 static void post_schedule_dl(struct rq *rq)
 {
 	push_dl_tasks(rq);
@@ -1560,7 +1563,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
 	if (unlikely(p->dl.dl_throttled))
 		return;
 
-	if (p->on_rq || rq->curr != p) {
+	if (p->on_rq && rq->curr != p) {
 #ifdef CONFIG_SMP
 		if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
 			/* Only reschedule if pushing failed */
@@ -1625,7 +1628,6 @@ const struct sched_class dl_sched_class = {
 	.set_cpus_allowed	= set_cpus_allowed_dl,
 	.rq_online		= rq_online_dl,
 	.rq_offline		= rq_offline_dl,
-	.pre_schedule		= pre_schedule_dl,
 	.post_schedule		= post_schedule_dl,
 	.task_woken		= task_woken_dl,
 #endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index dd52e7ffb10e..f3344c31632a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -321,6 +321,7 @@ do {									\
 	P(sched_goidle);
 #ifdef CONFIG_SMP
 	P64(avg_idle);
+	P64(max_idle_balance_cost);
 #endif
 
 	P(ttwu_count);
@@ -533,15 +534,15 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
 			unsigned long nr_faults = -1;
 			int cpu_current, home_node;
 
-			if (p->numa_faults)
-				nr_faults = p->numa_faults[2*node + i];
+			if (p->numa_faults_memory)
+				nr_faults = p->numa_faults_memory[2*node + i];
 
 			cpu_current = !i ? (task_node(p) == node) :
 				(pol && node_isset(node, pol->v.nodes));
 
 			home_node = (p->numa_preferred_nid == node);
 
-			SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n",
+			SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n",
 				i, node, cpu_current, home_node, nr_faults);
 		}
 	}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 966cc2bfcb77..235cfa7ad8fc 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -322,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
-static inline int
+static inline struct cfs_rq *
 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 {
 	if (se->cfs_rq == pse->cfs_rq)
-		return 1;
+		return se->cfs_rq;
 
-	return 0;
+	return NULL;
 }
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -336,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 	return se->parent;
 }
 
-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
-	int depth = 0;
-
-	for_each_sched_entity(se)
-		depth++;
-
-	return depth;
-}
-
 static void
 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 {
@@ -360,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 	 */
 
 	/* First walk up until both entities are at same depth */
-	se_depth = depth_se(*se);
-	pse_depth = depth_se(*pse);
+	se_depth = (*se)->depth;
+	pse_depth = (*pse)->depth;
 
 	while (se_depth > pse_depth) {
 		se_depth--;
@@ -426,12 +415,6 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
-static inline int
-is_same_group(struct sched_entity *se, struct sched_entity *pse)
-{
-	return 1;
-}
-
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 {
 	return NULL;
@@ -819,14 +802,6 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
 unsigned int sysctl_numa_balancing_scan_delay = 1000;
 
-/*
- * After skipping a page migration on a shared page, skip N more numa page
- * migrations unconditionally. This reduces the number of NUMA migrations
- * in shared memory workloads, and has the effect of pulling tasks towards
- * where their memory lives, over pulling the memory towards the task.
- */
-unsigned int sysctl_numa_balancing_migrate_deferred = 16;
-
 static unsigned int task_nr_scan_windows(struct task_struct *p)
 {
 	unsigned long rss = 0;
@@ -893,10 +868,26 @@ struct numa_group {
 	struct list_head task_list;
 
 	struct rcu_head rcu;
+	nodemask_t active_nodes;
 	unsigned long total_faults;
+	/*
+	 * Faults_cpu is used to decide whether memory should move
+	 * towards the CPU. As a consequence, these stats are weighted
+	 * more by CPU use than by memory faults.
+	 */
+	unsigned long *faults_cpu;
 	unsigned long faults[0];
 };
 
+/* Shared or private faults. */
+#define NR_NUMA_HINT_FAULT_TYPES 2
+
+/* Memory and CPU locality */
+#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
+
+/* Averaged statistics, and temporary buffers. */
+#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
+
 pid_t task_numa_group_id(struct task_struct *p)
 {
 	return p->numa_group ? p->numa_group->gid : 0;
@@ -904,16 +895,16 @@ pid_t task_numa_group_id(struct task_struct *p)
 
 static inline int task_faults_idx(int nid, int priv)
 {
-	return 2 * nid + priv;
+	return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
 }
 
 static inline unsigned long task_faults(struct task_struct *p, int nid)
 {
-	if (!p->numa_faults)
+	if (!p->numa_faults_memory)
 		return 0;
 
-	return p->numa_faults[task_faults_idx(nid, 0)] +
-		p->numa_faults[task_faults_idx(nid, 1)];
+	return p->numa_faults_memory[task_faults_idx(nid, 0)] +
+		p->numa_faults_memory[task_faults_idx(nid, 1)];
 }
 
 static inline unsigned long group_faults(struct task_struct *p, int nid)
@@ -925,6 +916,12 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
 		p->numa_group->faults[task_faults_idx(nid, 1)];
 }
 
+static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
+{
+	return group->faults_cpu[task_faults_idx(nid, 0)] +
+		group->faults_cpu[task_faults_idx(nid, 1)];
+}
+
 /*
 * These return the fraction of accesses done by a particular task, or
 * task group, on a particular numa node.  The group weight is given a
@@ -935,7 +932,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
 {
 	unsigned long total_faults;
 
-	if (!p->numa_faults)
+	if (!p->numa_faults_memory)
 		return 0;
 
 	total_faults = p->total_numa_faults;
@@ -954,6 +951,69 @@ static inline unsigned long group_weight(struct task_struct *p, int nid)
 	return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
 }
 
+bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
+				int src_nid, int dst_cpu)
+{
+	struct numa_group *ng = p->numa_group;
+	int dst_nid = cpu_to_node(dst_cpu);
+	int last_cpupid, this_cpupid;
+
+	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
+
+	/*
+	 * Multi-stage node selection is used in conjunction with a periodic
+	 * migration fault to build a temporal task<->page relation. By using
+	 * a two-stage filter we remove short/unlikely relations.
+	 *
+	 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
+	 * a task's usage of a particular page (n_p) per total usage of this
+	 * page (n_t) (in a given time-span) to a probability.
+	 *
+	 * Our periodic faults will sample this probability and getting the
+	 * same result twice in a row, given these samples are fully
+	 * independent, is then given by P(n)^2, provided our sample period
+	 * is sufficiently short compared to the usage pattern.
+	 *
	 * This quadric squishes small probabilities, making it less likely we
+	 * act on an unlikely task<->page relation.
+	 */
+	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+	if (!cpupid_pid_unset(last_cpupid) &&
+				cpupid_to_nid(last_cpupid) != dst_nid)
+		return false;
+
+	/* Always allow migrate on private faults */
+	if (cpupid_match_pid(p, last_cpupid))
+		return true;
+
+	/* A shared fault, but p->numa_group has not been set up yet. */
+	if (!ng)
+		return true;
+
+	/*
+	 * Do not migrate if the destination is not a node that
+	 * is actively used by this numa group.
+	 */
+	if (!node_isset(dst_nid, ng->active_nodes))
+		return false;
+
+	/*
+	 * Source is a node that is not actively used by this
+	 * numa group, while the destination is. Migrate.
+	 */
+	if (!node_isset(src_nid, ng->active_nodes))
+		return true;
+
+	/*
+	 * Both source and destination are nodes in active
+	 * use by this numa group. Maximize memory bandwidth
+	 * by migrating from more heavily used groups, to less
+	 * heavily used ones, spreading the load around.
+	 * Use a 1/4 hysteresis to avoid spurious page movement.
+	 */
+	return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+}
+
 static unsigned long weighted_cpuload(const int cpu);
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
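
To put a number on the two-stage filter described in the comment above (an illustrative calculation, not part of the patch): if a task is responsible for only 10% of the accesses to a shared page, P(p) is roughly 0.1, so the chance that two consecutive, independent hinting-fault samples both land on that task is about 0.1^2 = 0.01, and such a weak task<->page relation rarely triggers a migration; a task doing 90% of the accesses passes the filter with probability around 0.81.
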
@@ -1267,7 +1327,7 @@ static int task_numa_migrate(struct task_struct *p)
 static void numa_migrate_preferred(struct task_struct *p)
 {
 	/* This task has no NUMA fault statistics yet */
-	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
+	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
 		return;
 
 	/* Periodically retry migrating the task to the preferred node */
@@ -1282,6 +1342,38 @@ static void numa_migrate_preferred(struct task_struct *p)
 }
 
 /*
+ * Find the nodes on which the workload is actively running. We do this by
+ * tracking the nodes from which NUMA hinting faults are triggered. This can
+ * be different from the set of nodes where the workload's memory is currently
+ * located.
+ *
+ * The bitmask is used to make smarter decisions on when to do NUMA page
+ * migrations, To prevent flip-flopping, and excessive page migrations, nodes
+ * are added when they cause over 6/16 of the maximum number of faults, but
+ * only removed when they drop below 3/16.
+ */
+static void update_numa_active_node_mask(struct numa_group *numa_group)
+{
+	unsigned long faults, max_faults = 0;
+	int nid;
+
+	for_each_online_node(nid) {
+		faults = group_faults_cpu(numa_group, nid);
+		if (faults > max_faults)
+			max_faults = faults;
+	}
+
+	for_each_online_node(nid) {
+		faults = group_faults_cpu(numa_group, nid);
+		if (!node_isset(nid, numa_group->active_nodes)) {
+			if (faults > max_faults * 6 / 16)
+				node_set(nid, numa_group->active_nodes);
+		} else if (faults < max_faults * 3 / 16)
+			node_clear(nid, numa_group->active_nodes);
+	}
+}
+
+/*
 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
 * increments. The more local the fault statistics are, the higher the scan
 * period will be for the next scan window. If local/remote ratio is below
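
Worked example of the 6/16 and 3/16 hysteresis above (illustrative numbers, not from the patch): if the busiest node in a group shows max_faults = 1600 CPU faults, a node is only added to active_nodes once it exceeds 1600 * 6 / 16 = 600 faults, and it is only cleared again once it falls below 1600 * 3 / 16 = 300, so a node hovering around 450 faults does not flap in and out of the mask.
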
@@ -1355,11 +1447,41 @@ static void update_task_scan_period(struct task_struct *p,
 	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
 }
 
+/*
+ * Get the fraction of time the task has been running since the last
+ * NUMA placement cycle. The scheduler keeps similar statistics, but
+ * decays those on a 32ms period, which is orders of magnitude off
+ * from the dozens-of-seconds NUMA balancing period. Use the scheduler
+ * stats only if the task is so new there are no NUMA statistics yet.
+ */
+static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
+{
+	u64 runtime, delta, now;
+	/* Use the start of this time slice to avoid calculations. */
+	now = p->se.exec_start;
+	runtime = p->se.sum_exec_runtime;
+
+	if (p->last_task_numa_placement) {
+		delta = runtime - p->last_sum_exec_runtime;
+		*period = now - p->last_task_numa_placement;
+	} else {
+		delta = p->se.avg.runnable_avg_sum;
+		*period = p->se.avg.runnable_avg_period;
+	}
+
+	p->last_sum_exec_runtime = runtime;
+	p->last_task_numa_placement = now;
+
+	return delta;
+}
+
 static void task_numa_placement(struct task_struct *p)
 {
 	int seq, nid, max_nid = -1, max_group_nid = -1;
 	unsigned long max_faults = 0, max_group_faults = 0;
 	unsigned long fault_types[2] = { 0, 0 };
+	unsigned long total_faults;
+	u64 runtime, period;
 	spinlock_t *group_lock = NULL;
 
 	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
@@ -1368,6 +1490,10 @@ static void task_numa_placement(struct task_struct *p)
 	p->numa_scan_seq = seq;
 	p->numa_scan_period_max = task_scan_max(p);
 
+	total_faults = p->numa_faults_locality[0] +
+		       p->numa_faults_locality[1];
+	runtime = numa_get_avg_runtime(p, &period);
+
 	/* If the task is part of a group prevent parallel updates to group stats */
 	if (p->numa_group) {
 		group_lock = &p->numa_group->lock;
@@ -1379,24 +1505,37 @@ static void task_numa_placement(struct task_struct *p)
 		unsigned long faults = 0, group_faults = 0;
 		int priv, i;
 
-		for (priv = 0; priv < 2; priv++) {
-			long diff;
+		for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
+			long diff, f_diff, f_weight;
 
 			i = task_faults_idx(nid, priv);
-			diff = -p->numa_faults[i];
 
 			/* Decay existing window, copy faults since last scan */
-			p->numa_faults[i] >>= 1;
-			p->numa_faults[i] += p->numa_faults_buffer[i];
-			fault_types[priv] += p->numa_faults_buffer[i];
-			p->numa_faults_buffer[i] = 0;
+			diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
+			fault_types[priv] += p->numa_faults_buffer_memory[i];
+			p->numa_faults_buffer_memory[i] = 0;
 
-			faults += p->numa_faults[i];
-			diff += p->numa_faults[i];
+			/*
+			 * Normalize the faults_from, so all tasks in a group
+			 * count according to CPU use, instead of by the raw
+			 * number of faults. Tasks with little runtime have
+			 * little over-all impact on throughput, and thus their
+			 * faults are less important.
+			 */
+			f_weight = div64_u64(runtime << 16, period + 1);
+			f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
+				   (total_faults + 1);
+			f_diff = f_weight - p->numa_faults_cpu[i] / 2;
+			p->numa_faults_buffer_cpu[i] = 0;
+
+			p->numa_faults_memory[i] += diff;
+			p->numa_faults_cpu[i] += f_diff;
+			faults += p->numa_faults_memory[i];
 			p->total_numa_faults += diff;
 			if (p->numa_group) {
 				/* safe because we can only change our own group */
 				p->numa_group->faults[i] += diff;
+				p->numa_group->faults_cpu[i] += f_diff;
 				p->numa_group->total_faults += diff;
 				group_faults += p->numa_group->faults[i];
 			}
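
As a rough worked example of the normalization above (illustrative numbers, not from the patch): a task that ran for about half of the measured period gets f_weight = div64_u64(runtime << 16, period + 1) of roughly 32768, i.e. half of 1 << 16, so its buffered CPU faults carry about half the weight of those from a task that was running for the whole period, before being scaled by total_faults + 1 and folded into numa_faults_cpu[].
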
@@ -1416,6 +1555,7 @@ static void task_numa_placement(struct task_struct *p)
1416 update_task_scan_period(p, fault_types[0], fault_types[1]); 1555 update_task_scan_period(p, fault_types[0], fault_types[1]);
1417 1556
1418 if (p->numa_group) { 1557 if (p->numa_group) {
1558 update_numa_active_node_mask(p->numa_group);
1419 /* 1559 /*
1420 * If the preferred task and group nids are different, 1560 * If the preferred task and group nids are different,
1421 * iterate over the nodes again to find the best place. 1561 * iterate over the nodes again to find the best place.
@@ -1465,7 +1605,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1465 1605
1466 if (unlikely(!p->numa_group)) { 1606 if (unlikely(!p->numa_group)) {
1467 unsigned int size = sizeof(struct numa_group) + 1607 unsigned int size = sizeof(struct numa_group) +
1468 2*nr_node_ids*sizeof(unsigned long); 1608 4*nr_node_ids*sizeof(unsigned long);
1469 1609
1470 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); 1610 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1471 if (!grp) 1611 if (!grp)
@@ -1475,9 +1615,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1475 spin_lock_init(&grp->lock); 1615 spin_lock_init(&grp->lock);
1476 INIT_LIST_HEAD(&grp->task_list); 1616 INIT_LIST_HEAD(&grp->task_list);
1477 grp->gid = p->pid; 1617 grp->gid = p->pid;
1618 /* Second half of the array tracks nids where faults happen */
1619 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
1620 nr_node_ids;
1621
1622 node_set(task_node(current), grp->active_nodes);
1478 1623
1479 for (i = 0; i < 2*nr_node_ids; i++) 1624 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1480 grp->faults[i] = p->numa_faults[i]; 1625 grp->faults[i] = p->numa_faults_memory[i];
1481 1626
1482 grp->total_faults = p->total_numa_faults; 1627 grp->total_faults = p->total_numa_faults;
1483 1628
@@ -1534,9 +1679,9 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1534 1679
1535 double_lock(&my_grp->lock, &grp->lock); 1680 double_lock(&my_grp->lock, &grp->lock);
1536 1681
1537 for (i = 0; i < 2*nr_node_ids; i++) { 1682 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
1538 my_grp->faults[i] -= p->numa_faults[i]; 1683 my_grp->faults[i] -= p->numa_faults_memory[i];
1539 grp->faults[i] += p->numa_faults[i]; 1684 grp->faults[i] += p->numa_faults_memory[i];
1540 } 1685 }
1541 my_grp->total_faults -= p->total_numa_faults; 1686 my_grp->total_faults -= p->total_numa_faults;
1542 grp->total_faults += p->total_numa_faults; 1687 grp->total_faults += p->total_numa_faults;
@@ -1562,12 +1707,12 @@ void task_numa_free(struct task_struct *p)
1562{ 1707{
1563 struct numa_group *grp = p->numa_group; 1708 struct numa_group *grp = p->numa_group;
1564 int i; 1709 int i;
1565 void *numa_faults = p->numa_faults; 1710 void *numa_faults = p->numa_faults_memory;
1566 1711
1567 if (grp) { 1712 if (grp) {
1568 spin_lock(&grp->lock); 1713 spin_lock(&grp->lock);
1569 for (i = 0; i < 2*nr_node_ids; i++) 1714 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1570 grp->faults[i] -= p->numa_faults[i]; 1715 grp->faults[i] -= p->numa_faults_memory[i];
1571 grp->total_faults -= p->total_numa_faults; 1716 grp->total_faults -= p->total_numa_faults;
1572 1717
1573 list_del(&p->numa_entry); 1718 list_del(&p->numa_entry);
@@ -1577,18 +1722,21 @@ void task_numa_free(struct task_struct *p)
1577 put_numa_group(grp); 1722 put_numa_group(grp);
1578 } 1723 }
1579 1724
1580 p->numa_faults = NULL; 1725 p->numa_faults_memory = NULL;
1581 p->numa_faults_buffer = NULL; 1726 p->numa_faults_buffer_memory = NULL;
1727 p->numa_faults_cpu= NULL;
1728 p->numa_faults_buffer_cpu = NULL;
1582 kfree(numa_faults); 1729 kfree(numa_faults);
1583} 1730}
1584 1731
1585/* 1732/*
1586 * Got a PROT_NONE fault for a page on @node. 1733 * Got a PROT_NONE fault for a page on @node.
1587 */ 1734 */
1588void task_numa_fault(int last_cpupid, int node, int pages, int flags) 1735void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1589{ 1736{
1590 struct task_struct *p = current; 1737 struct task_struct *p = current;
1591 bool migrated = flags & TNF_MIGRATED; 1738 bool migrated = flags & TNF_MIGRATED;
1739 int cpu_node = task_node(current);
1592 int priv; 1740 int priv;
1593 1741
1594 if (!numabalancing_enabled) 1742 if (!numabalancing_enabled)
@@ -1603,16 +1751,24 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
1603 return; 1751 return;
1604 1752
1605 /* Allocate buffer to track faults on a per-node basis */ 1753 /* Allocate buffer to track faults on a per-node basis */
1606 if (unlikely(!p->numa_faults)) { 1754 if (unlikely(!p->numa_faults_memory)) {
1607 int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; 1755 int size = sizeof(*p->numa_faults_memory) *
1756 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
1608 1757
1609 /* numa_faults and numa_faults_buffer share the allocation */ 1758 p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
1610 p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); 1759 if (!p->numa_faults_memory)
1611 if (!p->numa_faults)
1612 return; 1760 return;
1613 1761
1614 BUG_ON(p->numa_faults_buffer); 1762 BUG_ON(p->numa_faults_buffer_memory);
1615 p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); 1763 /*
1764 * The averaged statistics, shared & private, memory & cpu,
1765 * occupy the first half of the array. The second half of the
1766 * array is for current counters, which are averaged into the
1767 * first set by task_numa_placement.
1768 */
1769 p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
1770 p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
1771 p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
1616 p->total_numa_faults = 0; 1772 p->total_numa_faults = 0;
1617 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 1773 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1618 } 1774 }
@@ -1641,7 +1797,8 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
1641 if (migrated) 1797 if (migrated)
1642 p->numa_pages_migrated += pages; 1798 p->numa_pages_migrated += pages;
1643 1799
1644 p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; 1800 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
1801 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
1645 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; 1802 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
1646} 1803}
1647 1804
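
The hunk above carves the per-task NUMA fault statistics out of one allocation with four regions: averaged memory faults, averaged cpu faults, then the per-scan buffers for each, every region holding two counters (private and shared) per node. A minimal userspace sketch of that layout and of the index calculation, assuming the two-counters-per-node convention of task_faults_idx(); the names below are stand-ins, not the kernel's:

    #include <stdlib.h>

    /*
     * Sketch: one allocation carved into four per-node regions, in the
     * same order as the offsets in the hunk above:
     *   [0 * nr_node_ids]  averaged memory faults
     *   [2 * nr_node_ids]  averaged cpu faults
     *   [4 * nr_node_ids]  memory fault buffer (current scan window)
     *   [6 * nr_node_ids]  cpu fault buffer
     * Each region holds 2 counters per node: [private, shared].
     */
    struct fault_stats {
        unsigned long *faults_memory;
        unsigned long *faults_cpu;
        unsigned long *buffer_memory;
        unsigned long *buffer_cpu;
    };

    static int faults_idx(int nid, int priv)    /* role of task_faults_idx() */
    {
        return 2 * nid + priv;
    }

    static int fault_stats_init(struct fault_stats *s, int nr_node_ids)
    {
        unsigned long *buf = calloc(8 * nr_node_ids, sizeof(*buf));

        if (!buf)
            return -1;
        s->faults_memory = buf;
        s->faults_cpu    = buf + 2 * nr_node_ids;
        s->buffer_memory = buf + 4 * nr_node_ids;
        s->buffer_cpu    = buf + 6 * nr_node_ids;
        return 0;
    }

Recording a fault then becomes s->buffer_memory[faults_idx(mem_node, priv)] += pages, mirroring the last hunk above.
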
@@ -2414,7 +2571,8 @@ void idle_exit_fair(struct rq *this_rq)
2414 update_rq_runnable_avg(this_rq, 0); 2571 update_rq_runnable_avg(this_rq, 0);
2415} 2572}
2416 2573
2417#else 2574#else /* CONFIG_SMP */
2575
2418static inline void update_entity_load_avg(struct sched_entity *se, 2576static inline void update_entity_load_avg(struct sched_entity *se,
2419 int update_cfs_rq) {} 2577 int update_cfs_rq) {}
2420static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} 2578static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
@@ -2426,7 +2584,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2426 int sleep) {} 2584 int sleep) {}
2427static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, 2585static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
2428 int force_update) {} 2586 int force_update) {}
2429#endif 2587#endif /* CONFIG_SMP */
2430 2588
2431static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 2589static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
2432{ 2590{
@@ -2576,10 +2734,10 @@ static void __clear_buddies_last(struct sched_entity *se)
2576{ 2734{
2577 for_each_sched_entity(se) { 2735 for_each_sched_entity(se) {
2578 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2736 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2579 if (cfs_rq->last == se) 2737 if (cfs_rq->last != se)
2580 cfs_rq->last = NULL;
2581 else
2582 break; 2738 break;
2739
2740 cfs_rq->last = NULL;
2583 } 2741 }
2584} 2742}
2585 2743
@@ -2587,10 +2745,10 @@ static void __clear_buddies_next(struct sched_entity *se)
2587{ 2745{
2588 for_each_sched_entity(se) { 2746 for_each_sched_entity(se) {
2589 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2747 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2590 if (cfs_rq->next == se) 2748 if (cfs_rq->next != se)
2591 cfs_rq->next = NULL;
2592 else
2593 break; 2749 break;
2750
2751 cfs_rq->next = NULL;
2594 } 2752 }
2595} 2753}
2596 2754
@@ -2598,10 +2756,10 @@ static void __clear_buddies_skip(struct sched_entity *se)
2598{ 2756{
2599 for_each_sched_entity(se) { 2757 for_each_sched_entity(se) {
2600 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2758 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2601 if (cfs_rq->skip == se) 2759 if (cfs_rq->skip != se)
2602 cfs_rq->skip = NULL;
2603 else
2604 break; 2760 break;
2761
2762 cfs_rq->skip = NULL;
2605 } 2763 }
2606} 2764}
2607 2765
@@ -2744,17 +2902,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
2744 * 3) pick the "last" process, for cache locality 2902 * 3) pick the "last" process, for cache locality
2745 * 4) do not run the "skip" process, if something else is available 2903 * 4) do not run the "skip" process, if something else is available
2746 */ 2904 */
2747static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 2905static struct sched_entity *
2906pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2748{ 2907{
2749 struct sched_entity *se = __pick_first_entity(cfs_rq); 2908 struct sched_entity *left = __pick_first_entity(cfs_rq);
2750 struct sched_entity *left = se; 2909 struct sched_entity *se;
2910
2911 /*
 2912	 * If curr is set we have to see if it's left of the leftmost entity
2913 * still in the tree, provided there was anything in the tree at all.
2914 */
2915 if (!left || (curr && entity_before(curr, left)))
2916 left = curr;
2917
2918 se = left; /* ideally we run the leftmost entity */
2751 2919
2752 /* 2920 /*
2753 * Avoid running the skip buddy, if running something else can 2921 * Avoid running the skip buddy, if running something else can
2754 * be done without getting too unfair. 2922 * be done without getting too unfair.
2755 */ 2923 */
2756 if (cfs_rq->skip == se) { 2924 if (cfs_rq->skip == se) {
2757 struct sched_entity *second = __pick_next_entity(se); 2925 struct sched_entity *second;
2926
2927 if (se == curr) {
2928 second = __pick_first_entity(cfs_rq);
2929 } else {
2930 second = __pick_next_entity(se);
2931 if (!second || (curr && entity_before(curr, second)))
2932 second = curr;
2933 }
2934
2758 if (second && wakeup_preempt_entity(second, left) < 1) 2935 if (second && wakeup_preempt_entity(second, left) < 1)
2759 se = second; 2936 se = second;
2760 } 2937 }
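
Because the optimized pick path below can reach this code without put_prev_entity() having run, the currently running entity is not in the rbtree and has to be compared against the tree's leftmost entity by hand. A self-contained sketch of just that comparison, with buddy handling left out and stand-in types instead of the kernel's:

    #include <stddef.h>

    struct entity {
        unsigned long long vruntime;
    };

    /* Same signedness trick as entity_before(): compare vruntimes modulo wrap. */
    static int before(const struct entity *a, const struct entity *b)
    {
        return (long long)(a->vruntime - b->vruntime) < 0;
    }

    /*
     * The running entity (curr) is kept out of the tree, so the effective
     * "leftmost" is whichever of (first-in-tree, curr) has the smaller
     * vruntime; curr also covers the case of an otherwise empty tree.
     */
    static struct entity *pick_leftmost(struct entity *tree_first, struct entity *curr)
    {
        struct entity *left = tree_first;

        if (!left || (curr && before(curr, left)))
            left = curr;

        return left;
    }
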
@@ -2776,7 +2953,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
2776 return se; 2953 return se;
2777} 2954}
2778 2955
2779static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); 2956static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
2780 2957
2781static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) 2958static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
2782{ 2959{
@@ -3431,22 +3608,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
3431} 3608}
3432 3609
3433/* conditionally throttle active cfs_rq's from put_prev_entity() */ 3610/* conditionally throttle active cfs_rq's from put_prev_entity() */
3434static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) 3611static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3435{ 3612{
3436 if (!cfs_bandwidth_used()) 3613 if (!cfs_bandwidth_used())
3437 return; 3614 return false;
3438 3615
3439 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) 3616 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
3440 return; 3617 return false;
3441 3618
3442 /* 3619 /*
3443 * it's possible for a throttled entity to be forced into a running 3620 * it's possible for a throttled entity to be forced into a running
3444 * state (e.g. set_curr_task), in this case we're finished. 3621 * state (e.g. set_curr_task), in this case we're finished.
3445 */ 3622 */
3446 if (cfs_rq_throttled(cfs_rq)) 3623 if (cfs_rq_throttled(cfs_rq))
3447 return; 3624 return true;
3448 3625
3449 throttle_cfs_rq(cfs_rq); 3626 throttle_cfs_rq(cfs_rq);
3627 return true;
3450} 3628}
3451 3629
3452static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) 3630static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -3556,7 +3734,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3556} 3734}
3557 3735
3558static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} 3736static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
3559static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3737static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
3560static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 3738static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
3561static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3739static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3562 3740
@@ -4492,26 +4670,125 @@ preempt:
4492 set_last_buddy(se); 4670 set_last_buddy(se);
4493} 4671}
4494 4672
4495static struct task_struct *pick_next_task_fair(struct rq *rq) 4673static struct task_struct *
4674pick_next_task_fair(struct rq *rq, struct task_struct *prev)
4496{ 4675{
4497 struct task_struct *p;
4498 struct cfs_rq *cfs_rq = &rq->cfs; 4676 struct cfs_rq *cfs_rq = &rq->cfs;
4499 struct sched_entity *se; 4677 struct sched_entity *se;
4678 struct task_struct *p;
4500 4679
4680again: __maybe_unused
4681#ifdef CONFIG_FAIR_GROUP_SCHED
4501 if (!cfs_rq->nr_running) 4682 if (!cfs_rq->nr_running)
4502 return NULL; 4683 goto idle;
4684
4685 if (!prev || prev->sched_class != &fair_sched_class)
4686 goto simple;
4687
4688 /*
4689 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
4690 * likely that a next task is from the same cgroup as the current.
4691 *
4692 * Therefore attempt to avoid putting and setting the entire cgroup
4693 * hierarchy, only change the part that actually changes.
4694 */
4503 4695
4504 do { 4696 do {
4505 se = pick_next_entity(cfs_rq); 4697 struct sched_entity *curr = cfs_rq->curr;
4698
4699 /*
4700 * Since we got here without doing put_prev_entity() we also
4701 * have to consider cfs_rq->curr. If it is still a runnable
4702 * entity, update_curr() will update its vruntime, otherwise
4703 * forget we've ever seen it.
4704 */
4705 if (curr && curr->on_rq)
4706 update_curr(cfs_rq);
4707 else
4708 curr = NULL;
4709
4710 /*
4711 * This call to check_cfs_rq_runtime() will do the throttle and
4712 * dequeue its entity in the parent(s). Therefore the 'simple'
4713 * nr_running test will indeed be correct.
4714 */
4715 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
4716 goto simple;
4717
4718 se = pick_next_entity(cfs_rq, curr);
4719 cfs_rq = group_cfs_rq(se);
4720 } while (cfs_rq);
4721
4722 p = task_of(se);
4723
4724 /*
4725 * Since we haven't yet done put_prev_entity and if the selected task
4726 * is a different task than we started out with, try and touch the
4727 * least amount of cfs_rqs.
4728 */
4729 if (prev != p) {
4730 struct sched_entity *pse = &prev->se;
4731
4732 while (!(cfs_rq = is_same_group(se, pse))) {
4733 int se_depth = se->depth;
4734 int pse_depth = pse->depth;
4735
4736 if (se_depth <= pse_depth) {
4737 put_prev_entity(cfs_rq_of(pse), pse);
4738 pse = parent_entity(pse);
4739 }
4740 if (se_depth >= pse_depth) {
4741 set_next_entity(cfs_rq_of(se), se);
4742 se = parent_entity(se);
4743 }
4744 }
4745
4746 put_prev_entity(cfs_rq, pse);
4747 set_next_entity(cfs_rq, se);
4748 }
4749
4750 if (hrtick_enabled(rq))
4751 hrtick_start_fair(rq, p);
4752
4753 return p;
4754simple:
4755 cfs_rq = &rq->cfs;
4756#endif
4757
4758 if (!cfs_rq->nr_running)
4759 goto idle;
4760
4761 if (prev)
4762 prev->sched_class->put_prev_task(rq, prev);
4763
4764 do {
4765 se = pick_next_entity(cfs_rq, NULL);
4506 set_next_entity(cfs_rq, se); 4766 set_next_entity(cfs_rq, se);
4507 cfs_rq = group_cfs_rq(se); 4767 cfs_rq = group_cfs_rq(se);
4508 } while (cfs_rq); 4768 } while (cfs_rq);
4509 4769
4510 p = task_of(se); 4770 p = task_of(se);
4771
4511 if (hrtick_enabled(rq)) 4772 if (hrtick_enabled(rq))
4512 hrtick_start_fair(rq, p); 4773 hrtick_start_fair(rq, p);
4513 4774
4514 return p; 4775 return p;
4776
4777idle:
4778#ifdef CONFIG_SMP
4779 idle_enter_fair(rq);
4780 /*
4781 * We must set idle_stamp _before_ calling idle_balance(), such that we
4782 * measure the duration of idle_balance() as idle time.
4783 */
4784 rq->idle_stamp = rq_clock(rq);
4785 if (idle_balance(rq)) { /* drops rq->lock */
4786 rq->idle_stamp = 0;
4787 goto again;
4788 }
4789#endif
4790
4791 return NULL;
4515} 4792}
4516 4793
4517/* 4794/*
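
The interesting part of the new fast path is the fix-up at the end: when prev and the freshly picked task live in different parts of the cgroup hierarchy, the two entity chains are walked upward in lockstep, guided by the new se->depth field, so only the levels that actually differ get put and set. A runnable sketch of that meet-in-the-middle walk over a stand-in entity type (the printf calls stand in for put_prev_entity()/set_next_entity()):

    #include <stdio.h>

    struct ent {
        struct ent *parent;   /* NULL at the root */
        int depth;            /* cached distance from the root */
        const char *name;
    };

    /*
     * Walk prev's chain (put) and next's chain (set) upward until both
     * sit in the same group, i.e. share a parent. The deeper chain
     * advances first; equal depths advance together, as in the hunk above.
     */
    static void switch_entities(struct ent *se, struct ent *pse)
    {
        while (se->parent != pse->parent) {
            int se_depth = se->depth;
            int pse_depth = pse->depth;

            if (se_depth <= pse_depth) {
                printf("put %s\n", pse->name);   /* put_prev_entity() */
                pse = pse->parent;
            }
            if (se_depth >= pse_depth) {
                printf("set %s\n", se->name);    /* set_next_entity() */
                se = se->parent;
            }
        }
        /* same group reached: one final put/set pair */
        printf("put %s\nset %s\n", pse->name, se->name);
    }

    int main(void)
    {
        struct ent group_a = { NULL, 0, "groupA" }, group_b = { NULL, 0, "groupB" };
        struct ent prev    = { &group_a, 1, "prev task" };
        struct ent next    = { &group_b, 1, "next task" };

        switch_entities(&next, &prev);
        return 0;
    }

For this example the walk puts and sets the two tasks and then their differing parent groups, but touches nothing above the level where the hierarchies meet.
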
@@ -4783,7 +5060,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
4783{ 5060{
4784 int src_nid, dst_nid; 5061 int src_nid, dst_nid;
4785 5062
4786 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || 5063 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
4787 !(env->sd->flags & SD_NUMA)) { 5064 !(env->sd->flags & SD_NUMA)) {
4788 return false; 5065 return false;
4789 } 5066 }
@@ -4814,7 +5091,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
4814 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) 5091 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
4815 return false; 5092 return false;
4816 5093
4817 if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) 5094 if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
4818 return false; 5095 return false;
4819 5096
4820 src_nid = cpu_to_node(env->src_cpu); 5097 src_nid = cpu_to_node(env->src_cpu);
@@ -6357,17 +6634,16 @@ out:
6357 * idle_balance is called by schedule() if this_cpu is about to become 6634 * idle_balance is called by schedule() if this_cpu is about to become
6358 * idle. Attempts to pull tasks from other CPUs. 6635 * idle. Attempts to pull tasks from other CPUs.
6359 */ 6636 */
6360void idle_balance(int this_cpu, struct rq *this_rq) 6637int idle_balance(struct rq *this_rq)
6361{ 6638{
6362 struct sched_domain *sd; 6639 struct sched_domain *sd;
6363 int pulled_task = 0; 6640 int pulled_task = 0;
6364 unsigned long next_balance = jiffies + HZ; 6641 unsigned long next_balance = jiffies + HZ;
6365 u64 curr_cost = 0; 6642 u64 curr_cost = 0;
6366 6643 int this_cpu = this_rq->cpu;
6367 this_rq->idle_stamp = rq_clock(this_rq);
6368 6644
6369 if (this_rq->avg_idle < sysctl_sched_migration_cost) 6645 if (this_rq->avg_idle < sysctl_sched_migration_cost)
6370 return; 6646 return 0;
6371 6647
6372 /* 6648 /*
6373 * Drop the rq->lock, but keep IRQ/preempt disabled. 6649 * Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6405,15 +6681,20 @@ void idle_balance(int this_cpu, struct rq *this_rq)
6405 interval = msecs_to_jiffies(sd->balance_interval); 6681 interval = msecs_to_jiffies(sd->balance_interval);
6406 if (time_after(next_balance, sd->last_balance + interval)) 6682 if (time_after(next_balance, sd->last_balance + interval))
6407 next_balance = sd->last_balance + interval; 6683 next_balance = sd->last_balance + interval;
6408 if (pulled_task) { 6684 if (pulled_task)
6409 this_rq->idle_stamp = 0;
6410 break; 6685 break;
6411 }
6412 } 6686 }
6413 rcu_read_unlock(); 6687 rcu_read_unlock();
6414 6688
6415 raw_spin_lock(&this_rq->lock); 6689 raw_spin_lock(&this_rq->lock);
6416 6690
6691 /*
6692 * While browsing the domains, we released the rq lock.
 6693 * A task could have been enqueued in the meantime
6694 */
6695 if (this_rq->nr_running && !pulled_task)
6696 return 1;
6697
6417 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 6698 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
6418 /* 6699 /*
6419 * We are going idle. next_balance may be set based on 6700 * We are going idle. next_balance may be set based on
@@ -6424,6 +6705,8 @@ void idle_balance(int this_cpu, struct rq *this_rq)
6424 6705
6425 if (curr_cost > this_rq->max_idle_balance_cost) 6706 if (curr_cost > this_rq->max_idle_balance_cost)
6426 this_rq->max_idle_balance_cost = curr_cost; 6707 this_rq->max_idle_balance_cost = curr_cost;
6708
6709 return pulled_task;
6427} 6710}
6428 6711
6429/* 6712/*
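
Two things change around idle_balance(): the caller now owns rq->idle_stamp (set before the call, cleared when a task was pulled, so the time spent balancing is accounted as idle time), and the function reports whether it found work so the fair pick path above can retry. The re-check of nr_running after re-taking the lock is the usual drop-lock pattern; a small pthread sketch of that pattern in isolation, with illustrative names only:

    #include <pthread.h>

    struct runqueue {
        pthread_mutex_t lock;
        int nr_running;
    };

    /*
     * Drop the lock to do slow balancing work, re-take it, then re-check
     * the protected state: a task may have been enqueued locally while
     * the lock was not held, and that counts as success too, so the
     * caller goes back and picks a task instead of idling.
     */
    static int try_pull_work(struct runqueue *rq)
    {
        int pulled = 0;

        pthread_mutex_unlock(&rq->lock);
        /* ... expensive cross-CPU balancing would happen here ... */
        pthread_mutex_lock(&rq->lock);

        if (rq->nr_running && !pulled)
            return 1;

        return pulled;
    }
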
@@ -7082,7 +7365,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
7082#ifdef CONFIG_FAIR_GROUP_SCHED 7365#ifdef CONFIG_FAIR_GROUP_SCHED
7083static void task_move_group_fair(struct task_struct *p, int on_rq) 7366static void task_move_group_fair(struct task_struct *p, int on_rq)
7084{ 7367{
7368 struct sched_entity *se = &p->se;
7085 struct cfs_rq *cfs_rq; 7369 struct cfs_rq *cfs_rq;
7370
7086 /* 7371 /*
7087 * If the task was not on the rq at the time of this cgroup movement 7372 * If the task was not on the rq at the time of this cgroup movement
7088 * it must have been asleep, sleeping tasks keep their ->vruntime 7373 * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -7108,23 +7393,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
7108 * To prevent boost or penalty in the new cfs_rq caused by delta 7393 * To prevent boost or penalty in the new cfs_rq caused by delta
7109 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. 7394 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
7110 */ 7395 */
7111 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) 7396 if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
7112 on_rq = 1; 7397 on_rq = 1;
7113 7398
7114 if (!on_rq) 7399 if (!on_rq)
7115 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; 7400 se->vruntime -= cfs_rq_of(se)->min_vruntime;
7116 set_task_rq(p, task_cpu(p)); 7401 set_task_rq(p, task_cpu(p));
7402 se->depth = se->parent ? se->parent->depth + 1 : 0;
7117 if (!on_rq) { 7403 if (!on_rq) {
7118 cfs_rq = cfs_rq_of(&p->se); 7404 cfs_rq = cfs_rq_of(se);
7119 p->se.vruntime += cfs_rq->min_vruntime; 7405 se->vruntime += cfs_rq->min_vruntime;
7120#ifdef CONFIG_SMP 7406#ifdef CONFIG_SMP
7121 /* 7407 /*
7122 * migrate_task_rq_fair() will have removed our previous 7408 * migrate_task_rq_fair() will have removed our previous
7123 * contribution, but we must synchronize for ongoing future 7409 * contribution, but we must synchronize for ongoing future
7124 * decay. 7410 * decay.
7125 */ 7411 */
7126 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); 7412 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
7127 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; 7413 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
7128#endif 7414#endif
7129 } 7415 }
7130} 7416}
@@ -7220,10 +7506,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7220 if (!se) 7506 if (!se)
7221 return; 7507 return;
7222 7508
7223 if (!parent) 7509 if (!parent) {
7224 se->cfs_rq = &rq->cfs; 7510 se->cfs_rq = &rq->cfs;
7225 else 7511 se->depth = 0;
7512 } else {
7226 se->cfs_rq = parent->my_q; 7513 se->cfs_rq = parent->my_q;
7514 se->depth = parent->depth + 1;
7515 }
7227 7516
7228 se->my_q = cfs_rq; 7517 se->my_q = cfs_rq;
7229 /* guarantee group entities always have weight */ 7518 /* guarantee group entities always have weight */
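
Both hunks above maintain the same invariant for the new se->depth field: root-level entities sit at depth 0 and every child is one deeper than its parent, which is what lets the lockstep walk earlier in this file align two chains without counting parents each time. A minimal sketch of that invariant, with a stand-in type:

    struct gent {
        struct gent *parent;   /* NULL at the root */
        int depth;
    };

    /* Same rule as init_tg_cfs_entry()/task_move_group_fair() above. */
    static void attach_entity(struct gent *se, struct gent *parent)
    {
        se->parent = parent;
        se->depth = parent ? parent->depth + 1 : 0;
    }
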
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
new file mode 100644
index 000000000000..14ca43430aee
--- /dev/null
+++ b/kernel/sched/idle.c
@@ -0,0 +1,144 @@
1/*
2 * Generic entry point for the idle threads
3 */
4#include <linux/sched.h>
5#include <linux/cpu.h>
6#include <linux/cpuidle.h>
7#include <linux/tick.h>
8#include <linux/mm.h>
9#include <linux/stackprotector.h>
10
11#include <asm/tlb.h>
12
13#include <trace/events/power.h>
14
15static int __read_mostly cpu_idle_force_poll;
16
17void cpu_idle_poll_ctrl(bool enable)
18{
19 if (enable) {
20 cpu_idle_force_poll++;
21 } else {
22 cpu_idle_force_poll--;
23 WARN_ON_ONCE(cpu_idle_force_poll < 0);
24 }
25}
26
27#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
28static int __init cpu_idle_poll_setup(char *__unused)
29{
30 cpu_idle_force_poll = 1;
31 return 1;
32}
33__setup("nohlt", cpu_idle_poll_setup);
34
35static int __init cpu_idle_nopoll_setup(char *__unused)
36{
37 cpu_idle_force_poll = 0;
38 return 1;
39}
40__setup("hlt", cpu_idle_nopoll_setup);
41#endif
42
43static inline int cpu_idle_poll(void)
44{
45 rcu_idle_enter();
46 trace_cpu_idle_rcuidle(0, smp_processor_id());
47 local_irq_enable();
48 while (!tif_need_resched())
49 cpu_relax();
50 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
51 rcu_idle_exit();
52 return 1;
53}
54
55/* Weak implementations for optional arch specific functions */
56void __weak arch_cpu_idle_prepare(void) { }
57void __weak arch_cpu_idle_enter(void) { }
58void __weak arch_cpu_idle_exit(void) { }
59void __weak arch_cpu_idle_dead(void) { }
60void __weak arch_cpu_idle(void)
61{
62 cpu_idle_force_poll = 1;
63 local_irq_enable();
64}
65
66/*
67 * Generic idle loop implementation
68 */
69static void cpu_idle_loop(void)
70{
71 while (1) {
72 tick_nohz_idle_enter();
73
74 while (!need_resched()) {
75 check_pgt_cache();
76 rmb();
77
78 if (cpu_is_offline(smp_processor_id()))
79 arch_cpu_idle_dead();
80
81 local_irq_disable();
82 arch_cpu_idle_enter();
83
84 /*
85 * In poll mode we reenable interrupts and spin.
86 *
87 * Also if we detected in the wakeup from idle
88 * path that the tick broadcast device expired
89 * for us, we don't want to go deep idle as we
90 * know that the IPI is going to arrive right
91 * away
92 */
93 if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
94 cpu_idle_poll();
95 } else {
96 if (!current_clr_polling_and_test()) {
97 stop_critical_timings();
98 rcu_idle_enter();
99 if (cpuidle_idle_call())
100 arch_cpu_idle();
101 if (WARN_ON_ONCE(irqs_disabled()))
102 local_irq_enable();
103 rcu_idle_exit();
104 start_critical_timings();
105 } else {
106 local_irq_enable();
107 }
108 __current_set_polling();
109 }
110 arch_cpu_idle_exit();
111 /*
112 * We need to test and propagate the TIF_NEED_RESCHED
113 * bit here because we might not have send the
114 * reschedule IPI to idle tasks.
115 */
116 if (tif_need_resched())
117 set_preempt_need_resched();
118 }
119 tick_nohz_idle_exit();
120 schedule_preempt_disabled();
121 }
122}
123
124void cpu_startup_entry(enum cpuhp_state state)
125{
126 /*
127 * This #ifdef needs to die, but it's too late in the cycle to
128 * make this generic (arm and sh have never invoked the canary
129 * init for the non boot cpus!). Will be fixed in 3.11
130 */
131#ifdef CONFIG_X86
132 /*
133 * If we're the non-boot CPU, nothing set the stack canary up
134 * for us. The boot CPU already has it initialized but no harm
135 * in doing it again. This is a good place for updating it, as
 136 * we won't ever return from this function (so the invalid
 137 * canaries already on the stack won't ever trigger).
138 */
139 boot_init_stack_canary();
140#endif
141 __current_set_polling();
142 arch_cpu_idle_prepare();
143 cpu_idle_loop();
144}
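
cpu_idle_poll_ctrl() above is a counted switch rather than a flag: every enable must be paired with a disable, and while the count is non-zero the loop spins in cpu_idle_poll() instead of going through cpuidle. A hedged usage sketch; the surrounding function is illustrative and not part of this patch:

    /*
     * Illustrative only: bracket a latency-critical window so idle CPUs
     * poll instead of entering deep idle states. Enables and disables
     * must balance, since cpu_idle_force_poll is a counter.
     */
    static void run_latency_critical(void (*work)(void *), void *arg)
    {
        cpu_idle_poll_ctrl(true);    /* force polling idle */
        work(arg);
        cpu_idle_poll_ctrl(false);   /* back to normal idle selection */
    }
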
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 516c3d9ceea1..f7d03af79a5b 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,18 +13,8 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
13{ 13{
14 return task_cpu(p); /* IDLE tasks as never migrated */ 14 return task_cpu(p); /* IDLE tasks as never migrated */
15} 15}
16
17static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
18{
19 idle_exit_fair(rq);
20 rq_last_tick_reset(rq);
21}
22
23static void post_schedule_idle(struct rq *rq)
24{
25 idle_enter_fair(rq);
26}
27#endif /* CONFIG_SMP */ 16#endif /* CONFIG_SMP */
17
28/* 18/*
29 * Idle tasks are unconditionally rescheduled: 19 * Idle tasks are unconditionally rescheduled:
30 */ 20 */
@@ -33,12 +23,15 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
33 resched_task(rq->idle); 23 resched_task(rq->idle);
34} 24}
35 25
36static struct task_struct *pick_next_task_idle(struct rq *rq) 26static struct task_struct *
27pick_next_task_idle(struct rq *rq, struct task_struct *prev)
37{ 28{
29 if (prev)
30 prev->sched_class->put_prev_task(rq, prev);
31
38 schedstat_inc(rq, sched_goidle); 32 schedstat_inc(rq, sched_goidle);
39#ifdef CONFIG_SMP 33#ifdef CONFIG_SMP
40 /* Trigger the post schedule to do an idle_enter for CFS */ 34 idle_enter_fair(rq);
41 rq->post_schedule = 1;
42#endif 35#endif
43 return rq->idle; 36 return rq->idle;
44} 37}
@@ -58,6 +51,10 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
58 51
59static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) 52static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
60{ 53{
54#ifdef CONFIG_SMP
55 idle_exit_fair(rq);
56 rq_last_tick_reset(rq);
57#endif
61} 58}
62 59
63static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 60static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
@@ -101,8 +98,6 @@ const struct sched_class idle_sched_class = {
101 98
102#ifdef CONFIG_SMP 99#ifdef CONFIG_SMP
103 .select_task_rq = select_task_rq_idle, 100 .select_task_rq = select_task_rq_idle,
104 .pre_schedule = pre_schedule_idle,
105 .post_schedule = post_schedule_idle,
106#endif 101#endif
107 102
108 .set_curr_task = set_curr_task_idle, 103 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a2740b775b45..72f9ec759972 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -229,6 +229,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
229 229
230#ifdef CONFIG_SMP 230#ifdef CONFIG_SMP
231 231
232static int pull_rt_task(struct rq *this_rq);
233
232static inline int rt_overloaded(struct rq *rq) 234static inline int rt_overloaded(struct rq *rq)
233{ 235{
234 return atomic_read(&rq->rd->rto_count); 236 return atomic_read(&rq->rd->rto_count);
@@ -1310,15 +1312,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1310{ 1312{
1311 struct sched_rt_entity *rt_se; 1313 struct sched_rt_entity *rt_se;
1312 struct task_struct *p; 1314 struct task_struct *p;
1313 struct rt_rq *rt_rq; 1315 struct rt_rq *rt_rq = &rq->rt;
1314
1315 rt_rq = &rq->rt;
1316
1317 if (!rt_rq->rt_nr_running)
1318 return NULL;
1319
1320 if (rt_rq_throttled(rt_rq))
1321 return NULL;
1322 1316
1323 do { 1317 do {
1324 rt_se = pick_next_rt_entity(rq, rt_rq); 1318 rt_se = pick_next_rt_entity(rq, rt_rq);
@@ -1332,9 +1326,28 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1332 return p; 1326 return p;
1333} 1327}
1334 1328
1335static struct task_struct *pick_next_task_rt(struct rq *rq) 1329static struct task_struct *
1330pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1336{ 1331{
1337 struct task_struct *p = _pick_next_task_rt(rq); 1332 struct task_struct *p;
1333 struct rt_rq *rt_rq = &rq->rt;
1334
1335#ifdef CONFIG_SMP
1336 /* Try to pull RT tasks here if we lower this rq's prio */
1337 if (rq->rt.highest_prio.curr > prev->prio)
1338 pull_rt_task(rq);
1339#endif
1340
1341 if (!rt_rq->rt_nr_running)
1342 return NULL;
1343
1344 if (rt_rq_throttled(rt_rq))
1345 return NULL;
1346
1347 if (prev)
1348 prev->sched_class->put_prev_task(rq, prev);
1349
1350 p = _pick_next_task_rt(rq);
1338 1351
1339 /* The running task is never eligible for pushing */ 1352 /* The running task is never eligible for pushing */
1340 if (p) 1353 if (p)
@@ -1716,13 +1729,6 @@ skip:
1716 return ret; 1729 return ret;
1717} 1730}
1718 1731
1719static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1720{
1721 /* Try to pull RT tasks here if we lower this rq's prio */
1722 if (rq->rt.highest_prio.curr > prev->prio)
1723 pull_rt_task(rq);
1724}
1725
1726static void post_schedule_rt(struct rq *rq) 1732static void post_schedule_rt(struct rq *rq)
1727{ 1733{
1728 push_rt_tasks(rq); 1734 push_rt_tasks(rq);
@@ -1999,7 +2005,6 @@ const struct sched_class rt_sched_class = {
1999 .set_cpus_allowed = set_cpus_allowed_rt, 2005 .set_cpus_allowed = set_cpus_allowed_rt,
2000 .rq_online = rq_online_rt, 2006 .rq_online = rq_online_rt,
2001 .rq_offline = rq_offline_rt, 2007 .rq_offline = rq_offline_rt,
2002 .pre_schedule = pre_schedule_rt,
2003 .post_schedule = post_schedule_rt, 2008 .post_schedule = post_schedule_rt,
2004 .task_woken = task_woken_rt, 2009 .task_woken = task_woken_rt,
2005 .switched_from = switched_from_rt, 2010 .switched_from = switched_from_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c2119fd20f8b..1bf34c257d3b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -24,24 +24,6 @@ extern long calc_load_fold_active(struct rq *this_rq);
24extern void update_cpu_load_active(struct rq *this_rq); 24extern void update_cpu_load_active(struct rq *this_rq);
25 25
26/* 26/*
27 * Convert user-nice values [ -20 ... 0 ... 19 ]
28 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
29 * and back.
30 */
31#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
32#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
33#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
34
35/*
36 * 'User priority' is the nice value converted to something we
37 * can work with better when scaling various scheduler parameters,
38 * it's a [ 0 ... 39 ] range.
39 */
40#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
41#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
42#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
43
44/*
45 * Helpers for converting nanosecond timing to jiffy resolution 27 * Helpers for converting nanosecond timing to jiffy resolution
46 */ 28 */
47#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 29#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
@@ -1123,14 +1105,19 @@ struct sched_class {
1123 1105
1124 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); 1106 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
1125 1107
1126 struct task_struct * (*pick_next_task) (struct rq *rq); 1108 /*
1109 * It is the responsibility of the pick_next_task() method that will
1110 * return the next task to call put_prev_task() on the @prev task or
1111 * something equivalent.
1112 */
1113 struct task_struct * (*pick_next_task) (struct rq *rq,
1114 struct task_struct *prev);
1127 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1115 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
1128 1116
1129#ifdef CONFIG_SMP 1117#ifdef CONFIG_SMP
1130 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); 1118 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
1131 void (*migrate_task_rq)(struct task_struct *p, int next_cpu); 1119 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
1132 1120
1133 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
1134 void (*post_schedule) (struct rq *this_rq); 1121 void (*post_schedule) (struct rq *this_rq);
1135 void (*task_waking) (struct task_struct *task); 1122 void (*task_waking) (struct task_struct *task);
1136 void (*task_woken) (struct rq *this_rq, struct task_struct *task); 1123 void (*task_woken) (struct rq *this_rq, struct task_struct *task);
@@ -1176,7 +1163,7 @@ extern const struct sched_class idle_sched_class;
1176extern void update_group_power(struct sched_domain *sd, int cpu); 1163extern void update_group_power(struct sched_domain *sd, int cpu);
1177 1164
1178extern void trigger_load_balance(struct rq *rq); 1165extern void trigger_load_balance(struct rq *rq);
1179extern void idle_balance(int this_cpu, struct rq *this_rq); 1166extern int idle_balance(struct rq *this_rq);
1180 1167
1181extern void idle_enter_fair(struct rq *this_rq); 1168extern void idle_enter_fair(struct rq *this_rq);
1182extern void idle_exit_fair(struct rq *this_rq); 1169extern void idle_exit_fair(struct rq *this_rq);
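
The comment added to struct sched_class captures the contract the rest of this series relies on: pick_next_task() now receives @prev and must itself put it (or do something equivalent) before returning a task, and must leave it untouched when it returns NULL so the core code can fall through to the next class. A minimal sketch of a method honoring that contract; the example_* helpers are hypothetical and do not name an existing class:

    static struct task_struct *
    pick_next_task_example(struct rq *rq, struct task_struct *prev)
    {
        struct task_struct *next = example_peek_task(rq);   /* hypothetical */

        if (!next)
            return NULL;             /* nothing runnable: leave prev alone */

        if (prev)
            prev->sched_class->put_prev_task(rq, prev);

        example_set_next_task(rq, next);                     /* hypothetical */
        return next;
    }

This is the same shape as the rt and stop_task versions in this diff; the fair class is the one that uses the "or something equivalent" latitude to avoid the full put/set walk.
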
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index fdb6bb0b3356..a4147c9d2017 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,16 +23,20 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
23 /* we're never preempted */ 23 /* we're never preempted */
24} 24}
25 25
26static struct task_struct *pick_next_task_stop(struct rq *rq) 26static struct task_struct *
27pick_next_task_stop(struct rq *rq, struct task_struct *prev)
27{ 28{
28 struct task_struct *stop = rq->stop; 29 struct task_struct *stop = rq->stop;
29 30
30 if (stop && stop->on_rq) { 31 if (!stop || !stop->on_rq)
31 stop->se.exec_start = rq_clock_task(rq); 32 return NULL;
32 return stop;
33 }
34 33
35 return NULL; 34 if (prev)
35 prev->sched_class->put_prev_task(rq, prev);
36
37 stop->se.exec_start = rq_clock_task(rq);
38
39 return stop;
36} 40}
37 41
38static void 42static void
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 49e13e1f8fe6..7754ff16f334 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -386,13 +386,6 @@ static struct ctl_table kern_table[] = {
386 .proc_handler = proc_dointvec, 386 .proc_handler = proc_dointvec,
387 }, 387 },
388 { 388 {
389 .procname = "numa_balancing_migrate_deferred",
390 .data = &sysctl_numa_balancing_migrate_deferred,
391 .maxlen = sizeof(unsigned int),
392 .mode = 0644,
393 .proc_handler = proc_dointvec,
394 },
395 {
396 .procname = "numa_balancing", 389 .procname = "numa_balancing",
397 .data = NULL, /* filled in by handler */ 390 .data = NULL, /* filled in by handler */
398 .maxlen = sizeof(unsigned int), 391 .maxlen = sizeof(unsigned int),
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index ae3c8f3595d4..f520b9da9c1f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2301,35 +2301,6 @@ static void sp_free(struct sp_node *n)
2301 kmem_cache_free(sn_cache, n); 2301 kmem_cache_free(sn_cache, n);
2302} 2302}
2303 2303
2304#ifdef CONFIG_NUMA_BALANCING
2305static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
2306{
2307 /* Never defer a private fault */
2308 if (cpupid_match_pid(p, last_cpupid))
2309 return false;
2310
2311 if (p->numa_migrate_deferred) {
2312 p->numa_migrate_deferred--;
2313 return true;
2314 }
2315 return false;
2316}
2317
2318static inline void defer_numa_migrate(struct task_struct *p)
2319{
2320 p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
2321}
2322#else
2323static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
2324{
2325 return false;
2326}
2327
2328static inline void defer_numa_migrate(struct task_struct *p)
2329{
2330}
2331#endif /* CONFIG_NUMA_BALANCING */
2332
2333/** 2304/**
2334 * mpol_misplaced - check whether current page node is valid in policy 2305 * mpol_misplaced - check whether current page node is valid in policy
2335 * 2306 *
@@ -2403,52 +2374,9 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
2403 2374
2404 /* Migrate the page towards the node whose CPU is referencing it */ 2375 /* Migrate the page towards the node whose CPU is referencing it */
2405 if (pol->flags & MPOL_F_MORON) { 2376 if (pol->flags & MPOL_F_MORON) {
2406 int last_cpupid;
2407 int this_cpupid;
2408
2409 polnid = thisnid; 2377 polnid = thisnid;
2410 this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid);
2411
2412 /*
2413 * Multi-stage node selection is used in conjunction
2414 * with a periodic migration fault to build a temporal
2415 * task<->page relation. By using a two-stage filter we
2416 * remove short/unlikely relations.
2417 *
2418 * Using P(p) ~ n_p / n_t as per frequentist
2419 * probability, we can equate a task's usage of a
2420 * particular page (n_p) per total usage of this
2421 * page (n_t) (in a given time-span) to a probability.
2422 *
2423 * Our periodic faults will sample this probability and
2424 * getting the same result twice in a row, given these
2425 * samples are fully independent, is then given by
2426 * P(n)^2, provided our sample period is sufficiently
2427 * short compared to the usage pattern.
2428 *
2429 * This quadric squishes small probabilities, making
2430 * it less likely we act on an unlikely task<->page
2431 * relation.
2432 */
2433 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
2434 if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
2435 2378
2436 /* See sysctl_numa_balancing_migrate_deferred comment */ 2379 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2437 if (!cpupid_match_pid(current, last_cpupid))
2438 defer_numa_migrate(current);
2439
2440 goto out;
2441 }
2442
2443 /*
2444 * The quadratic filter above reduces extraneous migration
2445 * of shared pages somewhat. This code reduces it even more,
2446 * reducing the overhead of page migrations of shared pages.
2447 * This makes workloads with shared pages rely more on
2448 * "move task near its memory", and less on "move memory
2449 * towards its task", which is exactly what we want.
2450 */
2451 if (numa_migrate_deferred(current, last_cpupid))
2452 goto out; 2380 goto out;
2453 } 2381 }
2454 2382
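
The policy that used to live inline here, the multi-stage cpupid filter described in the removed comment, moves behind should_numa_migrate_memory() in the scheduler, where it can also consult the per-group fault data added earlier in this diff. A standalone sketch of just the two-stage idea, with the cpupid reduced to a single stored stamp; the real helper keys on the node encoded in the page's last_cpupid and layers further private- and group-based checks on top:

    #include <stdbool.h>

    /*
     * Record where the last hinting fault on this page came from and only
     * migrate once two consecutive faults agree: a single stray access is
     * not enough evidence of a stable task<->page relation.
     */
    static bool two_stage_filter(int *last_stamp, int this_stamp)
    {
        int prev = *last_stamp;

        *last_stamp = this_stamp;        /* like page_cpupid_xchg_last() */

        return prev == this_stamp;       /* migrate only on a repeat */
    }
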