Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile            1
-rw-r--r--  kernel/cpu/Makefile        1
-rw-r--r--  kernel/cpu/idle.c          7
-rw-r--r--  kernel/sched/Makefile      2
-rw-r--r--  kernel/sched/core.c       79
-rw-r--r--  kernel/sched/cputime.c     4
-rw-r--r--  kernel/sched/deadline.c   22
-rw-r--r--  kernel/sched/debug.c       7
-rw-r--r--  kernel/sched/fair.c      503
-rw-r--r--  kernel/sched/idle.c      144
-rw-r--r--  kernel/sched/idle_task.c  27
-rw-r--r--  kernel/sched/rt.c         43
-rw-r--r--  kernel/sched/sched.h      29
-rw-r--r--  kernel/sched/stop_task.c  16
-rw-r--r--  kernel/sysctl.c            7
15 files changed, 643 insertions, 249 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index bc010ee272b6..6f1c7e5cfca1 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -22,7 +22,6 @@ obj-y += sched/
 obj-y += locking/
 obj-y += power/
 obj-y += printk/
-obj-y += cpu/
 obj-y += irq/
 obj-y += rcu/
 
diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile
deleted file mode 100644
index 59ab052ef7a0..000000000000
--- a/kernel/cpu/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-obj-y = idle.o
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index 277f494c2a9a..b7976a127178 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -3,6 +3,7 @@
  */
 #include <linux/sched.h>
 #include <linux/cpu.h>
+#include <linux/cpuidle.h>
 #include <linux/tick.h>
 #include <linux/mm.h>
 #include <linux/stackprotector.h>
@@ -95,8 +96,10 @@ static void cpu_idle_loop(void)
 		if (!current_clr_polling_and_test()) {
 			stop_critical_timings();
 			rcu_idle_enter();
-			arch_cpu_idle();
-			WARN_ON_ONCE(irqs_disabled());
+			if (cpuidle_idle_call())
+				arch_cpu_idle();
+			if (WARN_ON_ONCE(irqs_disabled()))
+				local_irq_enable();
 			rcu_idle_exit();
 			start_critical_timings();
 		} else {
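
The hunk above makes the generic idle loop try the cpuidle framework first and only fall back to the architecture default when cpuidle_idle_call() reports it could not handle the request; it also recovers, with a one-time warning, if an idle routine returns with interrupts still disabled. A minimal userspace sketch of that control flow, with stubbed helpers standing in for cpuidle_idle_call(), arch_cpu_idle() and the local IRQ flag (all names here are placeholders, not the kernel API):

#include <stdbool.h>
#include <stdio.h>

static bool irqs_enabled;          /* models the local IRQ flag */

static int cpuidle_enter(void)     /* stand-in for cpuidle_idle_call() */
{
	return -1;                 /* pretend no cpuidle driver is registered */
}

static void default_idle(void)     /* stand-in for arch_cpu_idle() */
{
	irqs_enabled = true;       /* real arch idle routines re-enable IRQs */
}

static void idle_once(void)
{
	irqs_enabled = false;      /* idle is entered with IRQs off */

	if (cpuidle_enter())       /* non-zero: cpuidle could not handle it */
		default_idle();    /* fall back to the architecture default */

	if (!irqs_enabled) {       /* kernel: WARN_ON_ONCE(irqs_disabled()) */
		fprintf(stderr, "idle routine left IRQs disabled\n");
		irqs_enabled = true;   /* recover instead of stalling the CPU */
	}
}

int main(void)
{
	idle_once();
	return 0;
}
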
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a95c8c2af2a..ab32b7b0db5c 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -13,7 +13,7 @@ endif
 
 obj-y += core.o proc.o clock.o cputime.o
 obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-y += wait.o completion.o
+obj-y += wait.o completion.o idle.o
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b46131ef6aab..fb9764fbc537 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1745,8 +1745,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1745 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1745 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1746 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1746 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1747 p->numa_work.next = &p->numa_work; 1747 p->numa_work.next = &p->numa_work;
1748 p->numa_faults = NULL; 1748 p->numa_faults_memory = NULL;
1749 p->numa_faults_buffer = NULL; 1749 p->numa_faults_buffer_memory = NULL;
1750 p->last_task_numa_placement = 0;
1751 p->last_sum_exec_runtime = 0;
1750 1752
1751 INIT_LIST_HEAD(&p->numa_entry); 1753 INIT_LIST_HEAD(&p->numa_entry);
1752 p->numa_group = NULL; 1754 p->numa_group = NULL;
@@ -2167,13 +2169,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2167 2169
2168#ifdef CONFIG_SMP 2170#ifdef CONFIG_SMP
2169 2171
2170/* assumes rq->lock is held */
2171static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2172{
2173 if (prev->sched_class->pre_schedule)
2174 prev->sched_class->pre_schedule(rq, prev);
2175}
2176
2177/* rq->lock is NOT held, but preemption is disabled */ 2172/* rq->lock is NOT held, but preemption is disabled */
2178static inline void post_schedule(struct rq *rq) 2173static inline void post_schedule(struct rq *rq)
2179{ 2174{
@@ -2191,10 +2186,6 @@ static inline void post_schedule(struct rq *rq)
2191 2186
2192#else 2187#else
2193 2188
2194static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2195{
2196}
2197
2198static inline void post_schedule(struct rq *rq) 2189static inline void post_schedule(struct rq *rq)
2199{ 2190{
2200} 2191}
@@ -2577,18 +2568,11 @@ static inline void schedule_debug(struct task_struct *prev)
2577 schedstat_inc(this_rq(), sched_count); 2568 schedstat_inc(this_rq(), sched_count);
2578} 2569}
2579 2570
2580static void put_prev_task(struct rq *rq, struct task_struct *prev)
2581{
2582 if (prev->on_rq || rq->skip_clock_update < 0)
2583 update_rq_clock(rq);
2584 prev->sched_class->put_prev_task(rq, prev);
2585}
2586
2587/* 2571/*
2588 * Pick up the highest-prio task: 2572 * Pick up the highest-prio task:
2589 */ 2573 */
2590static inline struct task_struct * 2574static inline struct task_struct *
2591pick_next_task(struct rq *rq) 2575pick_next_task(struct rq *rq, struct task_struct *prev)
2592{ 2576{
2593 const struct sched_class *class; 2577 const struct sched_class *class;
2594 struct task_struct *p; 2578 struct task_struct *p;
@@ -2597,14 +2581,15 @@ pick_next_task(struct rq *rq)
2597 * Optimization: we know that if all tasks are in 2581 * Optimization: we know that if all tasks are in
2598 * the fair class we can call that function directly: 2582 * the fair class we can call that function directly:
2599 */ 2583 */
2600 if (likely(rq->nr_running == rq->cfs.h_nr_running)) { 2584 if (likely(prev->sched_class == &fair_sched_class &&
2601 p = fair_sched_class.pick_next_task(rq); 2585 rq->nr_running == rq->cfs.h_nr_running)) {
2586 p = fair_sched_class.pick_next_task(rq, prev);
2602 if (likely(p)) 2587 if (likely(p))
2603 return p; 2588 return p;
2604 } 2589 }
2605 2590
2606 for_each_class(class) { 2591 for_each_class(class) {
2607 p = class->pick_next_task(rq); 2592 p = class->pick_next_task(rq, prev);
2608 if (p) 2593 if (p)
2609 return p; 2594 return p;
2610 } 2595 }
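
The rewritten pick_next_task() now hands the outgoing task to each scheduling class and keeps the old optimization: when prev was a CFS task and every runnable task on the runqueue belongs to CFS, the fair class is asked directly instead of walking the whole class list. A simplified, self-contained sketch of that dispatch order, with toy types in place of struct rq and the real class table:

#include <stddef.h>
#include <stdio.h>

struct rq { unsigned int nr_running, cfs_nr_running; };
struct task { const char *name; };

typedef struct task *(*pick_fn)(struct rq *rq, struct task *prev);

static struct task fair_task = { "cfs" }, idle_task = { "idle" };

static struct task *pick_fair(struct rq *rq, struct task *prev)
{
	return rq->cfs_nr_running ? &fair_task : NULL;
}
static struct task *pick_rt(struct rq *rq, struct task *prev)   { return NULL; }
static struct task *pick_idle(struct rq *rq, struct task *prev) { return &idle_task; }

/* Class order is highest to lowest priority, as in the kernel's class list. */
static const pick_fn classes[] = { pick_rt, pick_fair, pick_idle };

static struct task *pick_next(struct rq *rq, struct task *prev, int prev_is_fair)
{
	struct task *p;
	size_t i;

	/* Fast path: prev was CFS and every runnable task is CFS. */
	if (prev_is_fair && rq->nr_running == rq->cfs_nr_running) {
		p = pick_fair(rq, prev);
		if (p)
			return p;
	}
	for (i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
		p = classes[i](rq, prev);
		if (p)
			return p;       /* the idle "class" never returns NULL */
	}
	return NULL;
}

int main(void)
{
	struct rq rq = { .nr_running = 2, .cfs_nr_running = 2 };
	printf("picked: %s\n", pick_next(&rq, NULL, 1)->name);
	return 0;
}
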
@@ -2700,13 +2685,10 @@ need_resched:
2700 switch_count = &prev->nvcsw; 2685 switch_count = &prev->nvcsw;
2701 } 2686 }
2702 2687
2703 pre_schedule(rq, prev); 2688 if (prev->on_rq || rq->skip_clock_update < 0)
2704 2689 update_rq_clock(rq);
2705 if (unlikely(!rq->nr_running))
2706 idle_balance(cpu, rq);
2707 2690
2708 put_prev_task(rq, prev); 2691 next = pick_next_task(rq, prev);
2709 next = pick_next_task(rq);
2710 clear_tsk_need_resched(prev); 2692 clear_tsk_need_resched(prev);
2711 clear_preempt_need_resched(); 2693 clear_preempt_need_resched();
2712 rq->skip_clock_update = 0; 2694 rq->skip_clock_update = 0;
@@ -2998,7 +2980,7 @@ void set_user_nice(struct task_struct *p, long nice)
2998 unsigned long flags; 2980 unsigned long flags;
2999 struct rq *rq; 2981 struct rq *rq;
3000 2982
3001 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 2983 if (task_nice(p) == nice || nice < -20 || nice > 19)
3002 return; 2984 return;
3003 /* 2985 /*
3004 * We have to be careful, if called from sys_setpriority(), 2986 * We have to be careful, if called from sys_setpriority(),
@@ -3076,7 +3058,7 @@ SYSCALL_DEFINE1(nice, int, increment)
3076 if (increment > 40) 3058 if (increment > 40)
3077 increment = 40; 3059 increment = 40;
3078 3060
3079 nice = TASK_NICE(current) + increment; 3061 nice = task_nice(current) + increment;
3080 if (nice < -20) 3062 if (nice < -20)
3081 nice = -20; 3063 nice = -20;
3082 if (nice > 19) 3064 if (nice > 19)
@@ -3109,18 +3091,6 @@ int task_prio(const struct task_struct *p)
3109} 3091}
3110 3092
3111/** 3093/**
3112 * task_nice - return the nice value of a given task.
3113 * @p: the task in question.
3114 *
3115 * Return: The nice value [ -20 ... 0 ... 19 ].
3116 */
3117int task_nice(const struct task_struct *p)
3118{
3119 return TASK_NICE(p);
3120}
3121EXPORT_SYMBOL(task_nice);
3122
3123/**
3124 * idle_cpu - is a given cpu idle currently? 3094 * idle_cpu - is a given cpu idle currently?
3125 * @cpu: the processor in question. 3095 * @cpu: the processor in question.
3126 * 3096 *
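
Several of the hunks above replace the TASK_NICE() macro with the task_nice() helper, whose out-of-line, exported copy is removed here because the series turns it into a static inline in a header. The value is derived from the task's static priority; a tiny standalone illustration of that mapping, assuming the usual kernel priority layout (0..99 real-time, 100..139 normal, 120 corresponding to nice 0):

#include <stdio.h>

#define MAX_RT_PRIO   100                  /* priorities 0..99 are real-time */
#define DEFAULT_PRIO  (MAX_RT_PRIO + 20)   /* 120 == nice 0 */

/* nice -20..19 maps onto static priority 100..139 */
static int prio_to_nice(int static_prio)
{
	return static_prio - DEFAULT_PRIO;
}

int main(void)
{
	printf("prio 100 -> nice %d\n", prio_to_nice(100));  /* -20 */
	printf("prio 120 -> nice %d\n", prio_to_nice(120));  /*   0 */
	printf("prio 139 -> nice %d\n", prio_to_nice(139));  /*  19 */
	return 0;
}
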
@@ -3319,7 +3289,7 @@ recheck:
3319 */ 3289 */
3320 if (user && !capable(CAP_SYS_NICE)) { 3290 if (user && !capable(CAP_SYS_NICE)) {
3321 if (fair_policy(policy)) { 3291 if (fair_policy(policy)) {
3322 if (attr->sched_nice < TASK_NICE(p) && 3292 if (attr->sched_nice < task_nice(p) &&
3323 !can_nice(p, attr->sched_nice)) 3293 !can_nice(p, attr->sched_nice))
3324 return -EPERM; 3294 return -EPERM;
3325 } 3295 }
@@ -3343,7 +3313,7 @@ recheck:
3343 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 3313 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
3344 */ 3314 */
3345 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 3315 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
3346 if (!can_nice(p, TASK_NICE(p))) 3316 if (!can_nice(p, task_nice(p)))
3347 return -EPERM; 3317 return -EPERM;
3348 } 3318 }
3349 3319
@@ -3383,7 +3353,7 @@ recheck:
3383 * If not changing anything there's no need to proceed further: 3353 * If not changing anything there's no need to proceed further:
3384 */ 3354 */
3385 if (unlikely(policy == p->policy)) { 3355 if (unlikely(policy == p->policy)) {
3386 if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p)) 3356 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
3387 goto change; 3357 goto change;
3388 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 3358 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
3389 goto change; 3359 goto change;
@@ -3835,7 +3805,7 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3835 else if (task_has_rt_policy(p)) 3805 else if (task_has_rt_policy(p))
3836 attr.sched_priority = p->rt_priority; 3806 attr.sched_priority = p->rt_priority;
3837 else 3807 else
3838 attr.sched_nice = TASK_NICE(p); 3808 attr.sched_nice = task_nice(p);
3839 3809
3840 rcu_read_unlock(); 3810 rcu_read_unlock();
3841 3811
@@ -4751,7 +4721,7 @@ static void migrate_tasks(unsigned int dead_cpu)
4751 if (rq->nr_running == 1) 4721 if (rq->nr_running == 1)
4752 break; 4722 break;
4753 4723
4754 next = pick_next_task(rq); 4724 next = pick_next_task(rq, NULL);
4755 BUG_ON(!next); 4725 BUG_ON(!next);
4756 next->sched_class->put_prev_task(rq, next); 4726 next->sched_class->put_prev_task(rq, next);
4757 4727
@@ -4841,7 +4811,7 @@ set_table_entry(struct ctl_table *entry,
4841static struct ctl_table * 4811static struct ctl_table *
4842sd_alloc_ctl_domain_table(struct sched_domain *sd) 4812sd_alloc_ctl_domain_table(struct sched_domain *sd)
4843{ 4813{
4844 struct ctl_table *table = sd_alloc_ctl_entry(13); 4814 struct ctl_table *table = sd_alloc_ctl_entry(14);
4845 4815
4846 if (table == NULL) 4816 if (table == NULL)
4847 return NULL; 4817 return NULL;
@@ -4869,9 +4839,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
4869 sizeof(int), 0644, proc_dointvec_minmax, false); 4839 sizeof(int), 0644, proc_dointvec_minmax, false);
4870 set_table_entry(&table[10], "flags", &sd->flags, 4840 set_table_entry(&table[10], "flags", &sd->flags,
4871 sizeof(int), 0644, proc_dointvec_minmax, false); 4841 sizeof(int), 0644, proc_dointvec_minmax, false);
4872 set_table_entry(&table[11], "name", sd->name, 4842 set_table_entry(&table[11], "max_newidle_lb_cost",
4843 &sd->max_newidle_lb_cost,
4844 sizeof(long), 0644, proc_doulongvec_minmax, false);
4845 set_table_entry(&table[12], "name", sd->name,
4873 CORENAME_MAX_SIZE, 0444, proc_dostring, false); 4846 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
4874 /* &table[12] is terminator */ 4847 /* &table[13] is terminator */
4875 4848
4876 return table; 4849 return table;
4877} 4850}
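
The table grows from 13 to 14 slots because a new "max_newidle_lb_cost" entry is inserted ahead of "name", and the array always carries one extra zeroed slot as a terminator. A small sketch of that "N entries plus terminator" convention, using a toy entry type instead of struct ctl_table:

#include <stdlib.h>

struct entry { const char *name; void *data; };

/* Allocate n zeroed slots; callers fill n-1 of them and leave the last
 * one untouched so that iteration can stop at the first NULL name. */
static struct entry *alloc_table(size_t n)
{
	return calloc(n, sizeof(struct entry));
}

static size_t count_entries(const struct entry *t)
{
	size_t n = 0;
	while (t[n].name)
		n++;
	return n;
}

int main(void)
{
	struct entry *t = alloc_table(14);      /* 13 entries + 1 terminator */
	static int dummy;
	size_t i, n;

	if (!t)
		return 1;
	for (i = 0; i < 13; i++)
		t[i] = (struct entry){ "some_field", &dummy };
	n = count_entries(t);                   /* 13; t[13] marks the end */
	free(t);
	return n == 13 ? 0 : 1;
}
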
@@ -7008,7 +6981,7 @@ void normalize_rt_tasks(void)
7008 * Renice negative nice level userspace 6981 * Renice negative nice level userspace
7009 * tasks back to 0: 6982 * tasks back to 0:
7010 */ 6983 */
7011 if (TASK_NICE(p) < 0 && p->mm) 6984 if (task_nice(p) < 0 && p->mm)
7012 set_user_nice(p, 0); 6985 set_user_nice(p, 0);
7013 continue; 6986 continue;
7014 } 6987 }
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 99947919e30b..58624a65f124 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -142,7 +142,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
 	p->utimescaled += cputime_scaled;
 	account_group_user_time(p, cputime);
 
-	index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+	index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
 
 	/* Add user time to cpustat. */
 	task_group_account_field(p, index, (__force u64) cputime);
@@ -169,7 +169,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
 	p->gtime += cputime;
 
 	/* Add guest time to cpustat. */
-	if (TASK_NICE(p) > 0) {
+	if (task_nice(p) > 0) {
 		cpustat[CPUTIME_NICE] += (__force u64) cputime;
 		cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
 	} else {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0dd5e0971a07..ed31ef66ab9d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -944,6 +944,8 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
944 resched_task(rq->curr); 944 resched_task(rq->curr);
945} 945}
946 946
947static int pull_dl_task(struct rq *this_rq);
948
947#endif /* CONFIG_SMP */ 949#endif /* CONFIG_SMP */
948 950
949/* 951/*
@@ -990,7 +992,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
990 return rb_entry(left, struct sched_dl_entity, rb_node); 992 return rb_entry(left, struct sched_dl_entity, rb_node);
991} 993}
992 994
993struct task_struct *pick_next_task_dl(struct rq *rq) 995struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
994{ 996{
995 struct sched_dl_entity *dl_se; 997 struct sched_dl_entity *dl_se;
996 struct task_struct *p; 998 struct task_struct *p;
@@ -998,9 +1000,17 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
998 1000
999 dl_rq = &rq->dl; 1001 dl_rq = &rq->dl;
1000 1002
1003#ifdef CONFIG_SMP
1004 if (dl_task(prev))
1005 pull_dl_task(rq);
1006#endif
1007
1001 if (unlikely(!dl_rq->dl_nr_running)) 1008 if (unlikely(!dl_rq->dl_nr_running))
1002 return NULL; 1009 return NULL;
1003 1010
1011 if (prev)
1012 prev->sched_class->put_prev_task(rq, prev);
1013
1004 dl_se = pick_next_dl_entity(rq, dl_rq); 1014 dl_se = pick_next_dl_entity(rq, dl_rq);
1005 BUG_ON(!dl_se); 1015 BUG_ON(!dl_se);
1006 1016
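
With the .pre_schedule hook gone, pick_next_task_dl() itself is now responsible for pulling deadline tasks from other runqueues before deciding whether it has anything to run, and for calling put_prev_task() on the outgoing task only once it knows it will return something. A condensed sketch of that ordering, with toy types and stand-ins for the kernel helpers:

#include <stddef.h>

struct task { int is_dl; };
struct rq   { unsigned int dl_nr_running; struct task *queued; };

static void pull_dl_tasks(struct rq *rq)            { /* may raise dl_nr_running */ }
static void put_prev(struct rq *rq, struct task *p) { /* bookkeeping for prev */ }

static struct task *pick_next_dl(struct rq *rq, struct task *prev)
{
	/* 1. If prev was a deadline task, try to pull more DL work first
	 *    (this used to live in the removed pre_schedule hook). */
	if (prev && prev->is_dl)
		pull_dl_tasks(rq);

	/* 2. Nothing to run in this class? Let the next class try. */
	if (!rq->dl_nr_running)
		return NULL;

	/* 3. Only now retire the outgoing task... */
	if (prev)
		put_prev(rq, prev);

	/* 4. ...and hand back the earliest-deadline task. */
	return rq->queued;
}
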
@@ -1426,13 +1436,6 @@ skip:
1426 return ret; 1436 return ret;
1427} 1437}
1428 1438
1429static void pre_schedule_dl(struct rq *rq, struct task_struct *prev)
1430{
1431 /* Try to pull other tasks here */
1432 if (dl_task(prev))
1433 pull_dl_task(rq);
1434}
1435
1436static void post_schedule_dl(struct rq *rq) 1439static void post_schedule_dl(struct rq *rq)
1437{ 1440{
1438 push_dl_tasks(rq); 1441 push_dl_tasks(rq);
@@ -1560,7 +1563,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1560 if (unlikely(p->dl.dl_throttled)) 1563 if (unlikely(p->dl.dl_throttled))
1561 return; 1564 return;
1562 1565
1563 if (p->on_rq || rq->curr != p) { 1566 if (p->on_rq && rq->curr != p) {
1564#ifdef CONFIG_SMP 1567#ifdef CONFIG_SMP
1565 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) 1568 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
1566 /* Only reschedule if pushing failed */ 1569 /* Only reschedule if pushing failed */
@@ -1625,7 +1628,6 @@ const struct sched_class dl_sched_class = {
1625 .set_cpus_allowed = set_cpus_allowed_dl, 1628 .set_cpus_allowed = set_cpus_allowed_dl,
1626 .rq_online = rq_online_dl, 1629 .rq_online = rq_online_dl,
1627 .rq_offline = rq_offline_dl, 1630 .rq_offline = rq_offline_dl,
1628 .pre_schedule = pre_schedule_dl,
1629 .post_schedule = post_schedule_dl, 1631 .post_schedule = post_schedule_dl,
1630 .task_woken = task_woken_dl, 1632 .task_woken = task_woken_dl,
1631#endif 1633#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index dd52e7ffb10e..f3344c31632a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -321,6 +321,7 @@ do { \
 	P(sched_goidle);
 #ifdef CONFIG_SMP
 	P64(avg_idle);
+	P64(max_idle_balance_cost);
 #endif
 
 	P(ttwu_count);
@@ -533,15 +534,15 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
 		unsigned long nr_faults = -1;
 		int cpu_current, home_node;
 
-		if (p->numa_faults)
-			nr_faults = p->numa_faults[2*node + i];
+		if (p->numa_faults_memory)
+			nr_faults = p->numa_faults_memory[2*node + i];
 
 		cpu_current = !i ? (task_node(p) == node) :
 			(pol && node_isset(node, pol->v.nodes));
 
 		home_node = (p->numa_preferred_nid == node);
 
-		SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n",
+		SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n",
 			i, node, cpu_current, home_node, nr_faults);
 	}
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 966cc2bfcb77..235cfa7ad8fc 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -322,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
322 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 322 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
323 323
324/* Do the two (enqueued) entities belong to the same group ? */ 324/* Do the two (enqueued) entities belong to the same group ? */
325static inline int 325static inline struct cfs_rq *
326is_same_group(struct sched_entity *se, struct sched_entity *pse) 326is_same_group(struct sched_entity *se, struct sched_entity *pse)
327{ 327{
328 if (se->cfs_rq == pse->cfs_rq) 328 if (se->cfs_rq == pse->cfs_rq)
329 return 1; 329 return se->cfs_rq;
330 330
331 return 0; 331 return NULL;
332} 332}
333 333
334static inline struct sched_entity *parent_entity(struct sched_entity *se) 334static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -336,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
336 return se->parent; 336 return se->parent;
337} 337}
338 338
339/* return depth at which a sched entity is present in the hierarchy */
340static inline int depth_se(struct sched_entity *se)
341{
342 int depth = 0;
343
344 for_each_sched_entity(se)
345 depth++;
346
347 return depth;
348}
349
350static void 339static void
351find_matching_se(struct sched_entity **se, struct sched_entity **pse) 340find_matching_se(struct sched_entity **se, struct sched_entity **pse)
352{ 341{
@@ -360,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
360 */ 349 */
361 350
362 /* First walk up until both entities are at same depth */ 351 /* First walk up until both entities are at same depth */
363 se_depth = depth_se(*se); 352 se_depth = (*se)->depth;
364 pse_depth = depth_se(*pse); 353 pse_depth = (*pse)->depth;
365 354
366 while (se_depth > pse_depth) { 355 while (se_depth > pse_depth) {
367 se_depth--; 356 se_depth--;
@@ -426,12 +415,6 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
426#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 415#define for_each_leaf_cfs_rq(rq, cfs_rq) \
427 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 416 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
428 417
429static inline int
430is_same_group(struct sched_entity *se, struct sched_entity *pse)
431{
432 return 1;
433}
434
435static inline struct sched_entity *parent_entity(struct sched_entity *se) 418static inline struct sched_entity *parent_entity(struct sched_entity *se)
436{ 419{
437 return NULL; 420 return NULL;
@@ -819,14 +802,6 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
819/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ 802/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
820unsigned int sysctl_numa_balancing_scan_delay = 1000; 803unsigned int sysctl_numa_balancing_scan_delay = 1000;
821 804
822/*
823 * After skipping a page migration on a shared page, skip N more numa page
824 * migrations unconditionally. This reduces the number of NUMA migrations
825 * in shared memory workloads, and has the effect of pulling tasks towards
826 * where their memory lives, over pulling the memory towards the task.
827 */
828unsigned int sysctl_numa_balancing_migrate_deferred = 16;
829
830static unsigned int task_nr_scan_windows(struct task_struct *p) 805static unsigned int task_nr_scan_windows(struct task_struct *p)
831{ 806{
832 unsigned long rss = 0; 807 unsigned long rss = 0;
@@ -893,10 +868,26 @@ struct numa_group {
893 struct list_head task_list; 868 struct list_head task_list;
894 869
895 struct rcu_head rcu; 870 struct rcu_head rcu;
871 nodemask_t active_nodes;
896 unsigned long total_faults; 872 unsigned long total_faults;
873 /*
874 * Faults_cpu is used to decide whether memory should move
875 * towards the CPU. As a consequence, these stats are weighted
876 * more by CPU use than by memory faults.
877 */
878 unsigned long *faults_cpu;
897 unsigned long faults[0]; 879 unsigned long faults[0];
898}; 880};
899 881
882/* Shared or private faults. */
883#define NR_NUMA_HINT_FAULT_TYPES 2
884
885/* Memory and CPU locality */
886#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
887
888/* Averaged statistics, and temporary buffers. */
889#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
890
900pid_t task_numa_group_id(struct task_struct *p) 891pid_t task_numa_group_id(struct task_struct *p)
901{ 892{
902 return p->numa_group ? p->numa_group->gid : 0; 893 return p->numa_group ? p->numa_group->gid : 0;
@@ -904,16 +895,16 @@ pid_t task_numa_group_id(struct task_struct *p)
904 895
905static inline int task_faults_idx(int nid, int priv) 896static inline int task_faults_idx(int nid, int priv)
906{ 897{
907 return 2 * nid + priv; 898 return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
908} 899}
909 900
910static inline unsigned long task_faults(struct task_struct *p, int nid) 901static inline unsigned long task_faults(struct task_struct *p, int nid)
911{ 902{
912 if (!p->numa_faults) 903 if (!p->numa_faults_memory)
913 return 0; 904 return 0;
914 905
915 return p->numa_faults[task_faults_idx(nid, 0)] + 906 return p->numa_faults_memory[task_faults_idx(nid, 0)] +
916 p->numa_faults[task_faults_idx(nid, 1)]; 907 p->numa_faults_memory[task_faults_idx(nid, 1)];
917} 908}
918 909
919static inline unsigned long group_faults(struct task_struct *p, int nid) 910static inline unsigned long group_faults(struct task_struct *p, int nid)
@@ -925,6 +916,12 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
925 p->numa_group->faults[task_faults_idx(nid, 1)]; 916 p->numa_group->faults[task_faults_idx(nid, 1)];
926} 917}
927 918
919static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
920{
921 return group->faults_cpu[task_faults_idx(nid, 0)] +
922 group->faults_cpu[task_faults_idx(nid, 1)];
923}
924
928/* 925/*
929 * These return the fraction of accesses done by a particular task, or 926 * These return the fraction of accesses done by a particular task, or
930 * task group, on a particular numa node. The group weight is given a 927 * task group, on a particular numa node. The group weight is given a
@@ -935,7 +932,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
935{ 932{
936 unsigned long total_faults; 933 unsigned long total_faults;
937 934
938 if (!p->numa_faults) 935 if (!p->numa_faults_memory)
939 return 0; 936 return 0;
940 937
941 total_faults = p->total_numa_faults; 938 total_faults = p->total_numa_faults;
@@ -954,6 +951,69 @@ static inline unsigned long group_weight(struct task_struct *p, int nid)
954 return 1000 * group_faults(p, nid) / p->numa_group->total_faults; 951 return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
955} 952}
956 953
954bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
955 int src_nid, int dst_cpu)
956{
957 struct numa_group *ng = p->numa_group;
958 int dst_nid = cpu_to_node(dst_cpu);
959 int last_cpupid, this_cpupid;
960
961 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
962
963 /*
964 * Multi-stage node selection is used in conjunction with a periodic
965 * migration fault to build a temporal task<->page relation. By using
966 * a two-stage filter we remove short/unlikely relations.
967 *
968 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
969 * a task's usage of a particular page (n_p) per total usage of this
970 * page (n_t) (in a given time-span) to a probability.
971 *
972 * Our periodic faults will sample this probability and getting the
973 * same result twice in a row, given these samples are fully
974 * independent, is then given by P(n)^2, provided our sample period
975 * is sufficiently short compared to the usage pattern.
976 *
977 * This quadric squishes small probabilities, making it less likely we
978 * act on an unlikely task<->page relation.
979 */
980 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
981 if (!cpupid_pid_unset(last_cpupid) &&
982 cpupid_to_nid(last_cpupid) != dst_nid)
983 return false;
984
985 /* Always allow migrate on private faults */
986 if (cpupid_match_pid(p, last_cpupid))
987 return true;
988
989 /* A shared fault, but p->numa_group has not been set up yet. */
990 if (!ng)
991 return true;
992
993 /*
994 * Do not migrate if the destination is not a node that
995 * is actively used by this numa group.
996 */
997 if (!node_isset(dst_nid, ng->active_nodes))
998 return false;
999
1000 /*
1001 * Source is a node that is not actively used by this
1002 * numa group, while the destination is. Migrate.
1003 */
1004 if (!node_isset(src_nid, ng->active_nodes))
1005 return true;
1006
1007 /*
1008 * Both source and destination are nodes in active
1009 * use by this numa group. Maximize memory bandwidth
1010 * by migrating from more heavily used groups, to less
1011 * heavily used ones, spreading the load around.
1012 * Use a 1/4 hysteresis to avoid spurious page movement.
1013 */
1014 return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
1015}
1016
957static unsigned long weighted_cpuload(const int cpu); 1017static unsigned long weighted_cpuload(const int cpu);
958static unsigned long source_load(int cpu, int type); 1018static unsigned long source_load(int cpu, int type);
959static unsigned long target_load(int cpu, int type); 1019static unsigned long target_load(int cpu, int type);
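
The comment block in should_numa_migrate_memory() describes a two-stage filter: a page is only considered for migration toward the faulting CPU's node once two consecutive NUMA hinting faults on it came from that same node, which squashes short-lived task<->page relations (roughly P(n)^2 per the comment). A toy model of just that first filtering step, using a per-page "last faulting node" field in place of the kernel's packed cpupid value (the later group and active-node checks are omitted):

#include <stdbool.h>
#include <stdio.h>

struct page { int last_nid; };          /* stand-in for the packed cpupid field */

#define NID_UNSET (-1)

/* Returns true if this fault may trigger a migration toward dst_nid. */
static bool should_migrate(struct page *pg, int dst_nid)
{
	int last = pg->last_nid;

	pg->last_nid = dst_nid;         /* record this fault for next time */

	/* Only migrate if the previous fault already came from the same
	 * destination node (or no history has been recorded yet). */
	return last == NID_UNSET || last == dst_nid;
}

int main(void)
{
	struct page pg = { NID_UNSET };

	printf("%d\n", should_migrate(&pg, 1));   /* 1: no history yet */
	printf("%d\n", should_migrate(&pg, 0));   /* 0: last fault was nid 1 */
	printf("%d\n", should_migrate(&pg, 0));   /* 1: two faults from nid 0 */
	return 0;
}
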
@@ -1267,7 +1327,7 @@ static int task_numa_migrate(struct task_struct *p)
1267static void numa_migrate_preferred(struct task_struct *p) 1327static void numa_migrate_preferred(struct task_struct *p)
1268{ 1328{
1269 /* This task has no NUMA fault statistics yet */ 1329 /* This task has no NUMA fault statistics yet */
1270 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) 1330 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
1271 return; 1331 return;
1272 1332
1273 /* Periodically retry migrating the task to the preferred node */ 1333 /* Periodically retry migrating the task to the preferred node */
@@ -1282,6 +1342,38 @@ static void numa_migrate_preferred(struct task_struct *p)
1282} 1342}
1283 1343
1284/* 1344/*
1345 * Find the nodes on which the workload is actively running. We do this by
1346 * tracking the nodes from which NUMA hinting faults are triggered. This can
1347 * be different from the set of nodes where the workload's memory is currently
1348 * located.
1349 *
1350 * The bitmask is used to make smarter decisions on when to do NUMA page
1351 * migrations, To prevent flip-flopping, and excessive page migrations, nodes
1352 * are added when they cause over 6/16 of the maximum number of faults, but
1353 * only removed when they drop below 3/16.
1354 */
1355static void update_numa_active_node_mask(struct numa_group *numa_group)
1356{
1357 unsigned long faults, max_faults = 0;
1358 int nid;
1359
1360 for_each_online_node(nid) {
1361 faults = group_faults_cpu(numa_group, nid);
1362 if (faults > max_faults)
1363 max_faults = faults;
1364 }
1365
1366 for_each_online_node(nid) {
1367 faults = group_faults_cpu(numa_group, nid);
1368 if (!node_isset(nid, numa_group->active_nodes)) {
1369 if (faults > max_faults * 6 / 16)
1370 node_set(nid, numa_group->active_nodes);
1371 } else if (faults < max_faults * 3 / 16)
1372 node_clear(nid, numa_group->active_nodes);
1373 }
1374}
1375
1376/*
1285 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS 1377 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1286 * increments. The more local the fault statistics are, the higher the scan 1378 * increments. The more local the fault statistics are, the higher the scan
1287 * period will be for the next scan window. If local/remote ratio is below 1379 * period will be for the next scan window. If local/remote ratio is below
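
update_numa_active_node_mask() builds the group's set of actively used nodes with hysteresis: a node joins the mask once its CPU-fault count exceeds 6/16 of the busiest node's, and only leaves again when it drops below 3/16, which keeps the mask from flapping. The same thresholding in a small standalone form, with plain arrays instead of nodemask_t:

#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 4

static void update_active_nodes(const unsigned long faults[NR_NODES],
				bool active[NR_NODES])
{
	unsigned long max_faults = 0;
	int nid;

	for (nid = 0; nid < NR_NODES; nid++)
		if (faults[nid] > max_faults)
			max_faults = faults[nid];

	for (nid = 0; nid < NR_NODES; nid++) {
		if (!active[nid]) {
			if (faults[nid] > max_faults * 6 / 16)
				active[nid] = true;     /* join above 6/16 */
		} else if (faults[nid] < max_faults * 3 / 16) {
			active[nid] = false;            /* leave below 3/16 */
		}
	}
}

int main(void)
{
	unsigned long faults[NR_NODES] = { 1600, 700, 200, 0 };
	bool active[NR_NODES] = { false };
	int nid;

	update_active_nodes(faults, active);
	for (nid = 0; nid < NR_NODES; nid++)
		printf("node %d: %s\n", nid, active[nid] ? "active" : "idle");
	return 0;
}
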
@@ -1355,11 +1447,41 @@ static void update_task_scan_period(struct task_struct *p,
1355 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 1447 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1356} 1448}
1357 1449
1450/*
1451 * Get the fraction of time the task has been running since the last
1452 * NUMA placement cycle. The scheduler keeps similar statistics, but
1453 * decays those on a 32ms period, which is orders of magnitude off
1454 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1455 * stats only if the task is so new there are no NUMA statistics yet.
1456 */
1457static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1458{
1459 u64 runtime, delta, now;
1460 /* Use the start of this time slice to avoid calculations. */
1461 now = p->se.exec_start;
1462 runtime = p->se.sum_exec_runtime;
1463
1464 if (p->last_task_numa_placement) {
1465 delta = runtime - p->last_sum_exec_runtime;
1466 *period = now - p->last_task_numa_placement;
1467 } else {
1468 delta = p->se.avg.runnable_avg_sum;
1469 *period = p->se.avg.runnable_avg_period;
1470 }
1471
1472 p->last_sum_exec_runtime = runtime;
1473 p->last_task_numa_placement = now;
1474
1475 return delta;
1476}
1477
1358static void task_numa_placement(struct task_struct *p) 1478static void task_numa_placement(struct task_struct *p)
1359{ 1479{
1360 int seq, nid, max_nid = -1, max_group_nid = -1; 1480 int seq, nid, max_nid = -1, max_group_nid = -1;
1361 unsigned long max_faults = 0, max_group_faults = 0; 1481 unsigned long max_faults = 0, max_group_faults = 0;
1362 unsigned long fault_types[2] = { 0, 0 }; 1482 unsigned long fault_types[2] = { 0, 0 };
1483 unsigned long total_faults;
1484 u64 runtime, period;
1363 spinlock_t *group_lock = NULL; 1485 spinlock_t *group_lock = NULL;
1364 1486
1365 seq = ACCESS_ONCE(p->mm->numa_scan_seq); 1487 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
@@ -1368,6 +1490,10 @@ static void task_numa_placement(struct task_struct *p)
1368 p->numa_scan_seq = seq; 1490 p->numa_scan_seq = seq;
1369 p->numa_scan_period_max = task_scan_max(p); 1491 p->numa_scan_period_max = task_scan_max(p);
1370 1492
1493 total_faults = p->numa_faults_locality[0] +
1494 p->numa_faults_locality[1];
1495 runtime = numa_get_avg_runtime(p, &period);
1496
1371 /* If the task is part of a group prevent parallel updates to group stats */ 1497 /* If the task is part of a group prevent parallel updates to group stats */
1372 if (p->numa_group) { 1498 if (p->numa_group) {
1373 group_lock = &p->numa_group->lock; 1499 group_lock = &p->numa_group->lock;
@@ -1379,24 +1505,37 @@ static void task_numa_placement(struct task_struct *p)
1379 unsigned long faults = 0, group_faults = 0; 1505 unsigned long faults = 0, group_faults = 0;
1380 int priv, i; 1506 int priv, i;
1381 1507
1382 for (priv = 0; priv < 2; priv++) { 1508 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
1383 long diff; 1509 long diff, f_diff, f_weight;
1384 1510
1385 i = task_faults_idx(nid, priv); 1511 i = task_faults_idx(nid, priv);
1386 diff = -p->numa_faults[i];
1387 1512
1388 /* Decay existing window, copy faults since last scan */ 1513 /* Decay existing window, copy faults since last scan */
1389 p->numa_faults[i] >>= 1; 1514 diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
1390 p->numa_faults[i] += p->numa_faults_buffer[i]; 1515 fault_types[priv] += p->numa_faults_buffer_memory[i];
1391 fault_types[priv] += p->numa_faults_buffer[i]; 1516 p->numa_faults_buffer_memory[i] = 0;
1392 p->numa_faults_buffer[i] = 0;
1393 1517
1394 faults += p->numa_faults[i]; 1518 /*
1395 diff += p->numa_faults[i]; 1519 * Normalize the faults_from, so all tasks in a group
1520 * count according to CPU use, instead of by the raw
1521 * number of faults. Tasks with little runtime have
1522 * little over-all impact on throughput, and thus their
1523 * faults are less important.
1524 */
1525 f_weight = div64_u64(runtime << 16, period + 1);
1526 f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
1527 (total_faults + 1);
1528 f_diff = f_weight - p->numa_faults_cpu[i] / 2;
1529 p->numa_faults_buffer_cpu[i] = 0;
1530
1531 p->numa_faults_memory[i] += diff;
1532 p->numa_faults_cpu[i] += f_diff;
1533 faults += p->numa_faults_memory[i];
1396 p->total_numa_faults += diff; 1534 p->total_numa_faults += diff;
1397 if (p->numa_group) { 1535 if (p->numa_group) {
1398 /* safe because we can only change our own group */ 1536 /* safe because we can only change our own group */
1399 p->numa_group->faults[i] += diff; 1537 p->numa_group->faults[i] += diff;
1538 p->numa_group->faults_cpu[i] += f_diff;
1400 p->numa_group->total_faults += diff; 1539 p->numa_group->total_faults += diff;
1401 group_faults += p->numa_group->faults[i]; 1540 group_faults += p->numa_group->faults[i];
1402 } 1541 }
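
Each scan window folds the freshly collected per-node counters into long-running averages by halving the old value and adding the new one (diff = buffer - old/2), and the CPU-side counters are additionally scaled by how much CPU time the task actually used over the placement period, so mostly idle tasks do not dominate the group statistics. A compact sketch of both update rules; the variable names are made up and only mirror the numa_faults_* arrays:

#include <stdint.h>
#include <stdio.h>

/* One bucket of the memory-fault average: halve the old value, add the
 * counts collected since the last placement pass. */
static long decay_mem(unsigned long *avg, unsigned long *buf)
{
	long diff = (long)*buf - (long)(*avg / 2);

	*avg += diff;           /* equivalent to: avg = avg/2 + buf */
	*buf  = 0;
	return diff;            /* also folded into the group totals */
}

/* CPU-fault buckets are first normalized by the task's recent CPU use
 * (runtime/period in 16.16 fixed point) and by its total fault count,
 * so they live in a weighted domain rather than raw fault counts. */
static long decay_cpu(unsigned long *avg, unsigned long *buf,
		      uint64_t runtime, uint64_t period,
		      unsigned long total_faults)
{
	uint64_t w = (runtime << 16) / (period + 1);
	long f_weight = (long)((w * *buf) / (total_faults + 1));
	long f_diff = f_weight - (long)(*avg / 2);

	*avg += f_diff;
	*buf  = 0;
	return f_diff;
}

int main(void)
{
	unsigned long mem_avg = 100, mem_buf = 40;
	unsigned long cpu_avg = 0, cpu_buf = 40;

	decay_mem(&mem_avg, &mem_buf);                /* 100 -> 100/2 + 40 = 90 */
	decay_cpu(&cpu_avg, &cpu_buf, 50, 100, 80);   /* ran ~50% of the period */
	printf("mem %lu cpu %lu\n", mem_avg, cpu_avg);
	return 0;
}
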
@@ -1416,6 +1555,7 @@ static void task_numa_placement(struct task_struct *p)
1416 update_task_scan_period(p, fault_types[0], fault_types[1]); 1555 update_task_scan_period(p, fault_types[0], fault_types[1]);
1417 1556
1418 if (p->numa_group) { 1557 if (p->numa_group) {
1558 update_numa_active_node_mask(p->numa_group);
1419 /* 1559 /*
1420 * If the preferred task and group nids are different, 1560 * If the preferred task and group nids are different,
1421 * iterate over the nodes again to find the best place. 1561 * iterate over the nodes again to find the best place.
@@ -1465,7 +1605,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1465 1605
1466 if (unlikely(!p->numa_group)) { 1606 if (unlikely(!p->numa_group)) {
1467 unsigned int size = sizeof(struct numa_group) + 1607 unsigned int size = sizeof(struct numa_group) +
1468 2*nr_node_ids*sizeof(unsigned long); 1608 4*nr_node_ids*sizeof(unsigned long);
1469 1609
1470 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); 1610 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1471 if (!grp) 1611 if (!grp)
@@ -1475,9 +1615,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1475 spin_lock_init(&grp->lock); 1615 spin_lock_init(&grp->lock);
1476 INIT_LIST_HEAD(&grp->task_list); 1616 INIT_LIST_HEAD(&grp->task_list);
1477 grp->gid = p->pid; 1617 grp->gid = p->pid;
1618 /* Second half of the array tracks nids where faults happen */
1619 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
1620 nr_node_ids;
1621
1622 node_set(task_node(current), grp->active_nodes);
1478 1623
1479 for (i = 0; i < 2*nr_node_ids; i++) 1624 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1480 grp->faults[i] = p->numa_faults[i]; 1625 grp->faults[i] = p->numa_faults_memory[i];
1481 1626
1482 grp->total_faults = p->total_numa_faults; 1627 grp->total_faults = p->total_numa_faults;
1483 1628
@@ -1534,9 +1679,9 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1534 1679
1535 double_lock(&my_grp->lock, &grp->lock); 1680 double_lock(&my_grp->lock, &grp->lock);
1536 1681
1537 for (i = 0; i < 2*nr_node_ids; i++) { 1682 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
1538 my_grp->faults[i] -= p->numa_faults[i]; 1683 my_grp->faults[i] -= p->numa_faults_memory[i];
1539 grp->faults[i] += p->numa_faults[i]; 1684 grp->faults[i] += p->numa_faults_memory[i];
1540 } 1685 }
1541 my_grp->total_faults -= p->total_numa_faults; 1686 my_grp->total_faults -= p->total_numa_faults;
1542 grp->total_faults += p->total_numa_faults; 1687 grp->total_faults += p->total_numa_faults;
@@ -1562,12 +1707,12 @@ void task_numa_free(struct task_struct *p)
1562{ 1707{
1563 struct numa_group *grp = p->numa_group; 1708 struct numa_group *grp = p->numa_group;
1564 int i; 1709 int i;
1565 void *numa_faults = p->numa_faults; 1710 void *numa_faults = p->numa_faults_memory;
1566 1711
1567 if (grp) { 1712 if (grp) {
1568 spin_lock(&grp->lock); 1713 spin_lock(&grp->lock);
1569 for (i = 0; i < 2*nr_node_ids; i++) 1714 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1570 grp->faults[i] -= p->numa_faults[i]; 1715 grp->faults[i] -= p->numa_faults_memory[i];
1571 grp->total_faults -= p->total_numa_faults; 1716 grp->total_faults -= p->total_numa_faults;
1572 1717
1573 list_del(&p->numa_entry); 1718 list_del(&p->numa_entry);
@@ -1577,18 +1722,21 @@ void task_numa_free(struct task_struct *p)
1577 put_numa_group(grp); 1722 put_numa_group(grp);
1578 } 1723 }
1579 1724
1580 p->numa_faults = NULL; 1725 p->numa_faults_memory = NULL;
1581 p->numa_faults_buffer = NULL; 1726 p->numa_faults_buffer_memory = NULL;
1727 p->numa_faults_cpu= NULL;
1728 p->numa_faults_buffer_cpu = NULL;
1582 kfree(numa_faults); 1729 kfree(numa_faults);
1583} 1730}
1584 1731
1585/* 1732/*
1586 * Got a PROT_NONE fault for a page on @node. 1733 * Got a PROT_NONE fault for a page on @node.
1587 */ 1734 */
1588void task_numa_fault(int last_cpupid, int node, int pages, int flags) 1735void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1589{ 1736{
1590 struct task_struct *p = current; 1737 struct task_struct *p = current;
1591 bool migrated = flags & TNF_MIGRATED; 1738 bool migrated = flags & TNF_MIGRATED;
1739 int cpu_node = task_node(current);
1592 int priv; 1740 int priv;
1593 1741
1594 if (!numabalancing_enabled) 1742 if (!numabalancing_enabled)
@@ -1603,16 +1751,24 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
1603 return; 1751 return;
1604 1752
1605 /* Allocate buffer to track faults on a per-node basis */ 1753 /* Allocate buffer to track faults on a per-node basis */
1606 if (unlikely(!p->numa_faults)) { 1754 if (unlikely(!p->numa_faults_memory)) {
1607 int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; 1755 int size = sizeof(*p->numa_faults_memory) *
1756 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
1608 1757
1609 /* numa_faults and numa_faults_buffer share the allocation */ 1758 p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
1610 p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); 1759 if (!p->numa_faults_memory)
1611 if (!p->numa_faults)
1612 return; 1760 return;
1613 1761
1614 BUG_ON(p->numa_faults_buffer); 1762 BUG_ON(p->numa_faults_buffer_memory);
1615 p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); 1763 /*
1764 * The averaged statistics, shared & private, memory & cpu,
1765 * occupy the first half of the array. The second half of the
1766 * array is for current counters, which are averaged into the
1767 * first set by task_numa_placement.
1768 */
1769 p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
1770 p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
1771 p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
1616 p->total_numa_faults = 0; 1772 p->total_numa_faults = 0;
1617 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 1773 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1618 } 1774 }
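
The allocation comment above is worth spelling out: a single kzalloc'd buffer of NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids entries is carved into four equal regions, the decayed memory-fault and CPU-fault averages in the first half and the per-scan buffers for each in the second half, with the individual pointers simply offset into it. A userspace sketch of that carving; the field names mirror the task_struct members but the struct itself is only an illustration:

#include <stdio.h>
#include <stdlib.h>

#define FAULT_TYPES     2                   /* shared, private      */
#define FAULT_STATS     (FAULT_TYPES * 2)   /* memory + cpu         */
#define FAULT_BUCKETS   (FAULT_STATS * 2)   /* averages + buffers   */

struct numa_stats {
	unsigned long *faults_memory;        /* decayed averages        */
	unsigned long *faults_cpu;
	unsigned long *faults_buffer_memory; /* counters for this scan  */
	unsigned long *faults_buffer_cpu;
};

static int alloc_faults(struct numa_stats *s, int nr_node_ids)
{
	unsigned long *buf = calloc(FAULT_BUCKETS * nr_node_ids, sizeof(*buf));

	if (!buf)
		return -1;
	s->faults_memory        = buf;
	s->faults_cpu           = buf + 2 * nr_node_ids;
	s->faults_buffer_memory = buf + 4 * nr_node_ids;
	s->faults_buffer_cpu    = buf + 6 * nr_node_ids;
	return 0;
}

int main(void)
{
	struct numa_stats s;

	if (alloc_faults(&s, 4))
		return 1;
	/* freeing faults_memory releases all four regions at once */
	free(s.faults_memory);
	return 0;
}
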
@@ -1641,7 +1797,8 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
1641 if (migrated) 1797 if (migrated)
1642 p->numa_pages_migrated += pages; 1798 p->numa_pages_migrated += pages;
1643 1799
1644 p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; 1800 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
1801 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
1645 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; 1802 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
1646} 1803}
1647 1804
@@ -2414,7 +2571,8 @@ void idle_exit_fair(struct rq *this_rq)
2414 update_rq_runnable_avg(this_rq, 0); 2571 update_rq_runnable_avg(this_rq, 0);
2415} 2572}
2416 2573
2417#else 2574#else /* CONFIG_SMP */
2575
2418static inline void update_entity_load_avg(struct sched_entity *se, 2576static inline void update_entity_load_avg(struct sched_entity *se,
2419 int update_cfs_rq) {} 2577 int update_cfs_rq) {}
2420static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} 2578static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
@@ -2426,7 +2584,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2426 int sleep) {} 2584 int sleep) {}
2427static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, 2585static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
2428 int force_update) {} 2586 int force_update) {}
2429#endif 2587#endif /* CONFIG_SMP */
2430 2588
2431static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 2589static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
2432{ 2590{
@@ -2576,10 +2734,10 @@ static void __clear_buddies_last(struct sched_entity *se)
2576{ 2734{
2577 for_each_sched_entity(se) { 2735 for_each_sched_entity(se) {
2578 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2736 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2579 if (cfs_rq->last == se) 2737 if (cfs_rq->last != se)
2580 cfs_rq->last = NULL;
2581 else
2582 break; 2738 break;
2739
2740 cfs_rq->last = NULL;
2583 } 2741 }
2584} 2742}
2585 2743
@@ -2587,10 +2745,10 @@ static void __clear_buddies_next(struct sched_entity *se)
2587{ 2745{
2588 for_each_sched_entity(se) { 2746 for_each_sched_entity(se) {
2589 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2747 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2590 if (cfs_rq->next == se) 2748 if (cfs_rq->next != se)
2591 cfs_rq->next = NULL;
2592 else
2593 break; 2749 break;
2750
2751 cfs_rq->next = NULL;
2594 } 2752 }
2595} 2753}
2596 2754
@@ -2598,10 +2756,10 @@ static void __clear_buddies_skip(struct sched_entity *se)
2598{ 2756{
2599 for_each_sched_entity(se) { 2757 for_each_sched_entity(se) {
2600 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2758 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2601 if (cfs_rq->skip == se) 2759 if (cfs_rq->skip != se)
2602 cfs_rq->skip = NULL;
2603 else
2604 break; 2760 break;
2761
2762 cfs_rq->skip = NULL;
2605 } 2763 }
2606} 2764}
2607 2765
@@ -2744,17 +2902,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
2744 * 3) pick the "last" process, for cache locality 2902 * 3) pick the "last" process, for cache locality
2745 * 4) do not run the "skip" process, if something else is available 2903 * 4) do not run the "skip" process, if something else is available
2746 */ 2904 */
2747static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 2905static struct sched_entity *
2906pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2748{ 2907{
2749 struct sched_entity *se = __pick_first_entity(cfs_rq); 2908 struct sched_entity *left = __pick_first_entity(cfs_rq);
2750 struct sched_entity *left = se; 2909 struct sched_entity *se;
2910
2911 /*
2912 * If curr is set we have to see if its left of the leftmost entity
2913 * still in the tree, provided there was anything in the tree at all.
2914 */
2915 if (!left || (curr && entity_before(curr, left)))
2916 left = curr;
2917
2918 se = left; /* ideally we run the leftmost entity */
2751 2919
2752 /* 2920 /*
2753 * Avoid running the skip buddy, if running something else can 2921 * Avoid running the skip buddy, if running something else can
2754 * be done without getting too unfair. 2922 * be done without getting too unfair.
2755 */ 2923 */
2756 if (cfs_rq->skip == se) { 2924 if (cfs_rq->skip == se) {
2757 struct sched_entity *second = __pick_next_entity(se); 2925 struct sched_entity *second;
2926
2927 if (se == curr) {
2928 second = __pick_first_entity(cfs_rq);
2929 } else {
2930 second = __pick_next_entity(se);
2931 if (!second || (curr && entity_before(curr, second)))
2932 second = curr;
2933 }
2934
2758 if (second && wakeup_preempt_entity(second, left) < 1) 2935 if (second && wakeup_preempt_entity(second, left) < 1)
2759 se = second; 2936 se = second;
2760 } 2937 }
@@ -2776,7 +2953,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
2776 return se; 2953 return se;
2777} 2954}
2778 2955
2779static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); 2956static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
2780 2957
2781static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) 2958static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
2782{ 2959{
@@ -3431,22 +3608,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
3431} 3608}
3432 3609
3433/* conditionally throttle active cfs_rq's from put_prev_entity() */ 3610/* conditionally throttle active cfs_rq's from put_prev_entity() */
3434static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) 3611static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3435{ 3612{
3436 if (!cfs_bandwidth_used()) 3613 if (!cfs_bandwidth_used())
3437 return; 3614 return false;
3438 3615
3439 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) 3616 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
3440 return; 3617 return false;
3441 3618
3442 /* 3619 /*
3443 * it's possible for a throttled entity to be forced into a running 3620 * it's possible for a throttled entity to be forced into a running
3444 * state (e.g. set_curr_task), in this case we're finished. 3621 * state (e.g. set_curr_task), in this case we're finished.
3445 */ 3622 */
3446 if (cfs_rq_throttled(cfs_rq)) 3623 if (cfs_rq_throttled(cfs_rq))
3447 return; 3624 return true;
3448 3625
3449 throttle_cfs_rq(cfs_rq); 3626 throttle_cfs_rq(cfs_rq);
3627 return true;
3450} 3628}
3451 3629
3452static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) 3630static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -3556,7 +3734,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3556} 3734}
3557 3735
3558static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} 3736static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
3559static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3737static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
3560static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 3738static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
3561static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3739static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3562 3740
@@ -4492,26 +4670,125 @@ preempt:
4492 set_last_buddy(se); 4670 set_last_buddy(se);
4493} 4671}
4494 4672
4495static struct task_struct *pick_next_task_fair(struct rq *rq) 4673static struct task_struct *
4674pick_next_task_fair(struct rq *rq, struct task_struct *prev)
4496{ 4675{
4497 struct task_struct *p;
4498 struct cfs_rq *cfs_rq = &rq->cfs; 4676 struct cfs_rq *cfs_rq = &rq->cfs;
4499 struct sched_entity *se; 4677 struct sched_entity *se;
4678 struct task_struct *p;
4500 4679
4680again: __maybe_unused
4681#ifdef CONFIG_FAIR_GROUP_SCHED
4501 if (!cfs_rq->nr_running) 4682 if (!cfs_rq->nr_running)
4502 return NULL; 4683 goto idle;
4684
4685 if (!prev || prev->sched_class != &fair_sched_class)
4686 goto simple;
4687
4688 /*
4689 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
4690 * likely that a next task is from the same cgroup as the current.
4691 *
4692 * Therefore attempt to avoid putting and setting the entire cgroup
4693 * hierarchy, only change the part that actually changes.
4694 */
4503 4695
4504 do { 4696 do {
4505 se = pick_next_entity(cfs_rq); 4697 struct sched_entity *curr = cfs_rq->curr;
4698
4699 /*
4700 * Since we got here without doing put_prev_entity() we also
4701 * have to consider cfs_rq->curr. If it is still a runnable
4702 * entity, update_curr() will update its vruntime, otherwise
4703 * forget we've ever seen it.
4704 */
4705 if (curr && curr->on_rq)
4706 update_curr(cfs_rq);
4707 else
4708 curr = NULL;
4709
4710 /*
4711 * This call to check_cfs_rq_runtime() will do the throttle and
4712 * dequeue its entity in the parent(s). Therefore the 'simple'
4713 * nr_running test will indeed be correct.
4714 */
4715 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
4716 goto simple;
4717
4718 se = pick_next_entity(cfs_rq, curr);
4719 cfs_rq = group_cfs_rq(se);
4720 } while (cfs_rq);
4721
4722 p = task_of(se);
4723
4724 /*
4725 * Since we haven't yet done put_prev_entity and if the selected task
4726 * is a different task than we started out with, try and touch the
4727 * least amount of cfs_rqs.
4728 */
4729 if (prev != p) {
4730 struct sched_entity *pse = &prev->se;
4731
4732 while (!(cfs_rq = is_same_group(se, pse))) {
4733 int se_depth = se->depth;
4734 int pse_depth = pse->depth;
4735
4736 if (se_depth <= pse_depth) {
4737 put_prev_entity(cfs_rq_of(pse), pse);
4738 pse = parent_entity(pse);
4739 }
4740 if (se_depth >= pse_depth) {
4741 set_next_entity(cfs_rq_of(se), se);
4742 se = parent_entity(se);
4743 }
4744 }
4745
4746 put_prev_entity(cfs_rq, pse);
4747 set_next_entity(cfs_rq, se);
4748 }
4749
4750 if (hrtick_enabled(rq))
4751 hrtick_start_fair(rq, p);
4752
4753 return p;
4754simple:
4755 cfs_rq = &rq->cfs;
4756#endif
4757
4758 if (!cfs_rq->nr_running)
4759 goto idle;
4760
4761 if (prev)
4762 prev->sched_class->put_prev_task(rq, prev);
4763
4764 do {
4765 se = pick_next_entity(cfs_rq, NULL);
4506 set_next_entity(cfs_rq, se); 4766 set_next_entity(cfs_rq, se);
4507 cfs_rq = group_cfs_rq(se); 4767 cfs_rq = group_cfs_rq(se);
4508 } while (cfs_rq); 4768 } while (cfs_rq);
4509 4769
4510 p = task_of(se); 4770 p = task_of(se);
4771
4511 if (hrtick_enabled(rq)) 4772 if (hrtick_enabled(rq))
4512 hrtick_start_fair(rq, p); 4773 hrtick_start_fair(rq, p);
4513 4774
4514 return p; 4775 return p;
4776
4777idle:
4778#ifdef CONFIG_SMP
4779 idle_enter_fair(rq);
4780 /*
4781 * We must set idle_stamp _before_ calling idle_balance(), such that we
4782 * measure the duration of idle_balance() as idle time.
4783 */
4784 rq->idle_stamp = rq_clock(rq);
4785 if (idle_balance(rq)) { /* drops rq->lock */
4786 rq->idle_stamp = 0;
4787 goto again;
4788 }
4789#endif
4790
4791 return NULL;
4515} 4792}
4516 4793
4517/* 4794/*
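
When the newly picked task differs from prev, the code above avoids touching the whole cgroup hierarchy: it walks the two sched_entity chains upward, comparing the cached ->depth values, until both sit in the same cfs_rq, calling put_prev_entity() on prev's side and set_next_entity() on the new side only for the levels that actually change. The walk itself, reduced to a standalone sketch with toy node types:

#include <stdio.h>
#include <stddef.h>

struct se {
	int depth;              /* 0 at the root, +1 per nested cgroup  */
	int group;              /* id of the cfs_rq this entity runs on */
	struct se *parent;
};

static void put_prev(struct se *se) { printf("put_prev at depth %d\n", se->depth); }
static void set_next(struct se *se) { printf("set_next at depth %d\n", se->depth); }

static int same_group(const struct se *a, const struct se *b)
{
	return a->group == b->group;    /* toy stand-in for comparing se->cfs_rq */
}

/* Retire pse's chain and install se's chain, but only for the levels
 * of the hierarchy where the two actually differ. */
static void switch_entities(struct se *se, struct se *pse)
{
	while (!same_group(se, pse)) {
		int sd = se->depth, pd = pse->depth;

		if (sd <= pd) {
			put_prev(pse);
			pse = pse->parent;
		}
		if (sd >= pd) {
			set_next(se);
			se = se->parent;
		}
	}
	put_prev(pse);                  /* common cfs_rq reached */
	set_next(se);
}

int main(void)
{
	/* Two sibling cgroups (cfs_rq ids 1 and 2) under the root (id 0). */
	struct se grp_a = { 0, 0, NULL }, grp_b = { 0, 0, NULL };
	struct se prev  = { 1, 1, &grp_a }, next = { 1, 2, &grp_b };

	switch_entities(&next, &prev);
	return 0;
}
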
@@ -4783,7 +5060,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
4783{ 5060{
4784 int src_nid, dst_nid; 5061 int src_nid, dst_nid;
4785 5062
4786 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || 5063 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
4787 !(env->sd->flags & SD_NUMA)) { 5064 !(env->sd->flags & SD_NUMA)) {
4788 return false; 5065 return false;
4789 } 5066 }
@@ -4814,7 +5091,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
4814 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) 5091 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
4815 return false; 5092 return false;
4816 5093
4817 if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) 5094 if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
4818 return false; 5095 return false;
4819 5096
4820 src_nid = cpu_to_node(env->src_cpu); 5097 src_nid = cpu_to_node(env->src_cpu);
@@ -6357,17 +6634,16 @@ out:
6357 * idle_balance is called by schedule() if this_cpu is about to become 6634 * idle_balance is called by schedule() if this_cpu is about to become
6358 * idle. Attempts to pull tasks from other CPUs. 6635 * idle. Attempts to pull tasks from other CPUs.
6359 */ 6636 */
6360void idle_balance(int this_cpu, struct rq *this_rq) 6637int idle_balance(struct rq *this_rq)
6361{ 6638{
6362 struct sched_domain *sd; 6639 struct sched_domain *sd;
6363 int pulled_task = 0; 6640 int pulled_task = 0;
6364 unsigned long next_balance = jiffies + HZ; 6641 unsigned long next_balance = jiffies + HZ;
6365 u64 curr_cost = 0; 6642 u64 curr_cost = 0;
6366 6643 int this_cpu = this_rq->cpu;
6367 this_rq->idle_stamp = rq_clock(this_rq);
6368 6644
6369 if (this_rq->avg_idle < sysctl_sched_migration_cost) 6645 if (this_rq->avg_idle < sysctl_sched_migration_cost)
6370 return; 6646 return 0;
6371 6647
6372 /* 6648 /*
6373 * Drop the rq->lock, but keep IRQ/preempt disabled. 6649 * Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6405,15 +6681,20 @@ void idle_balance(int this_cpu, struct rq *this_rq)
6405 interval = msecs_to_jiffies(sd->balance_interval); 6681 interval = msecs_to_jiffies(sd->balance_interval);
6406 if (time_after(next_balance, sd->last_balance + interval)) 6682 if (time_after(next_balance, sd->last_balance + interval))
6407 next_balance = sd->last_balance + interval; 6683 next_balance = sd->last_balance + interval;
6408 if (pulled_task) { 6684 if (pulled_task)
6409 this_rq->idle_stamp = 0;
6410 break; 6685 break;
6411 }
6412 } 6686 }
6413 rcu_read_unlock(); 6687 rcu_read_unlock();
6414 6688
6415 raw_spin_lock(&this_rq->lock); 6689 raw_spin_lock(&this_rq->lock);
6416 6690
6691 /*
6692 * While browsing the domains, we released the rq lock.
6693 * A task could have be enqueued in the meantime
6694 */
6695 if (this_rq->nr_running && !pulled_task)
6696 return 1;
6697
6417 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 6698 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
6418 /* 6699 /*
6419 * We are going idle. next_balance may be set based on 6700 * We are going idle. next_balance may be set based on
@@ -6424,6 +6705,8 @@ void idle_balance(int this_cpu, struct rq *this_rq)
6424 6705
6425 if (curr_cost > this_rq->max_idle_balance_cost) 6706 if (curr_cost > this_rq->max_idle_balance_cost)
6426 this_rq->max_idle_balance_cost = curr_cost; 6707 this_rq->max_idle_balance_cost = curr_cost;
6708
6709 return pulled_task;
6427} 6710}
6428 6711
6429/* 6712/*
@@ -7082,7 +7365,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
7082#ifdef CONFIG_FAIR_GROUP_SCHED 7365#ifdef CONFIG_FAIR_GROUP_SCHED
7083static void task_move_group_fair(struct task_struct *p, int on_rq) 7366static void task_move_group_fair(struct task_struct *p, int on_rq)
7084{ 7367{
7368 struct sched_entity *se = &p->se;
7085 struct cfs_rq *cfs_rq; 7369 struct cfs_rq *cfs_rq;
7370
7086 /* 7371 /*
7087 * If the task was not on the rq at the time of this cgroup movement 7372 * If the task was not on the rq at the time of this cgroup movement
7088 * it must have been asleep, sleeping tasks keep their ->vruntime 7373 * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -7108,23 +7393,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
7108 * To prevent boost or penalty in the new cfs_rq caused by delta 7393 * To prevent boost or penalty in the new cfs_rq caused by delta
7109 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. 7394 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
7110 */ 7395 */
7111 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) 7396 if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
7112 on_rq = 1; 7397 on_rq = 1;
7113 7398
7114 if (!on_rq) 7399 if (!on_rq)
7115 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; 7400 se->vruntime -= cfs_rq_of(se)->min_vruntime;
7116 set_task_rq(p, task_cpu(p)); 7401 set_task_rq(p, task_cpu(p));
7402 se->depth = se->parent ? se->parent->depth + 1 : 0;
7117 if (!on_rq) { 7403 if (!on_rq) {
7118 cfs_rq = cfs_rq_of(&p->se); 7404 cfs_rq = cfs_rq_of(se);
7119 p->se.vruntime += cfs_rq->min_vruntime; 7405 se->vruntime += cfs_rq->min_vruntime;
7120#ifdef CONFIG_SMP 7406#ifdef CONFIG_SMP
7121 /* 7407 /*
7122 * migrate_task_rq_fair() will have removed our previous 7408 * migrate_task_rq_fair() will have removed our previous
7123 * contribution, but we must synchronize for ongoing future 7409 * contribution, but we must synchronize for ongoing future
7124 * decay. 7410 * decay.
7125 */ 7411 */
7126 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); 7412 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
7127 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; 7413 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
7128#endif 7414#endif
7129 } 7415 }
7130} 7416}
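
To make the vruntime handling in task_move_group_fair() above concrete, here is a small worked example in comment form (the numbers are invented). The point of the subtract/re-add pair is that a sleeping task keeps its offset from min_vruntime, not its absolute vruntime, when it changes groups.

/*
 * Worked example (invented numbers) for the renormalization above:
 *
 *   old cfs_rq:  min_vruntime = 1000, se->vruntime = 1040
 *                -> the task sits 40 ahead of the queue's floor
 *
 *   se->vruntime -= 1000;             -> 40, now relative
 *   set_task_rq(p, task_cpu(p));      -> switch to the new cfs_rq
 *   se->vruntime += 5000;             -> 5040, re-based on the new floor
 *
 * The 40-unit offset is preserved.  Keeping the absolute value instead
 * would hand the task a large boost or penalty, depending on how far
 * apart the two queues' min_vruntime values are.
 */
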
@@ -7220,10 +7506,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7220 if (!se) 7506 if (!se)
7221 return; 7507 return;
7222 7508
7223 if (!parent) 7509 if (!parent) {
7224 se->cfs_rq = &rq->cfs; 7510 se->cfs_rq = &rq->cfs;
7225 else 7511 se->depth = 0;
7512 } else {
7226 se->cfs_rq = parent->my_q; 7513 se->cfs_rq = parent->my_q;
7514 se->depth = parent->depth + 1;
7515 }
7227 7516
7228 se->my_q = cfs_rq; 7517 se->my_q = cfs_rq;
7229 /* guarantee group entities always have weight */ 7518 /* guarantee group entities always have weight */
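
The se->depth field maintained in the two hunks above (0 for entities attached to the root cfs_rq, parent->depth + 1 otherwise) records each entity's nesting level in the group hierarchy. As far as I can tell, it exists so that two entities from different subtrees can be walked up to a common ancestor without re-counting their depths each time, as find_matching_se() does for wakeup preemption. The following is a sketch of that idea under those assumptions, reusing the existing parent_entity() and is_same_group() helpers from fair.c; it is not a copy of the actual function.

static void walk_to_common_ancestor(struct sched_entity **se,
				    struct sched_entity **pse)
{
	/* First bring the deeper entity up to the shallower one's level. */
	while ((*se)->depth > (*pse)->depth)
		*se = parent_entity(*se);

	while ((*pse)->depth > (*se)->depth)
		*pse = parent_entity(*pse);

	/* Same depth: step both up in lockstep until they share a cfs_rq. */
	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}
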
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
new file mode 100644
index 000000000000..14ca43430aee
--- /dev/null
+++ b/kernel/sched/idle.c
@@ -0,0 +1,144 @@
1/*
2 * Generic entry point for the idle threads
3 */
4#include <linux/sched.h>
5#include <linux/cpu.h>
6#include <linux/cpuidle.h>
7#include <linux/tick.h>
8#include <linux/mm.h>
9#include <linux/stackprotector.h>
10
11#include <asm/tlb.h>
12
13#include <trace/events/power.h>
14
15static int __read_mostly cpu_idle_force_poll;
16
17void cpu_idle_poll_ctrl(bool enable)
18{
19 if (enable) {
20 cpu_idle_force_poll++;
21 } else {
22 cpu_idle_force_poll--;
23 WARN_ON_ONCE(cpu_idle_force_poll < 0);
24 }
25}
26
27#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
28static int __init cpu_idle_poll_setup(char *__unused)
29{
30 cpu_idle_force_poll = 1;
31 return 1;
32}
33__setup("nohlt", cpu_idle_poll_setup);
34
35static int __init cpu_idle_nopoll_setup(char *__unused)
36{
37 cpu_idle_force_poll = 0;
38 return 1;
39}
40__setup("hlt", cpu_idle_nopoll_setup);
41#endif
42
43static inline int cpu_idle_poll(void)
44{
45 rcu_idle_enter();
46 trace_cpu_idle_rcuidle(0, smp_processor_id());
47 local_irq_enable();
48 while (!tif_need_resched())
49 cpu_relax();
50 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
51 rcu_idle_exit();
52 return 1;
53}
54
55/* Weak implementations for optional arch specific functions */
56void __weak arch_cpu_idle_prepare(void) { }
57void __weak arch_cpu_idle_enter(void) { }
58void __weak arch_cpu_idle_exit(void) { }
59void __weak arch_cpu_idle_dead(void) { }
60void __weak arch_cpu_idle(void)
61{
62 cpu_idle_force_poll = 1;
63 local_irq_enable();
64}
65
66/*
67 * Generic idle loop implementation
68 */
69static void cpu_idle_loop(void)
70{
71 while (1) {
72 tick_nohz_idle_enter();
73
74 while (!need_resched()) {
75 check_pgt_cache();
76 rmb();
77
78 if (cpu_is_offline(smp_processor_id()))
79 arch_cpu_idle_dead();
80
81 local_irq_disable();
82 arch_cpu_idle_enter();
83
84 /*
85 * In poll mode we reenable interrupts and spin.
86 *
87 * Also, if we detected in the wakeup-from-idle
88 * path that the tick broadcast device expired
89 * for us, we don't want to go into deep idle,
90 * since we know that the IPI is going to
91 * arrive right away.
92 */
93 if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
94 cpu_idle_poll();
95 } else {
96 if (!current_clr_polling_and_test()) {
97 stop_critical_timings();
98 rcu_idle_enter();
99 if (cpuidle_idle_call())
100 arch_cpu_idle();
101 if (WARN_ON_ONCE(irqs_disabled()))
102 local_irq_enable();
103 rcu_idle_exit();
104 start_critical_timings();
105 } else {
106 local_irq_enable();
107 }
108 __current_set_polling();
109 }
110 arch_cpu_idle_exit();
111 /*
112 * We need to test and propagate the TIF_NEED_RESCHED
113 * bit here because we might not have sent the
114 * reschedule IPI to idle tasks.
115 */
116 if (tif_need_resched())
117 set_preempt_need_resched();
118 }
119 tick_nohz_idle_exit();
120 schedule_preempt_disabled();
121 }
122}
123
124void cpu_startup_entry(enum cpuhp_state state)
125{
126 /*
127 * This #ifdef needs to die, but it's too late in the cycle to
128 * make this generic (arm and sh have never invoked the canary
129 * init for the non-boot CPUs!). Will be fixed in 3.11.
130 */
131#ifdef CONFIG_X86
132 /*
133 * If we're the non-boot CPU, nothing set the stack canary up
134 * for us. The boot CPU already has it initialized but no harm
135 * in doing it again. This is a good place for updating it, as
136 * we won't ever return from this function (so the invalid
137 * canaries already on the stack won't ever trigger).
138 */
139 boot_init_stack_canary();
140#endif
141 __current_set_polling();
142 arch_cpu_idle_prepare();
143 cpu_idle_loop();
144}
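
The __weak hooks near the top of the new file are the extension points for architectures; the generic loop calls arch_cpu_idle() with interrupts disabled and, as the WARN_ON_ONCE()/local_irq_enable() pair shows, expects them to be enabled again by the time it returns. A hypothetical, x86-flavoured override might look like the sketch below; it is illustrative only and not part of this patch.

/* Hypothetical arch override of the weak default (illustration only). */
void arch_cpu_idle(void)
{
	/*
	 * Halt until the next interrupt.  "sti; hlt" re-enables
	 * interrupts and stops the CPU so the wakeup interrupt brings
	 * us out of the halt, satisfying the generic loop's
	 * expectation that IRQs are on when this returns.
	 */
	asm volatile("sti; hlt" : : : "memory");
}
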
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 516c3d9ceea1..f7d03af79a5b 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,18 +13,8 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
13{ 13{
14 return task_cpu(p); /* IDLE tasks are never migrated */ 14 return task_cpu(p); /* IDLE tasks are never migrated */
15} 15}
16
17static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
18{
19 idle_exit_fair(rq);
20 rq_last_tick_reset(rq);
21}
22
23static void post_schedule_idle(struct rq *rq)
24{
25 idle_enter_fair(rq);
26}
27#endif /* CONFIG_SMP */ 16#endif /* CONFIG_SMP */
17
28/* 18/*
29 * Idle tasks are unconditionally rescheduled: 19 * Idle tasks are unconditionally rescheduled:
30 */ 20 */
@@ -33,12 +23,15 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
33 resched_task(rq->idle); 23 resched_task(rq->idle);
34} 24}
35 25
36static struct task_struct *pick_next_task_idle(struct rq *rq) 26static struct task_struct *
27pick_next_task_idle(struct rq *rq, struct task_struct *prev)
37{ 28{
29 if (prev)
30 prev->sched_class->put_prev_task(rq, prev);
31
38 schedstat_inc(rq, sched_goidle); 32 schedstat_inc(rq, sched_goidle);
39#ifdef CONFIG_SMP 33#ifdef CONFIG_SMP
40 /* Trigger the post schedule to do an idle_enter for CFS */ 34 idle_enter_fair(rq);
41 rq->post_schedule = 1;
42#endif 35#endif
43 return rq->idle; 36 return rq->idle;
44} 37}
@@ -58,6 +51,10 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
58 51
59static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) 52static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
60{ 53{
54#ifdef CONFIG_SMP
55 idle_exit_fair(rq);
56 rq_last_tick_reset(rq);
57#endif
61} 58}
62 59
63static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 60static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
@@ -101,8 +98,6 @@ const struct sched_class idle_sched_class = {
101 98
102#ifdef CONFIG_SMP 99#ifdef CONFIG_SMP
103 .select_task_rq = select_task_rq_idle, 100 .select_task_rq = select_task_rq_idle,
104 .pre_schedule = pre_schedule_idle,
105 .post_schedule = post_schedule_idle,
106#endif 101#endif
107 102
108 .set_curr_task = set_curr_task_idle, 103 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a2740b775b45..72f9ec759972 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -229,6 +229,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
229 229
230#ifdef CONFIG_SMP 230#ifdef CONFIG_SMP
231 231
232static int pull_rt_task(struct rq *this_rq);
233
232static inline int rt_overloaded(struct rq *rq) 234static inline int rt_overloaded(struct rq *rq)
233{ 235{
234 return atomic_read(&rq->rd->rto_count); 236 return atomic_read(&rq->rd->rto_count);
@@ -1310,15 +1312,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1310{ 1312{
1311 struct sched_rt_entity *rt_se; 1313 struct sched_rt_entity *rt_se;
1312 struct task_struct *p; 1314 struct task_struct *p;
1313 struct rt_rq *rt_rq; 1315 struct rt_rq *rt_rq = &rq->rt;
1314
1315 rt_rq = &rq->rt;
1316
1317 if (!rt_rq->rt_nr_running)
1318 return NULL;
1319
1320 if (rt_rq_throttled(rt_rq))
1321 return NULL;
1322 1316
1323 do { 1317 do {
1324 rt_se = pick_next_rt_entity(rq, rt_rq); 1318 rt_se = pick_next_rt_entity(rq, rt_rq);
@@ -1332,9 +1326,28 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1332 return p; 1326 return p;
1333} 1327}
1334 1328
1335static struct task_struct *pick_next_task_rt(struct rq *rq) 1329static struct task_struct *
1330pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1336{ 1331{
1337 struct task_struct *p = _pick_next_task_rt(rq); 1332 struct task_struct *p;
1333 struct rt_rq *rt_rq = &rq->rt;
1334
1335#ifdef CONFIG_SMP
1336 /* Try to pull RT tasks here if we lower this rq's prio */
1337 if (rq->rt.highest_prio.curr > prev->prio)
1338 pull_rt_task(rq);
1339#endif
1340
1341 if (!rt_rq->rt_nr_running)
1342 return NULL;
1343
1344 if (rt_rq_throttled(rt_rq))
1345 return NULL;
1346
1347 if (prev)
1348 prev->sched_class->put_prev_task(rq, prev);
1349
1350 p = _pick_next_task_rt(rq);
1338 1351
1339 /* The running task is never eligible for pushing */ 1352 /* The running task is never eligible for pushing */
1340 if (p) 1353 if (p)
@@ -1716,13 +1729,6 @@ skip:
1716 return ret; 1729 return ret;
1717} 1730}
1718 1731
1719static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1720{
1721 /* Try to pull RT tasks here if we lower this rq's prio */
1722 if (rq->rt.highest_prio.curr > prev->prio)
1723 pull_rt_task(rq);
1724}
1725
1726static void post_schedule_rt(struct rq *rq) 1732static void post_schedule_rt(struct rq *rq)
1727{ 1733{
1728 push_rt_tasks(rq); 1734 push_rt_tasks(rq);
@@ -1999,7 +2005,6 @@ const struct sched_class rt_sched_class = {
1999 .set_cpus_allowed = set_cpus_allowed_rt, 2005 .set_cpus_allowed = set_cpus_allowed_rt,
2000 .rq_online = rq_online_rt, 2006 .rq_online = rq_online_rt,
2001 .rq_offline = rq_offline_rt, 2007 .rq_offline = rq_offline_rt,
2002 .pre_schedule = pre_schedule_rt,
2003 .post_schedule = post_schedule_rt, 2008 .post_schedule = post_schedule_rt,
2004 .task_woken = task_woken_rt, 2009 .task_woken = task_woken_rt,
2005 .switched_from = switched_from_rt, 2010 .switched_from = switched_from_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c2119fd20f8b..1bf34c257d3b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -24,24 +24,6 @@ extern long calc_load_fold_active(struct rq *this_rq);
24extern void update_cpu_load_active(struct rq *this_rq); 24extern void update_cpu_load_active(struct rq *this_rq);
25 25
26/* 26/*
27 * Convert user-nice values [ -20 ... 0 ... 19 ]
28 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
29 * and back.
30 */
31#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
32#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
33#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
34
35/*
36 * 'User priority' is the nice value converted to something we
37 * can work with better when scaling various scheduler parameters,
38 * it's a [ 0 ... 39 ] range.
39 */
40#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
41#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
42#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
43
44/*
45 * Helpers for converting nanosecond timing to jiffy resolution 27 * Helpers for converting nanosecond timing to jiffy resolution
46 */ 28 */
47#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 29#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
@@ -1123,14 +1105,19 @@ struct sched_class {
1123 1105
1124 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); 1106 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
1125 1107
1126 struct task_struct * (*pick_next_task) (struct rq *rq); 1108 /*
1109 * It is the responsibility of the pick_next_task() method that will
1110 * return the next task to call put_prev_task() on the @prev task or
1111 * something equivalent.
1112 */
1113 struct task_struct * (*pick_next_task) (struct rq *rq,
1114 struct task_struct *prev);
1127 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1115 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
1128 1116
1129#ifdef CONFIG_SMP 1117#ifdef CONFIG_SMP
1130 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); 1118 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
1131 void (*migrate_task_rq)(struct task_struct *p, int next_cpu); 1119 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
1132 1120
1133 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
1134 void (*post_schedule) (struct rq *this_rq); 1121 void (*post_schedule) (struct rq *this_rq);
1135 void (*task_waking) (struct task_struct *task); 1122 void (*task_waking) (struct task_struct *task);
1136 void (*task_woken) (struct rq *this_rq, struct task_struct *task); 1123 void (*task_woken) (struct rq *this_rq, struct task_struct *task);
@@ -1176,7 +1163,7 @@ extern const struct sched_class idle_sched_class;
1176extern void update_group_power(struct sched_domain *sd, int cpu); 1163extern void update_group_power(struct sched_domain *sd, int cpu);
1177 1164
1178extern void trigger_load_balance(struct rq *rq); 1165extern void trigger_load_balance(struct rq *rq);
1179extern void idle_balance(int this_cpu, struct rq *this_rq); 1166extern int idle_balance(struct rq *this_rq);
1180 1167
1181extern void idle_enter_fair(struct rq *this_rq); 1168extern void idle_enter_fair(struct rq *this_rq);
1182extern void idle_exit_fair(struct rq *this_rq); 1169extern void idle_exit_fair(struct rq *this_rq);
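
The comment added to struct sched_class above is the heart of the new contract: a pick_next_task() implementation only calls put_prev_task() on @prev once it has committed to returning a task of its own, and a NULL return hands the decision to the next class with @prev still intact. That is why pick_next_task_rt() and pick_next_task_stop() in this patch take their early NULL returns before touching @prev. A simplified model of the caller side, under the assumption that the core loop stays a straightforward walk over the classes (this is a sketch, not the actual core.c change), could look like:

struct task_struct *model_pick_next_task(struct rq *rq, struct task_struct *prev)
{
	const struct sched_class *class;
	struct task_struct *p;

	for_each_class(class) {
		/*
		 * A class that returns NULL has not put @prev, so the
		 * next class down still sees a consistent previous task.
		 */
		p = class->pick_next_task(rq, prev);
		if (p)
			return p;
	}

	/* The idle class always returns a task, so we never get here. */
	BUG();
}
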
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index fdb6bb0b3356..a4147c9d2017 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,16 +23,20 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
23 /* we're never preempted */ 23 /* we're never preempted */
24} 24}
25 25
26static struct task_struct *pick_next_task_stop(struct rq *rq) 26static struct task_struct *
27pick_next_task_stop(struct rq *rq, struct task_struct *prev)
27{ 28{
28 struct task_struct *stop = rq->stop; 29 struct task_struct *stop = rq->stop;
29 30
30 if (stop && stop->on_rq) { 31 if (!stop || !stop->on_rq)
31 stop->se.exec_start = rq_clock_task(rq); 32 return NULL;
32 return stop;
33 }
34 33
35 return NULL; 34 if (prev)
35 prev->sched_class->put_prev_task(rq, prev);
36
37 stop->se.exec_start = rq_clock_task(rq);
38
39 return stop;
36} 40}
37 41
38static void 42static void
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 49e13e1f8fe6..7754ff16f334 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -386,13 +386,6 @@ static struct ctl_table kern_table[] = {
386 .proc_handler = proc_dointvec, 386 .proc_handler = proc_dointvec,
387 }, 387 },
388 { 388 {
389 .procname = "numa_balancing_migrate_deferred",
390 .data = &sysctl_numa_balancing_migrate_deferred,
391 .maxlen = sizeof(unsigned int),
392 .mode = 0644,
393 .proc_handler = proc_dointvec,
394 },
395 {
396 .procname = "numa_balancing", 389 .procname = "numa_balancing",
397 .data = NULL, /* filled in by handler */ 390 .data = NULL, /* filled in by handler */
398 .maxlen = sizeof(unsigned int), 391 .maxlen = sizeof(unsigned int),