summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2014-08-04 19:23:30 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2014-08-04 19:23:30 -0400
commit98959948a7ba33cf8c708626e0d2a1456397e1c6 (patch)
tree8ba9b6c2679a06e89f23bdd7018e9bb0249e3bda /kernel
parentef35ad26f8ff44d2c93e29952cdb336bda729d9d (diff)
parentcd3bd4e628a6d57d66afe77835fe8d93ae3e41f8 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: - Move the nohz kick code out of the scheduler tick to a dedicated IPI, from Frederic Weisbecker. This necessiated quite some background infrastructure rework, including: * Clean up some irq-work internals * Implement remote irq-work * Implement nohz kick on top of remote irq-work * Move full dynticks timer enqueue notification to new kick * Move multi-task notification to new kick * Remove unecessary barriers on multi-task notification - Remove proliferation of wait_on_bit() action functions and allow wait_on_bit_action() functions to support a timeout. (Neil Brown) - Another round of sched/numa improvements, cleanups and fixes. (Rik van Riel) - Implement fast idling of CPUs when the system is partially loaded, for better scalability. (Tim Chen) - Restructure and fix the CPU hotplug handling code that may leave cfs_rq and rt_rq's throttled when tasks are migrated away from a dead cpu. (Kirill Tkhai) - Robustify the sched topology setup code. (Peterz Zijlstra) - Improve sched_feat() handling wrt. static_keys (Jason Baron) - Misc fixes. * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (37 commits) sched/fair: Fix 'make xmldocs' warning caused by missing description sched: Use macro for magic number of -1 for setparam sched: Robustify topology setup sched: Fix sched_setparam() policy == -1 logic sched: Allow wait_on_bit_action() functions to support a timeout sched: Remove proliferation of wait_on_bit() action functions sched/numa: Revert "Use effective_load() to balance NUMA loads" sched: Fix static_key race with sched_feat() sched: Remove extra static_key*() function indirection sched/rt: Fix replenish_dl_entity() comments to match the current upstream code sched: Transform resched_task() into resched_curr() sched/deadline: Kill task_struct->pi_top_task sched: Rework check_for_tasks() sched/rt: Enqueue just unthrottled rt_rq back on the stack in __disable_runtime() sched/fair: Disable runtime_enabled on dying rq sched/numa: Change scan period code to match intent sched/numa: Rework best node setting in task_numa_migrate() sched/numa: Examine a task move when examining a task swap sched/numa: Simplify task_numa_compare() sched/numa: Use effective_load() to balance NUMA loads ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cpu.c33
-rw-r--r--kernel/fork.c1
-rw-r--r--kernel/irq_work.c110
-rw-r--r--kernel/ptrace.c8
-rw-r--r--kernel/sched/core.c119
-rw-r--r--kernel/sched/deadline.c18
-rw-r--r--kernel/sched/fair.c244
-rw-r--r--kernel/sched/idle.c4
-rw-r--r--kernel/sched/idle_task.c2
-rw-r--r--kernel/sched/rt.c30
-rw-r--r--kernel/sched/sched.h38
-rw-r--r--kernel/sched/wait.c30
-rw-r--r--kernel/smp.c9
-rw-r--r--kernel/time/tick-sched.c10
14 files changed, 382 insertions, 274 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a343bde710b1..81e2a388a0f6 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -274,21 +274,28 @@ void clear_tasks_mm_cpumask(int cpu)
274 rcu_read_unlock(); 274 rcu_read_unlock();
275} 275}
276 276
277static inline void check_for_tasks(int cpu) 277static inline void check_for_tasks(int dead_cpu)
278{ 278{
279 struct task_struct *p; 279 struct task_struct *g, *p;
280 cputime_t utime, stime;
281 280
282 write_lock_irq(&tasklist_lock); 281 read_lock_irq(&tasklist_lock);
283 for_each_process(p) { 282 do_each_thread(g, p) {
284 task_cputime(p, &utime, &stime); 283 if (!p->on_rq)
285 if (task_cpu(p) == cpu && p->state == TASK_RUNNING && 284 continue;
286 (utime || stime)) 285 /*
287 pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n", 286 * We do the check with unlocked task_rq(p)->lock.
288 p->comm, task_pid_nr(p), cpu, 287 * Order the reading to do not warn about a task,
289 p->state, p->flags); 288 * which was running on this cpu in the past, and
290 } 289 * it's just been woken on another cpu.
291 write_unlock_irq(&tasklist_lock); 290 */
291 rmb();
292 if (task_cpu(p) != dead_cpu)
293 continue;
294
295 pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
296 p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
297 } while_each_thread(g, p);
298 read_unlock_irq(&tasklist_lock);
292} 299}
293 300
294struct take_cpu_down_param { 301struct take_cpu_down_param {
diff --git a/kernel/fork.c b/kernel/fork.c
index 6a13c46cd87d..962885edbe53 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1095,7 +1095,6 @@ static void rt_mutex_init_task(struct task_struct *p)
1095 p->pi_waiters = RB_ROOT; 1095 p->pi_waiters = RB_ROOT;
1096 p->pi_waiters_leftmost = NULL; 1096 p->pi_waiters_leftmost = NULL;
1097 p->pi_blocked_on = NULL; 1097 p->pi_blocked_on = NULL;
1098 p->pi_top_task = NULL;
1099#endif 1098#endif
1100} 1099}
1101 1100
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index a82170e2fa78..e6bcbe756663 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -16,11 +16,12 @@
16#include <linux/tick.h> 16#include <linux/tick.h>
17#include <linux/cpu.h> 17#include <linux/cpu.h>
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h>
19#include <asm/processor.h> 20#include <asm/processor.h>
20 21
21 22
22static DEFINE_PER_CPU(struct llist_head, irq_work_list); 23static DEFINE_PER_CPU(struct llist_head, raised_list);
23static DEFINE_PER_CPU(int, irq_work_raised); 24static DEFINE_PER_CPU(struct llist_head, lazy_list);
24 25
25/* 26/*
26 * Claim the entry so that no one else will poke at it. 27 * Claim the entry so that no one else will poke at it.
@@ -55,12 +56,34 @@ void __weak arch_irq_work_raise(void)
55 */ 56 */
56} 57}
57 58
59#ifdef CONFIG_SMP
58/* 60/*
59 * Enqueue the irq_work @entry unless it's already pending 61 * Enqueue the irq_work @work on @cpu unless it's already pending
60 * somewhere. 62 * somewhere.
61 * 63 *
62 * Can be re-enqueued while the callback is still in progress. 64 * Can be re-enqueued while the callback is still in progress.
63 */ 65 */
66bool irq_work_queue_on(struct irq_work *work, int cpu)
67{
68 /* All work should have been flushed before going offline */
69 WARN_ON_ONCE(cpu_is_offline(cpu));
70
71 /* Arch remote IPI send/receive backend aren't NMI safe */
72 WARN_ON_ONCE(in_nmi());
73
74 /* Only queue if not already pending */
75 if (!irq_work_claim(work))
76 return false;
77
78 if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
79 arch_send_call_function_single_ipi(cpu);
80
81 return true;
82}
83EXPORT_SYMBOL_GPL(irq_work_queue_on);
84#endif
85
86/* Enqueue the irq work @work on the current CPU */
64bool irq_work_queue(struct irq_work *work) 87bool irq_work_queue(struct irq_work *work)
65{ 88{
66 /* Only queue if not already pending */ 89 /* Only queue if not already pending */
@@ -70,15 +93,13 @@ bool irq_work_queue(struct irq_work *work)
70 /* Queue the entry and raise the IPI if needed. */ 93 /* Queue the entry and raise the IPI if needed. */
71 preempt_disable(); 94 preempt_disable();
72 95
73 llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); 96 /* If the work is "lazy", handle it from next tick if any */
74 97 if (work->flags & IRQ_WORK_LAZY) {
75 /* 98 if (llist_add(&work->llnode, &__get_cpu_var(lazy_list)) &&
76 * If the work is not "lazy" or the tick is stopped, raise the irq 99 tick_nohz_tick_stopped())
77 * work interrupt (if supported by the arch), otherwise, just wait 100 arch_irq_work_raise();
78 * for the next tick. 101 } else {
79 */ 102 if (llist_add(&work->llnode, &__get_cpu_var(raised_list)))
80 if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) {
81 if (!this_cpu_cmpxchg(irq_work_raised, 0, 1))
82 arch_irq_work_raise(); 103 arch_irq_work_raise();
83 } 104 }
84 105
@@ -90,10 +111,11 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
90 111
91bool irq_work_needs_cpu(void) 112bool irq_work_needs_cpu(void)
92{ 113{
93 struct llist_head *this_list; 114 struct llist_head *raised, *lazy;
94 115
95 this_list = &__get_cpu_var(irq_work_list); 116 raised = &__get_cpu_var(raised_list);
96 if (llist_empty(this_list)) 117 lazy = &__get_cpu_var(lazy_list);
118 if (llist_empty(raised) && llist_empty(lazy))
97 return false; 119 return false;
98 120
99 /* All work should have been flushed before going offline */ 121 /* All work should have been flushed before going offline */
@@ -102,28 +124,18 @@ bool irq_work_needs_cpu(void)
102 return true; 124 return true;
103} 125}
104 126
105static void __irq_work_run(void) 127static void irq_work_run_list(struct llist_head *list)
106{ 128{
107 unsigned long flags; 129 unsigned long flags;
108 struct irq_work *work; 130 struct irq_work *work;
109 struct llist_head *this_list;
110 struct llist_node *llnode; 131 struct llist_node *llnode;
111 132
133 BUG_ON(!irqs_disabled());
112 134
113 /* 135 if (llist_empty(list))
114 * Reset the "raised" state right before we check the list because
115 * an NMI may enqueue after we find the list empty from the runner.
116 */
117 __this_cpu_write(irq_work_raised, 0);
118 barrier();
119
120 this_list = &__get_cpu_var(irq_work_list);
121 if (llist_empty(this_list))
122 return; 136 return;
123 137
124 BUG_ON(!irqs_disabled()); 138 llnode = llist_del_all(list);
125
126 llnode = llist_del_all(this_list);
127 while (llnode != NULL) { 139 while (llnode != NULL) {
128 work = llist_entry(llnode, struct irq_work, llnode); 140 work = llist_entry(llnode, struct irq_work, llnode);
129 141
@@ -149,13 +161,13 @@ static void __irq_work_run(void)
149} 161}
150 162
151/* 163/*
152 * Run the irq_work entries on this cpu. Requires to be ran from hardirq 164 * hotplug calls this through:
153 * context with local IRQs disabled. 165 * hotplug_cfd() -> flush_smp_call_function_queue()
154 */ 166 */
155void irq_work_run(void) 167void irq_work_run(void)
156{ 168{
157 BUG_ON(!in_irq()); 169 irq_work_run_list(&__get_cpu_var(raised_list));
158 __irq_work_run(); 170 irq_work_run_list(&__get_cpu_var(lazy_list));
159} 171}
160EXPORT_SYMBOL_GPL(irq_work_run); 172EXPORT_SYMBOL_GPL(irq_work_run);
161 173
@@ -171,35 +183,3 @@ void irq_work_sync(struct irq_work *work)
171 cpu_relax(); 183 cpu_relax();
172} 184}
173EXPORT_SYMBOL_GPL(irq_work_sync); 185EXPORT_SYMBOL_GPL(irq_work_sync);
174
175#ifdef CONFIG_HOTPLUG_CPU
176static int irq_work_cpu_notify(struct notifier_block *self,
177 unsigned long action, void *hcpu)
178{
179 long cpu = (long)hcpu;
180
181 switch (action) {
182 case CPU_DYING:
183 /* Called from stop_machine */
184 if (WARN_ON_ONCE(cpu != smp_processor_id()))
185 break;
186 __irq_work_run();
187 break;
188 default:
189 break;
190 }
191 return NOTIFY_OK;
192}
193
194static struct notifier_block cpu_notify;
195
196static __init int irq_work_init_cpu_notifier(void)
197{
198 cpu_notify.notifier_call = irq_work_cpu_notify;
199 cpu_notify.priority = 0;
200 register_cpu_notifier(&cpu_notify);
201 return 0;
202}
203device_initcall(irq_work_init_cpu_notifier);
204
205#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index adf98622cb32..54e75226c2c4 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -28,12 +28,6 @@
28#include <linux/compat.h> 28#include <linux/compat.h>
29 29
30 30
31static int ptrace_trapping_sleep_fn(void *flags)
32{
33 schedule();
34 return 0;
35}
36
37/* 31/*
38 * ptrace a task: make the debugger its new parent and 32 * ptrace a task: make the debugger its new parent and
39 * move it to the ptrace list. 33 * move it to the ptrace list.
@@ -371,7 +365,7 @@ unlock_creds:
371out: 365out:
372 if (!retval) { 366 if (!retval) {
373 wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, 367 wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
374 ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE); 368 TASK_UNINTERRUPTIBLE);
375 proc_ptrace_connector(task, PTRACE_ATTACH); 369 proc_ptrace_connector(task, PTRACE_ATTACH);
376 } 370 }
377 371
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 126f7e3f04e7..1211575a2208 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -139,6 +139,8 @@ void update_rq_clock(struct rq *rq)
139 return; 139 return;
140 140
141 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 141 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
142 if (delta < 0)
143 return;
142 rq->clock += delta; 144 rq->clock += delta;
143 update_rq_clock_task(rq, delta); 145 update_rq_clock_task(rq, delta);
144} 146}
@@ -243,6 +245,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
243 char buf[64]; 245 char buf[64];
244 char *cmp; 246 char *cmp;
245 int i; 247 int i;
248 struct inode *inode;
246 249
247 if (cnt > 63) 250 if (cnt > 63)
248 cnt = 63; 251 cnt = 63;
@@ -253,7 +256,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
253 buf[cnt] = 0; 256 buf[cnt] = 0;
254 cmp = strstrip(buf); 257 cmp = strstrip(buf);
255 258
259 /* Ensure the static_key remains in a consistent state */
260 inode = file_inode(filp);
261 mutex_lock(&inode->i_mutex);
256 i = sched_feat_set(cmp); 262 i = sched_feat_set(cmp);
263 mutex_unlock(&inode->i_mutex);
257 if (i == __SCHED_FEAT_NR) 264 if (i == __SCHED_FEAT_NR)
258 return -EINVAL; 265 return -EINVAL;
259 266
@@ -587,30 +594,31 @@ static bool set_nr_if_polling(struct task_struct *p)
587#endif 594#endif
588 595
589/* 596/*
590 * resched_task - mark a task 'to be rescheduled now'. 597 * resched_curr - mark rq's current task 'to be rescheduled now'.
591 * 598 *
592 * On UP this means the setting of the need_resched flag, on SMP it 599 * On UP this means the setting of the need_resched flag, on SMP it
593 * might also involve a cross-CPU call to trigger the scheduler on 600 * might also involve a cross-CPU call to trigger the scheduler on
594 * the target CPU. 601 * the target CPU.
595 */ 602 */
596void resched_task(struct task_struct *p) 603void resched_curr(struct rq *rq)
597{ 604{
605 struct task_struct *curr = rq->curr;
598 int cpu; 606 int cpu;
599 607
600 lockdep_assert_held(&task_rq(p)->lock); 608 lockdep_assert_held(&rq->lock);
601 609
602 if (test_tsk_need_resched(p)) 610 if (test_tsk_need_resched(curr))
603 return; 611 return;
604 612
605 cpu = task_cpu(p); 613 cpu = cpu_of(rq);
606 614
607 if (cpu == smp_processor_id()) { 615 if (cpu == smp_processor_id()) {
608 set_tsk_need_resched(p); 616 set_tsk_need_resched(curr);
609 set_preempt_need_resched(); 617 set_preempt_need_resched();
610 return; 618 return;
611 } 619 }
612 620
613 if (set_nr_and_not_polling(p)) 621 if (set_nr_and_not_polling(curr))
614 smp_send_reschedule(cpu); 622 smp_send_reschedule(cpu);
615 else 623 else
616 trace_sched_wake_idle_without_ipi(cpu); 624 trace_sched_wake_idle_without_ipi(cpu);
@@ -623,7 +631,7 @@ void resched_cpu(int cpu)
623 631
624 if (!raw_spin_trylock_irqsave(&rq->lock, flags)) 632 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
625 return; 633 return;
626 resched_task(cpu_curr(cpu)); 634 resched_curr(rq);
627 raw_spin_unlock_irqrestore(&rq->lock, flags); 635 raw_spin_unlock_irqrestore(&rq->lock, flags);
628} 636}
629 637
@@ -684,10 +692,16 @@ static void wake_up_idle_cpu(int cpu)
684 692
685static bool wake_up_full_nohz_cpu(int cpu) 693static bool wake_up_full_nohz_cpu(int cpu)
686{ 694{
695 /*
696 * We just need the target to call irq_exit() and re-evaluate
697 * the next tick. The nohz full kick at least implies that.
698 * If needed we can still optimize that later with an
699 * empty IRQ.
700 */
687 if (tick_nohz_full_cpu(cpu)) { 701 if (tick_nohz_full_cpu(cpu)) {
688 if (cpu != smp_processor_id() || 702 if (cpu != smp_processor_id() ||
689 tick_nohz_tick_stopped()) 703 tick_nohz_tick_stopped())
690 smp_send_reschedule(cpu); 704 tick_nohz_full_kick_cpu(cpu);
691 return true; 705 return true;
692 } 706 }
693 707
@@ -730,18 +744,15 @@ static inline bool got_nohz_idle_kick(void)
730#ifdef CONFIG_NO_HZ_FULL 744#ifdef CONFIG_NO_HZ_FULL
731bool sched_can_stop_tick(void) 745bool sched_can_stop_tick(void)
732{ 746{
733 struct rq *rq; 747 /*
734 748 * More than one running task need preemption.
735 rq = this_rq(); 749 * nr_running update is assumed to be visible
736 750 * after IPI is sent from wakers.
737 /* Make sure rq->nr_running update is visible after the IPI */ 751 */
738 smp_rmb(); 752 if (this_rq()->nr_running > 1)
739 753 return false;
740 /* More than one running task need preemption */
741 if (rq->nr_running > 1)
742 return false;
743 754
744 return true; 755 return true;
745} 756}
746#endif /* CONFIG_NO_HZ_FULL */ 757#endif /* CONFIG_NO_HZ_FULL */
747 758
@@ -1022,7 +1033,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1022 if (class == rq->curr->sched_class) 1033 if (class == rq->curr->sched_class)
1023 break; 1034 break;
1024 if (class == p->sched_class) { 1035 if (class == p->sched_class) {
1025 resched_task(rq->curr); 1036 resched_curr(rq);
1026 break; 1037 break;
1027 } 1038 }
1028 } 1039 }
@@ -1568,9 +1579,7 @@ void scheduler_ipi(void)
1568 */ 1579 */
1569 preempt_fold_need_resched(); 1580 preempt_fold_need_resched();
1570 1581
1571 if (llist_empty(&this_rq()->wake_list) 1582 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
1572 && !tick_nohz_full_cpu(smp_processor_id())
1573 && !got_nohz_idle_kick())
1574 return; 1583 return;
1575 1584
1576 /* 1585 /*
@@ -1587,7 +1596,6 @@ void scheduler_ipi(void)
1587 * somewhat pessimize the simple resched case. 1596 * somewhat pessimize the simple resched case.
1588 */ 1597 */
1589 irq_enter(); 1598 irq_enter();
1590 tick_nohz_full_check();
1591 sched_ttwu_pending(); 1599 sched_ttwu_pending();
1592 1600
1593 /* 1601 /*
@@ -2431,7 +2439,12 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2431{ 2439{
2432 u64 ns = 0; 2440 u64 ns = 0;
2433 2441
2434 if (task_current(rq, p)) { 2442 /*
2443 * Must be ->curr _and_ ->on_rq. If dequeued, we would
2444 * project cycles that may never be accounted to this
2445 * thread, breaking clock_gettime().
2446 */
2447 if (task_current(rq, p) && p->on_rq) {
2435 update_rq_clock(rq); 2448 update_rq_clock(rq);
2436 ns = rq_clock_task(rq) - p->se.exec_start; 2449 ns = rq_clock_task(rq) - p->se.exec_start;
2437 if ((s64)ns < 0) 2450 if ((s64)ns < 0)
@@ -2474,8 +2487,10 @@ unsigned long long task_sched_runtime(struct task_struct *p)
2474 * If we race with it leaving cpu, we'll take a lock. So we're correct. 2487 * If we race with it leaving cpu, we'll take a lock. So we're correct.
2475 * If we race with it entering cpu, unaccounted time is 0. This is 2488 * If we race with it entering cpu, unaccounted time is 0. This is
2476 * indistinguishable from the read occurring a few cycles earlier. 2489 * indistinguishable from the read occurring a few cycles earlier.
2490 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
2491 * been accounted, so we're correct here as well.
2477 */ 2492 */
2478 if (!p->on_cpu) 2493 if (!p->on_cpu || !p->on_rq)
2479 return p->se.sum_exec_runtime; 2494 return p->se.sum_exec_runtime;
2480#endif 2495#endif
2481 2496
@@ -2971,7 +2986,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2971 } 2986 }
2972 2987
2973 trace_sched_pi_setprio(p, prio); 2988 trace_sched_pi_setprio(p, prio);
2974 p->pi_top_task = rt_mutex_get_top_task(p);
2975 oldprio = p->prio; 2989 oldprio = p->prio;
2976 prev_class = p->sched_class; 2990 prev_class = p->sched_class;
2977 on_rq = p->on_rq; 2991 on_rq = p->on_rq;
@@ -2991,8 +3005,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2991 * running task 3005 * running task
2992 */ 3006 */
2993 if (dl_prio(prio)) { 3007 if (dl_prio(prio)) {
2994 if (!dl_prio(p->normal_prio) || (p->pi_top_task && 3008 struct task_struct *pi_task = rt_mutex_get_top_task(p);
2995 dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { 3009 if (!dl_prio(p->normal_prio) ||
3010 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
2996 p->dl.dl_boosted = 1; 3011 p->dl.dl_boosted = 1;
2997 p->dl.dl_throttled = 0; 3012 p->dl.dl_throttled = 0;
2998 enqueue_flag = ENQUEUE_REPLENISH; 3013 enqueue_flag = ENQUEUE_REPLENISH;
@@ -3064,7 +3079,7 @@ void set_user_nice(struct task_struct *p, long nice)
3064 * lowered its priority, then reschedule its CPU: 3079 * lowered its priority, then reschedule its CPU:
3065 */ 3080 */
3066 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3081 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3067 resched_task(rq->curr); 3082 resched_curr(rq);
3068 } 3083 }
3069out_unlock: 3084out_unlock:
3070 task_rq_unlock(rq, p, &flags); 3085 task_rq_unlock(rq, p, &flags);
@@ -3203,12 +3218,18 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3203 dl_se->dl_yielded = 0; 3218 dl_se->dl_yielded = 0;
3204} 3219}
3205 3220
3221/*
3222 * sched_setparam() passes in -1 for its policy, to let the functions
3223 * it calls know not to change it.
3224 */
3225#define SETPARAM_POLICY -1
3226
3206static void __setscheduler_params(struct task_struct *p, 3227static void __setscheduler_params(struct task_struct *p,
3207 const struct sched_attr *attr) 3228 const struct sched_attr *attr)
3208{ 3229{
3209 int policy = attr->sched_policy; 3230 int policy = attr->sched_policy;
3210 3231
3211 if (policy == -1) /* setparam */ 3232 if (policy == SETPARAM_POLICY)
3212 policy = p->policy; 3233 policy = p->policy;
3213 3234
3214 p->policy = policy; 3235 p->policy = policy;
@@ -3557,10 +3578,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
3557 .sched_nice = PRIO_TO_NICE(p->static_prio), 3578 .sched_nice = PRIO_TO_NICE(p->static_prio),
3558 }; 3579 };
3559 3580
3560 /* 3581 /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
3561 * Fixup the legacy SCHED_RESET_ON_FORK hack 3582 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
3562 */
3563 if (policy & SCHED_RESET_ON_FORK) {
3564 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 3583 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3565 policy &= ~SCHED_RESET_ON_FORK; 3584 policy &= ~SCHED_RESET_ON_FORK;
3566 attr.sched_policy = policy; 3585 attr.sched_policy = policy;
@@ -3730,7 +3749,7 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
3730 */ 3749 */
3731SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 3750SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3732{ 3751{
3733 return do_sched_setscheduler(pid, -1, param); 3752 return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
3734} 3753}
3735 3754
3736/** 3755/**
@@ -4285,7 +4304,7 @@ again:
4285 * fairness. 4304 * fairness.
4286 */ 4305 */
4287 if (preempt && rq != p_rq) 4306 if (preempt && rq != p_rq)
4288 resched_task(p_rq->curr); 4307 resched_curr(p_rq);
4289 } 4308 }
4290 4309
4291out_unlock: 4310out_unlock:
@@ -6465,6 +6484,20 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6465 sched_domain_level_max = max(sched_domain_level_max, sd->level); 6484 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6466 child->parent = sd; 6485 child->parent = sd;
6467 sd->child = child; 6486 sd->child = child;
6487
6488 if (!cpumask_subset(sched_domain_span(child),
6489 sched_domain_span(sd))) {
6490 pr_err("BUG: arch topology borken\n");
6491#ifdef CONFIG_SCHED_DEBUG
6492 pr_err(" the %s domain not a subset of the %s domain\n",
6493 child->name, sd->name);
6494#endif
6495 /* Fixup, ensure @sd has at least @child cpus. */
6496 cpumask_or(sched_domain_span(sd),
6497 sched_domain_span(sd),
6498 sched_domain_span(child));
6499 }
6500
6468 } 6501 }
6469 set_domain_attribute(sd, attr); 6502 set_domain_attribute(sd, attr);
6470 6503
@@ -7092,7 +7125,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
7092 __setscheduler(rq, p, &attr); 7125 __setscheduler(rq, p, &attr);
7093 if (on_rq) { 7126 if (on_rq) {
7094 enqueue_task(rq, p, 0); 7127 enqueue_task(rq, p, 0);
7095 resched_task(rq->curr); 7128 resched_curr(rq);
7096 } 7129 }
7097 7130
7098 check_class_changed(rq, p, prev_class, old_prio); 7131 check_class_changed(rq, p, prev_class, old_prio);
@@ -7803,6 +7836,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7803 if (period > max_cfs_quota_period) 7836 if (period > max_cfs_quota_period)
7804 return -EINVAL; 7837 return -EINVAL;
7805 7838
7839 /*
7840 * Prevent race between setting of cfs_rq->runtime_enabled and
7841 * unthrottle_offline_cfs_rqs().
7842 */
7843 get_online_cpus();
7806 mutex_lock(&cfs_constraints_mutex); 7844 mutex_lock(&cfs_constraints_mutex);
7807 ret = __cfs_schedulable(tg, period, quota); 7845 ret = __cfs_schedulable(tg, period, quota);
7808 if (ret) 7846 if (ret)
@@ -7828,7 +7866,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7828 } 7866 }
7829 raw_spin_unlock_irq(&cfs_b->lock); 7867 raw_spin_unlock_irq(&cfs_b->lock);
7830 7868
7831 for_each_possible_cpu(i) { 7869 for_each_online_cpu(i) {
7832 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7870 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7833 struct rq *rq = cfs_rq->rq; 7871 struct rq *rq = cfs_rq->rq;
7834 7872
@@ -7844,6 +7882,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7844 cfs_bandwidth_usage_dec(); 7882 cfs_bandwidth_usage_dec();
7845out_unlock: 7883out_unlock:
7846 mutex_unlock(&cfs_constraints_mutex); 7884 mutex_unlock(&cfs_constraints_mutex);
7885 put_online_cpus();
7847 7886
7848 return ret; 7887 return ret;
7849} 7888}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fc4f98b1258f..255ce138b652 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -306,7 +306,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
306 * the overrunning entity can't interfere with other entity in the system and 306 * the overrunning entity can't interfere with other entity in the system and
307 * can't make them miss their deadlines. Reasons why this kind of overruns 307 * can't make them miss their deadlines. Reasons why this kind of overruns
308 * could happen are, typically, a entity voluntarily trying to overcome its 308 * could happen are, typically, a entity voluntarily trying to overcome its
309 * runtime, or it just underestimated it during sched_setscheduler_ex(). 309 * runtime, or it just underestimated it during sched_setattr().
310 */ 310 */
311static void replenish_dl_entity(struct sched_dl_entity *dl_se, 311static void replenish_dl_entity(struct sched_dl_entity *dl_se,
312 struct sched_dl_entity *pi_se) 312 struct sched_dl_entity *pi_se)
@@ -535,7 +535,7 @@ again:
535 if (task_has_dl_policy(rq->curr)) 535 if (task_has_dl_policy(rq->curr))
536 check_preempt_curr_dl(rq, p, 0); 536 check_preempt_curr_dl(rq, p, 0);
537 else 537 else
538 resched_task(rq->curr); 538 resched_curr(rq);
539#ifdef CONFIG_SMP 539#ifdef CONFIG_SMP
540 /* 540 /*
541 * Queueing this task back might have overloaded rq, 541 * Queueing this task back might have overloaded rq,
@@ -634,7 +634,7 @@ static void update_curr_dl(struct rq *rq)
634 enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); 634 enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
635 635
636 if (!is_leftmost(curr, &rq->dl)) 636 if (!is_leftmost(curr, &rq->dl))
637 resched_task(curr); 637 resched_curr(rq);
638 } 638 }
639 639
640 /* 640 /*
@@ -964,7 +964,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
964 cpudl_find(&rq->rd->cpudl, p, NULL) != -1) 964 cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
965 return; 965 return;
966 966
967 resched_task(rq->curr); 967 resched_curr(rq);
968} 968}
969 969
970static int pull_dl_task(struct rq *this_rq); 970static int pull_dl_task(struct rq *this_rq);
@@ -979,7 +979,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
979 int flags) 979 int flags)
980{ 980{
981 if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { 981 if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
982 resched_task(rq->curr); 982 resched_curr(rq);
983 return; 983 return;
984 } 984 }
985 985
@@ -1333,7 +1333,7 @@ retry:
1333 if (dl_task(rq->curr) && 1333 if (dl_task(rq->curr) &&
1334 dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && 1334 dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
1335 rq->curr->nr_cpus_allowed > 1) { 1335 rq->curr->nr_cpus_allowed > 1) {
1336 resched_task(rq->curr); 1336 resched_curr(rq);
1337 return 0; 1337 return 0;
1338 } 1338 }
1339 1339
@@ -1373,7 +1373,7 @@ retry:
1373 set_task_cpu(next_task, later_rq->cpu); 1373 set_task_cpu(next_task, later_rq->cpu);
1374 activate_task(later_rq, next_task, 0); 1374 activate_task(later_rq, next_task, 0);
1375 1375
1376 resched_task(later_rq->curr); 1376 resched_curr(later_rq);
1377 1377
1378 double_unlock_balance(rq, later_rq); 1378 double_unlock_balance(rq, later_rq);
1379 1379
@@ -1632,14 +1632,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
1632 */ 1632 */
1633 if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && 1633 if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
1634 rq->curr == p) 1634 rq->curr == p)
1635 resched_task(p); 1635 resched_curr(rq);
1636#else 1636#else
1637 /* 1637 /*
1638 * Again, we don't know if p has a earlier 1638 * Again, we don't know if p has a earlier
1639 * or later deadline, so let's blindly set a 1639 * or later deadline, so let's blindly set a
1640 * (maybe not needed) rescheduling point. 1640 * (maybe not needed) rescheduling point.
1641 */ 1641 */
1642 resched_task(p); 1642 resched_curr(rq);
1643#endif /* CONFIG_SMP */ 1643#endif /* CONFIG_SMP */
1644 } else 1644 } else
1645 switched_to_dl(rq, p); 1645 switched_to_dl(rq, p);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fea7d3335e1f..bfa3c86d0d68 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1062,7 +1062,6 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
1062 if (!cpus) 1062 if (!cpus)
1063 return; 1063 return;
1064 1064
1065 ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity;
1066 ns->task_capacity = 1065 ns->task_capacity =
1067 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); 1066 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
1068 ns->has_free_capacity = (ns->nr_running < ns->task_capacity); 1067 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
@@ -1096,18 +1095,30 @@ static void task_numa_assign(struct task_numa_env *env,
1096 env->best_cpu = env->dst_cpu; 1095 env->best_cpu = env->dst_cpu;
1097} 1096}
1098 1097
1099static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, 1098static bool load_too_imbalanced(long src_load, long dst_load,
1100 long src_load, long dst_load,
1101 struct task_numa_env *env) 1099 struct task_numa_env *env)
1102{ 1100{
1103 long imb, old_imb; 1101 long imb, old_imb;
1102 long orig_src_load, orig_dst_load;
1103 long src_capacity, dst_capacity;
1104
1105 /*
1106 * The load is corrected for the CPU capacity available on each node.
1107 *
1108 * src_load dst_load
1109 * ------------ vs ---------
1110 * src_capacity dst_capacity
1111 */
1112 src_capacity = env->src_stats.compute_capacity;
1113 dst_capacity = env->dst_stats.compute_capacity;
1104 1114
1105 /* We care about the slope of the imbalance, not the direction. */ 1115 /* We care about the slope of the imbalance, not the direction. */
1106 if (dst_load < src_load) 1116 if (dst_load < src_load)
1107 swap(dst_load, src_load); 1117 swap(dst_load, src_load);
1108 1118
1109 /* Is the difference below the threshold? */ 1119 /* Is the difference below the threshold? */
1110 imb = dst_load * 100 - src_load * env->imbalance_pct; 1120 imb = dst_load * src_capacity * 100 -
1121 src_load * dst_capacity * env->imbalance_pct;
1111 if (imb <= 0) 1122 if (imb <= 0)
1112 return false; 1123 return false;
1113 1124
@@ -1115,10 +1126,14 @@ static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
1115 * The imbalance is above the allowed threshold. 1126 * The imbalance is above the allowed threshold.
1116 * Compare it with the old imbalance. 1127 * Compare it with the old imbalance.
1117 */ 1128 */
1129 orig_src_load = env->src_stats.load;
1130 orig_dst_load = env->dst_stats.load;
1131
1118 if (orig_dst_load < orig_src_load) 1132 if (orig_dst_load < orig_src_load)
1119 swap(orig_dst_load, orig_src_load); 1133 swap(orig_dst_load, orig_src_load);
1120 1134
1121 old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; 1135 old_imb = orig_dst_load * src_capacity * 100 -
1136 orig_src_load * dst_capacity * env->imbalance_pct;
1122 1137
1123 /* Would this change make things worse? */ 1138 /* Would this change make things worse? */
1124 return (imb > old_imb); 1139 return (imb > old_imb);
@@ -1136,10 +1151,10 @@ static void task_numa_compare(struct task_numa_env *env,
1136 struct rq *src_rq = cpu_rq(env->src_cpu); 1151 struct rq *src_rq = cpu_rq(env->src_cpu);
1137 struct rq *dst_rq = cpu_rq(env->dst_cpu); 1152 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1138 struct task_struct *cur; 1153 struct task_struct *cur;
1139 long orig_src_load, src_load; 1154 long src_load, dst_load;
1140 long orig_dst_load, dst_load;
1141 long load; 1155 long load;
1142 long imp = (groupimp > 0) ? groupimp : taskimp; 1156 long imp = env->p->numa_group ? groupimp : taskimp;
1157 long moveimp = imp;
1143 1158
1144 rcu_read_lock(); 1159 rcu_read_lock();
1145 cur = ACCESS_ONCE(dst_rq->curr); 1160 cur = ACCESS_ONCE(dst_rq->curr);
@@ -1177,11 +1192,6 @@ static void task_numa_compare(struct task_numa_env *env,
1177 * itself (not part of a group), use the task weight 1192 * itself (not part of a group), use the task weight
1178 * instead. 1193 * instead.
1179 */ 1194 */
1180 if (env->p->numa_group)
1181 imp = groupimp;
1182 else
1183 imp = taskimp;
1184
1185 if (cur->numa_group) 1195 if (cur->numa_group)
1186 imp += group_weight(cur, env->src_nid) - 1196 imp += group_weight(cur, env->src_nid) -
1187 group_weight(cur, env->dst_nid); 1197 group_weight(cur, env->dst_nid);
@@ -1191,7 +1201,7 @@ static void task_numa_compare(struct task_numa_env *env,
1191 } 1201 }
1192 } 1202 }
1193 1203
1194 if (imp < env->best_imp) 1204 if (imp <= env->best_imp && moveimp <= env->best_imp)
1195 goto unlock; 1205 goto unlock;
1196 1206
1197 if (!cur) { 1207 if (!cur) {
@@ -1204,20 +1214,34 @@ static void task_numa_compare(struct task_numa_env *env,
1204 } 1214 }
1205 1215
1206 /* Balance doesn't matter much if we're running a task per cpu */ 1216 /* Balance doesn't matter much if we're running a task per cpu */
1207 if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) 1217 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1218 dst_rq->nr_running == 1)
1208 goto assign; 1219 goto assign;
1209 1220
1210 /* 1221 /*
1211 * In the overloaded case, try and keep the load balanced. 1222 * In the overloaded case, try and keep the load balanced.
1212 */ 1223 */
1213balance: 1224balance:
1214 orig_dst_load = env->dst_stats.load;
1215 orig_src_load = env->src_stats.load;
1216
1217 /* XXX missing capacity terms */
1218 load = task_h_load(env->p); 1225 load = task_h_load(env->p);
1219 dst_load = orig_dst_load + load; 1226 dst_load = env->dst_stats.load + load;
1220 src_load = orig_src_load - load; 1227 src_load = env->src_stats.load - load;
1228
1229 if (moveimp > imp && moveimp > env->best_imp) {
1230 /*
1231 * If the improvement from just moving env->p direction is
1232 * better than swapping tasks around, check if a move is
1233 * possible. Store a slightly smaller score than moveimp,
1234 * so an actually idle CPU will win.
1235 */
1236 if (!load_too_imbalanced(src_load, dst_load, env)) {
1237 imp = moveimp - 1;
1238 cur = NULL;
1239 goto assign;
1240 }
1241 }
1242
1243 if (imp <= env->best_imp)
1244 goto unlock;
1221 1245
1222 if (cur) { 1246 if (cur) {
1223 load = task_h_load(cur); 1247 load = task_h_load(cur);
@@ -1225,8 +1249,7 @@ balance:
1225 src_load += load; 1249 src_load += load;
1226 } 1250 }
1227 1251
1228 if (load_too_imbalanced(orig_src_load, orig_dst_load, 1252 if (load_too_imbalanced(src_load, dst_load, env))
1229 src_load, dst_load, env))
1230 goto unlock; 1253 goto unlock;
1231 1254
1232assign: 1255assign:
@@ -1302,9 +1325,8 @@ static int task_numa_migrate(struct task_struct *p)
1302 groupimp = group_weight(p, env.dst_nid) - groupweight; 1325 groupimp = group_weight(p, env.dst_nid) - groupweight;
1303 update_numa_stats(&env.dst_stats, env.dst_nid); 1326 update_numa_stats(&env.dst_stats, env.dst_nid);
1304 1327
1305 /* If the preferred nid has free capacity, try to use it. */ 1328 /* Try to find a spot on the preferred nid. */
1306 if (env.dst_stats.has_free_capacity) 1329 task_numa_find_cpu(&env, taskimp, groupimp);
1307 task_numa_find_cpu(&env, taskimp, groupimp);
1308 1330
1309 /* No space available on the preferred nid. Look elsewhere. */ 1331 /* No space available on the preferred nid. Look elsewhere. */
1310 if (env.best_cpu == -1) { 1332 if (env.best_cpu == -1) {
@@ -1324,10 +1346,6 @@ static int task_numa_migrate(struct task_struct *p)
1324 } 1346 }
1325 } 1347 }
1326 1348
1327 /* No better CPU than the current one was found. */
1328 if (env.best_cpu == -1)
1329 return -EAGAIN;
1330
1331 /* 1349 /*
1332 * If the task is part of a workload that spans multiple NUMA nodes, 1350 * If the task is part of a workload that spans multiple NUMA nodes,
1333 * and is migrating into one of the workload's active nodes, remember 1351 * and is migrating into one of the workload's active nodes, remember
@@ -1336,8 +1354,19 @@ static int task_numa_migrate(struct task_struct *p)
1336 * A task that migrated to a second choice node will be better off 1354 * A task that migrated to a second choice node will be better off
1337 * trying for a better one later. Do not set the preferred node here. 1355 * trying for a better one later. Do not set the preferred node here.
1338 */ 1356 */
1339 if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) 1357 if (p->numa_group) {
1340 sched_setnuma(p, env.dst_nid); 1358 if (env.best_cpu == -1)
1359 nid = env.src_nid;
1360 else
1361 nid = env.dst_nid;
1362
1363 if (node_isset(nid, p->numa_group->active_nodes))
1364 sched_setnuma(p, env.dst_nid);
1365 }
1366
1367 /* No better CPU than the current one was found. */
1368 if (env.best_cpu == -1)
1369 return -EAGAIN;
1341 1370
1342 /* 1371 /*
1343 * Reset the scan period if the task is being rescheduled on an 1372 * Reset the scan period if the task is being rescheduled on an
@@ -1415,12 +1444,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
1415/* 1444/*
1416 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS 1445 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1417 * increments. The more local the fault statistics are, the higher the scan 1446 * increments. The more local the fault statistics are, the higher the scan
1418 * period will be for the next scan window. If local/remote ratio is below 1447 * period will be for the next scan window. If local/(local+remote) ratio is
1419 * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the 1448 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
1420 * scan period will decrease 1449 * the scan period will decrease. Aim for 70% local accesses.
1421 */ 1450 */
1422#define NUMA_PERIOD_SLOTS 10 1451#define NUMA_PERIOD_SLOTS 10
1423#define NUMA_PERIOD_THRESHOLD 3 1452#define NUMA_PERIOD_THRESHOLD 7
1424 1453
1425/* 1454/*
1426 * Increase the scan period (slow down scanning) if the majority of 1455 * Increase the scan period (slow down scanning) if the majority of
@@ -1595,30 +1624,17 @@ static void task_numa_placement(struct task_struct *p)
1595 1624
1596 if (p->numa_group) { 1625 if (p->numa_group) {
1597 update_numa_active_node_mask(p->numa_group); 1626 update_numa_active_node_mask(p->numa_group);
1598 /*
1599 * If the preferred task and group nids are different,
1600 * iterate over the nodes again to find the best place.
1601 */
1602 if (max_nid != max_group_nid) {
1603 unsigned long weight, max_weight = 0;
1604
1605 for_each_online_node(nid) {
1606 weight = task_weight(p, nid) + group_weight(p, nid);
1607 if (weight > max_weight) {
1608 max_weight = weight;
1609 max_nid = nid;
1610 }
1611 }
1612 }
1613
1614 spin_unlock_irq(group_lock); 1627 spin_unlock_irq(group_lock);
1628 max_nid = max_group_nid;
1615 } 1629 }
1616 1630
1617 /* Preferred node as the node with the most faults */ 1631 if (max_faults) {
1618 if (max_faults && max_nid != p->numa_preferred_nid) { 1632 /* Set the new preferred node */
1619 /* Update the preferred nid and migrate task if possible */ 1633 if (max_nid != p->numa_preferred_nid)
1620 sched_setnuma(p, max_nid); 1634 sched_setnuma(p, max_nid);
1621 numa_migrate_preferred(p); 1635
1636 if (task_node(p) != p->numa_preferred_nid)
1637 numa_migrate_preferred(p);
1622 } 1638 }
1623} 1639}
1624 1640
@@ -2899,7 +2915,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2899 ideal_runtime = sched_slice(cfs_rq, curr); 2915 ideal_runtime = sched_slice(cfs_rq, curr);
2900 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; 2916 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
2901 if (delta_exec > ideal_runtime) { 2917 if (delta_exec > ideal_runtime) {
2902 resched_task(rq_of(cfs_rq)->curr); 2918 resched_curr(rq_of(cfs_rq));
2903 /* 2919 /*
2904 * The current task ran long enough, ensure it doesn't get 2920 * The current task ran long enough, ensure it doesn't get
2905 * re-elected due to buddy favours. 2921 * re-elected due to buddy favours.
@@ -2923,7 +2939,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2923 return; 2939 return;
2924 2940
2925 if (delta > ideal_runtime) 2941 if (delta > ideal_runtime)
2926 resched_task(rq_of(cfs_rq)->curr); 2942 resched_curr(rq_of(cfs_rq));
2927} 2943}
2928 2944
2929static void 2945static void
@@ -3063,7 +3079,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
3063 * validating it and just reschedule. 3079 * validating it and just reschedule.
3064 */ 3080 */
3065 if (queued) { 3081 if (queued) {
3066 resched_task(rq_of(cfs_rq)->curr); 3082 resched_curr(rq_of(cfs_rq));
3067 return; 3083 return;
3068 } 3084 }
3069 /* 3085 /*
@@ -3254,7 +3270,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3254 * hierarchy can be throttled 3270 * hierarchy can be throttled
3255 */ 3271 */
3256 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) 3272 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
3257 resched_task(rq_of(cfs_rq)->curr); 3273 resched_curr(rq_of(cfs_rq));
3258} 3274}
3259 3275
3260static __always_inline 3276static __always_inline
@@ -3360,7 +3376,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3360 cfs_rq->throttled = 1; 3376 cfs_rq->throttled = 1;
3361 cfs_rq->throttled_clock = rq_clock(rq); 3377 cfs_rq->throttled_clock = rq_clock(rq);
3362 raw_spin_lock(&cfs_b->lock); 3378 raw_spin_lock(&cfs_b->lock);
3363 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 3379 /*
3380 * Add to the _head_ of the list, so that an already-started
3381 * distribute_cfs_runtime will not see us
3382 */
3383 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3364 if (!cfs_b->timer_active) 3384 if (!cfs_b->timer_active)
3365 __start_cfs_bandwidth(cfs_b, false); 3385 __start_cfs_bandwidth(cfs_b, false);
3366 raw_spin_unlock(&cfs_b->lock); 3386 raw_spin_unlock(&cfs_b->lock);
@@ -3410,14 +3430,15 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3410 3430
3411 /* determine whether we need to wake up potentially idle cpu */ 3431 /* determine whether we need to wake up potentially idle cpu */
3412 if (rq->curr == rq->idle && rq->cfs.nr_running) 3432 if (rq->curr == rq->idle && rq->cfs.nr_running)
3413 resched_task(rq->curr); 3433 resched_curr(rq);
3414} 3434}
3415 3435
3416static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, 3436static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
3417 u64 remaining, u64 expires) 3437 u64 remaining, u64 expires)
3418{ 3438{
3419 struct cfs_rq *cfs_rq; 3439 struct cfs_rq *cfs_rq;
3420 u64 runtime = remaining; 3440 u64 runtime;
3441 u64 starting_runtime = remaining;
3421 3442
3422 rcu_read_lock(); 3443 rcu_read_lock();
3423 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, 3444 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -3448,7 +3469,7 @@ next:
3448 } 3469 }
3449 rcu_read_unlock(); 3470 rcu_read_unlock();
3450 3471
3451 return remaining; 3472 return starting_runtime - remaining;
3452} 3473}
3453 3474
3454/* 3475/*
@@ -3494,22 +3515,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3494 /* account preceding periods in which throttling occurred */ 3515 /* account preceding periods in which throttling occurred */
3495 cfs_b->nr_throttled += overrun; 3516 cfs_b->nr_throttled += overrun;
3496 3517
3497 /*
3498 * There are throttled entities so we must first use the new bandwidth
3499 * to unthrottle them before making it generally available. This
3500 * ensures that all existing debts will be paid before a new cfs_rq is
3501 * allowed to run.
3502 */
3503 runtime = cfs_b->runtime;
3504 runtime_expires = cfs_b->runtime_expires; 3518 runtime_expires = cfs_b->runtime_expires;
3505 cfs_b->runtime = 0;
3506 3519
3507 /* 3520 /*
3508 * This check is repeated as we are holding onto the new bandwidth 3521 * This check is repeated as we are holding onto the new bandwidth while
3509 * while we unthrottle. This can potentially race with an unthrottled 3522 * we unthrottle. This can potentially race with an unthrottled group
3510 * group trying to acquire new bandwidth from the global pool. 3523 * trying to acquire new bandwidth from the global pool. This can result
3524 * in us over-using our runtime if it is all used during this loop, but
3525 * only by limited amounts in that extreme case.
3511 */ 3526 */
3512 while (throttled && runtime > 0) { 3527 while (throttled && cfs_b->runtime > 0) {
3528 runtime = cfs_b->runtime;
3513 raw_spin_unlock(&cfs_b->lock); 3529 raw_spin_unlock(&cfs_b->lock);
3514 /* we can't nest cfs_b->lock while distributing bandwidth */ 3530 /* we can't nest cfs_b->lock while distributing bandwidth */
3515 runtime = distribute_cfs_runtime(cfs_b, runtime, 3531 runtime = distribute_cfs_runtime(cfs_b, runtime,
@@ -3517,10 +3533,10 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3517 raw_spin_lock(&cfs_b->lock); 3533 raw_spin_lock(&cfs_b->lock);
3518 3534
3519 throttled = !list_empty(&cfs_b->throttled_cfs_rq); 3535 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3536
3537 cfs_b->runtime -= min(runtime, cfs_b->runtime);
3520 } 3538 }
3521 3539
3522 /* return (any) remaining runtime */
3523 cfs_b->runtime = runtime;
3524 /* 3540 /*
3525 * While we are ensured activity in the period following an 3541 * While we are ensured activity in the period following an
3526 * unthrottle, this also covers the case in which the new bandwidth is 3542 * unthrottle, this also covers the case in which the new bandwidth is
@@ -3631,10 +3647,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
3631 return; 3647 return;
3632 } 3648 }
3633 3649
3634 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { 3650 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
3635 runtime = cfs_b->runtime; 3651 runtime = cfs_b->runtime;
3636 cfs_b->runtime = 0; 3652
3637 }
3638 expires = cfs_b->runtime_expires; 3653 expires = cfs_b->runtime_expires;
3639 raw_spin_unlock(&cfs_b->lock); 3654 raw_spin_unlock(&cfs_b->lock);
3640 3655
@@ -3645,7 +3660,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
3645 3660
3646 raw_spin_lock(&cfs_b->lock); 3661 raw_spin_lock(&cfs_b->lock);
3647 if (expires == cfs_b->runtime_expires) 3662 if (expires == cfs_b->runtime_expires)
3648 cfs_b->runtime = runtime; 3663 cfs_b->runtime -= min(runtime, cfs_b->runtime);
3649 raw_spin_unlock(&cfs_b->lock); 3664 raw_spin_unlock(&cfs_b->lock);
3650} 3665}
3651 3666
@@ -3775,6 +3790,19 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3775 hrtimer_cancel(&cfs_b->slack_timer); 3790 hrtimer_cancel(&cfs_b->slack_timer);
3776} 3791}
3777 3792
3793static void __maybe_unused update_runtime_enabled(struct rq *rq)
3794{
3795 struct cfs_rq *cfs_rq;
3796
3797 for_each_leaf_cfs_rq(rq, cfs_rq) {
3798 struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
3799
3800 raw_spin_lock(&cfs_b->lock);
3801 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
3802 raw_spin_unlock(&cfs_b->lock);
3803 }
3804}
3805
3778static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) 3806static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
3779{ 3807{
3780 struct cfs_rq *cfs_rq; 3808 struct cfs_rq *cfs_rq;
@@ -3788,6 +3816,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
3788 * there's some valid quota amount 3816 * there's some valid quota amount
3789 */ 3817 */
3790 cfs_rq->runtime_remaining = 1; 3818 cfs_rq->runtime_remaining = 1;
3819 /*
3820 * Offline rq is schedulable till cpu is completely disabled
3821 * in take_cpu_down(), so we prevent new cfs throttling here.
3822 */
3823 cfs_rq->runtime_enabled = 0;
3824
3791 if (cfs_rq_throttled(cfs_rq)) 3825 if (cfs_rq_throttled(cfs_rq))
3792 unthrottle_cfs_rq(cfs_rq); 3826 unthrottle_cfs_rq(cfs_rq);
3793 } 3827 }
@@ -3831,6 +3865,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3831 return NULL; 3865 return NULL;
3832} 3866}
3833static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} 3867static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
3868static inline void update_runtime_enabled(struct rq *rq) {}
3834static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} 3869static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
3835 3870
3836#endif /* CONFIG_CFS_BANDWIDTH */ 3871#endif /* CONFIG_CFS_BANDWIDTH */
@@ -3854,7 +3889,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
3854 3889
3855 if (delta < 0) { 3890 if (delta < 0) {
3856 if (rq->curr == p) 3891 if (rq->curr == p)
3857 resched_task(p); 3892 resched_curr(rq);
3858 return; 3893 return;
3859 } 3894 }
3860 3895
@@ -4723,7 +4758,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
4723 return; 4758 return;
4724 4759
4725preempt: 4760preempt:
4726 resched_task(curr); 4761 resched_curr(rq);
4727 /* 4762 /*
4728 * Only set the backward buddy when the current task is still 4763 * Only set the backward buddy when the current task is still
4729 * on the rq. This can happen when a wakeup gets interleaved 4764 * on the rq. This can happen when a wakeup gets interleaved
@@ -5094,8 +5129,7 @@ static void move_task(struct task_struct *p, struct lb_env *env)
5094/* 5129/*
5095 * Is this task likely cache-hot: 5130 * Is this task likely cache-hot:
5096 */ 5131 */
5097static int 5132static int task_hot(struct task_struct *p, struct lb_env *env)
5098task_hot(struct task_struct *p, u64 now)
5099{ 5133{
5100 s64 delta; 5134 s64 delta;
5101 5135
@@ -5108,7 +5142,7 @@ task_hot(struct task_struct *p, u64 now)
5108 /* 5142 /*
5109 * Buddy candidates are cache hot: 5143 * Buddy candidates are cache hot:
5110 */ 5144 */
5111 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && 5145 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
5112 (&p->se == cfs_rq_of(&p->se)->next || 5146 (&p->se == cfs_rq_of(&p->se)->next ||
5113 &p->se == cfs_rq_of(&p->se)->last)) 5147 &p->se == cfs_rq_of(&p->se)->last))
5114 return 1; 5148 return 1;
@@ -5118,7 +5152,7 @@ task_hot(struct task_struct *p, u64 now)
5118 if (sysctl_sched_migration_cost == 0) 5152 if (sysctl_sched_migration_cost == 0)
5119 return 0; 5153 return 0;
5120 5154
5121 delta = now - p->se.exec_start; 5155 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
5122 5156
5123 return delta < (s64)sysctl_sched_migration_cost; 5157 return delta < (s64)sysctl_sched_migration_cost;
5124} 5158}
@@ -5272,7 +5306,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
5272 * 2) task is cache cold, or 5306 * 2) task is cache cold, or
5273 * 3) too many balance attempts have failed. 5307 * 3) too many balance attempts have failed.
5274 */ 5308 */
5275 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq)); 5309 tsk_cache_hot = task_hot(p, env);
5276 if (!tsk_cache_hot) 5310 if (!tsk_cache_hot)
5277 tsk_cache_hot = migrate_degrades_locality(p, env); 5311 tsk_cache_hot = migrate_degrades_locality(p, env);
5278 5312
@@ -5864,10 +5898,12 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro
5864 * @load_idx: Load index of sched_domain of this_cpu for load calc. 5898 * @load_idx: Load index of sched_domain of this_cpu for load calc.
5865 * @local_group: Does group contain this_cpu. 5899 * @local_group: Does group contain this_cpu.
5866 * @sgs: variable to hold the statistics for this group. 5900 * @sgs: variable to hold the statistics for this group.
5901 * @overload: Indicate more than one runnable task for any CPU.
5867 */ 5902 */
5868static inline void update_sg_lb_stats(struct lb_env *env, 5903static inline void update_sg_lb_stats(struct lb_env *env,
5869 struct sched_group *group, int load_idx, 5904 struct sched_group *group, int load_idx,
5870 int local_group, struct sg_lb_stats *sgs) 5905 int local_group, struct sg_lb_stats *sgs,
5906 bool *overload)
5871{ 5907{
5872 unsigned long load; 5908 unsigned long load;
5873 int i; 5909 int i;
@@ -5885,6 +5921,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5885 5921
5886 sgs->group_load += load; 5922 sgs->group_load += load;
5887 sgs->sum_nr_running += rq->nr_running; 5923 sgs->sum_nr_running += rq->nr_running;
5924
5925 if (rq->nr_running > 1)
5926 *overload = true;
5927
5888#ifdef CONFIG_NUMA_BALANCING 5928#ifdef CONFIG_NUMA_BALANCING
5889 sgs->nr_numa_running += rq->nr_numa_running; 5929 sgs->nr_numa_running += rq->nr_numa_running;
5890 sgs->nr_preferred_running += rq->nr_preferred_running; 5930 sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -5995,6 +6035,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
5995 struct sched_group *sg = env->sd->groups; 6035 struct sched_group *sg = env->sd->groups;
5996 struct sg_lb_stats tmp_sgs; 6036 struct sg_lb_stats tmp_sgs;
5997 int load_idx, prefer_sibling = 0; 6037 int load_idx, prefer_sibling = 0;
6038 bool overload = false;
5998 6039
5999 if (child && child->flags & SD_PREFER_SIBLING) 6040 if (child && child->flags & SD_PREFER_SIBLING)
6000 prefer_sibling = 1; 6041 prefer_sibling = 1;
@@ -6015,7 +6056,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
6015 update_group_capacity(env->sd, env->dst_cpu); 6056 update_group_capacity(env->sd, env->dst_cpu);
6016 } 6057 }
6017 6058
6018 update_sg_lb_stats(env, sg, load_idx, local_group, sgs); 6059 update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
6060 &overload);
6019 6061
6020 if (local_group) 6062 if (local_group)
6021 goto next_group; 6063 goto next_group;
@@ -6049,6 +6091,13 @@ next_group:
6049 6091
6050 if (env->sd->flags & SD_NUMA) 6092 if (env->sd->flags & SD_NUMA)
6051 env->fbq_type = fbq_classify_group(&sds->busiest_stat); 6093 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
6094
6095 if (!env->sd->parent) {
6096 /* update overload indicator if we are at root domain */
6097 if (env->dst_rq->rd->overload != overload)
6098 env->dst_rq->rd->overload = overload;
6099 }
6100
6052} 6101}
6053 6102
6054/** 6103/**
@@ -6767,7 +6816,8 @@ static int idle_balance(struct rq *this_rq)
6767 */ 6816 */
6768 this_rq->idle_stamp = rq_clock(this_rq); 6817 this_rq->idle_stamp = rq_clock(this_rq);
6769 6818
6770 if (this_rq->avg_idle < sysctl_sched_migration_cost) { 6819 if (this_rq->avg_idle < sysctl_sched_migration_cost ||
6820 !this_rq->rd->overload) {
6771 rcu_read_lock(); 6821 rcu_read_lock();
6772 sd = rcu_dereference_check_sched_domain(this_rq->sd); 6822 sd = rcu_dereference_check_sched_domain(this_rq->sd);
6773 if (sd) 6823 if (sd)
@@ -7325,6 +7375,8 @@ void trigger_load_balance(struct rq *rq)
7325static void rq_online_fair(struct rq *rq) 7375static void rq_online_fair(struct rq *rq)
7326{ 7376{
7327 update_sysctl(); 7377 update_sysctl();
7378
7379 update_runtime_enabled(rq);
7328} 7380}
7329 7381
7330static void rq_offline_fair(struct rq *rq) 7382static void rq_offline_fair(struct rq *rq)
@@ -7398,7 +7450,7 @@ static void task_fork_fair(struct task_struct *p)
7398 * 'current' within the tree based on its new key value. 7450 * 'current' within the tree based on its new key value.
7399 */ 7451 */
7400 swap(curr->vruntime, se->vruntime); 7452 swap(curr->vruntime, se->vruntime);
7401 resched_task(rq->curr); 7453 resched_curr(rq);
7402 } 7454 }
7403 7455
7404 se->vruntime -= cfs_rq->min_vruntime; 7456 se->vruntime -= cfs_rq->min_vruntime;
@@ -7423,7 +7475,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
7423 */ 7475 */
7424 if (rq->curr == p) { 7476 if (rq->curr == p) {
7425 if (p->prio > oldprio) 7477 if (p->prio > oldprio)
7426 resched_task(rq->curr); 7478 resched_curr(rq);
7427 } else 7479 } else
7428 check_preempt_curr(rq, p, 0); 7480 check_preempt_curr(rq, p, 0);
7429} 7481}
@@ -7486,7 +7538,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
7486 * if we can still preempt the current task. 7538 * if we can still preempt the current task.
7487 */ 7539 */
7488 if (rq->curr == p) 7540 if (rq->curr == p)
7489 resched_task(rq->curr); 7541 resched_curr(rq);
7490 else 7542 else
7491 check_preempt_curr(rq, p, 0); 7543 check_preempt_curr(rq, p, 0);
7492} 7544}
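
The overload tracking added to kernel/sched/fair.c above exists so that idle_balance() can bail out early when no runqueue in the root domain has more than one runnable task. A minimal user-space sketch of that gating decision, with hypothetical struct and constant names standing in for the kernel's rq, root_domain and sysctl_sched_migration_cost (this is an illustration, not the kernel implementation):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the kernel's root_domain and rq. */
struct fake_root_domain {
	bool overload;			/* some CPU has more than one runnable task */
};

struct fake_rq {
	unsigned long avg_idle;
	struct fake_root_domain *rd;
};

#define FAKE_MIGRATION_COST 500000UL	/* stand-in for sysctl_sched_migration_cost */

/* Should a newly idle CPU even attempt to pull work from elsewhere? */
static bool worth_idle_balancing(const struct fake_rq *rq)
{
	/* Skip the expensive domain walk if this CPU idles only briefly,
	 * or if no CPU anywhere in the root domain is overloaded: in that
	 * case there is nothing that could be pulled anyway. */
	if (rq->avg_idle < FAKE_MIGRATION_COST || !rq->rd->overload)
		return false;
	return true;
}

int main(void)
{
	struct fake_root_domain rd = { .overload = false };
	struct fake_rq rq = { .avg_idle = 1000000UL, .rd = &rd };

	printf("balance while idle? %d\n", worth_idle_balancing(&rq));	/* 0 */
	rd.overload = true;
	printf("balance while idle? %d\n", worth_idle_balancing(&rq));	/* 1 */
	return 0;
}

The flag is only consulted at the root domain level, so a partially loaded system pays one boolean test per idle entry instead of a full load-balance pass.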
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index cf009fb0bc25..9f1608f99819 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -79,7 +79,7 @@ static void cpuidle_idle_call(void)
79 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); 79 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
80 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); 80 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
81 int next_state, entered_state; 81 int next_state, entered_state;
82 bool broadcast; 82 unsigned int broadcast;
83 83
84 /* 84 /*
85 * Check if the idle task must be rescheduled. If it is the 85 * Check if the idle task must be rescheduled. If it is the
@@ -135,7 +135,7 @@ use_default:
135 goto exit_idle; 135 goto exit_idle;
136 } 136 }
137 137
138 broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); 138 broadcast = drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP;
139 139
140 /* 140 /*
141 * Tell the time framework to switch to a broadcast timer 141 * Tell the time framework to switch to a broadcast timer
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 879f2b75266a..67ad4e7f506a 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -20,7 +20,7 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
20 */ 20 */
21static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) 21static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
22{ 22{
23 resched_task(rq->idle); 23 resched_curr(rq);
24} 24}
25 25
26static struct task_struct * 26static struct task_struct *
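
The many resched_task(rq->curr) to resched_curr(rq) conversions in this series all follow the same pattern: call sites stop naming a task and simply ask the runqueue to reschedule whatever is current on it. A rough user-space sketch of the shape of that interface change (hypothetical helper and struct names, not the kernel code):

#include <stdio.h>

struct toy_task { const char *comm; int need_resched; };
struct toy_rq { struct toy_task *curr; };

/* Old-style interface: the caller passes a task, almost always rq->curr. */
static void resched_task_like(struct toy_task *p)
{
	p->need_resched = 1;
}

/* New-style interface: operate on the runqueue's current task directly,
 * so a caller cannot accidentally pass a task that is not running here. */
static void resched_curr_like(struct toy_rq *rq)
{
	rq->curr->need_resched = 1;
}

int main(void)
{
	struct toy_task t = { "worker", 0 };
	struct toy_rq rq = { .curr = &t };

	resched_task_like(rq.curr);	/* old call site */
	t.need_resched = 0;
	resched_curr_like(&rq);		/* equivalent new call site */
	printf("%s need_resched=%d\n", t.comm, t.need_resched);
	return 0;
}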
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a49083192c64..5f6edca4fafd 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -463,9 +463,10 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
463static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 463static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
464{ 464{
465 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 465 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
466 struct rq *rq = rq_of_rt_rq(rt_rq);
466 struct sched_rt_entity *rt_se; 467 struct sched_rt_entity *rt_se;
467 468
468 int cpu = cpu_of(rq_of_rt_rq(rt_rq)); 469 int cpu = cpu_of(rq);
469 470
470 rt_se = rt_rq->tg->rt_se[cpu]; 471 rt_se = rt_rq->tg->rt_se[cpu];
471 472
@@ -476,7 +477,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
476 enqueue_rt_entity(rt_se, false); 477 enqueue_rt_entity(rt_se, false);
477 478
478 if (rt_rq->highest_prio.curr < curr->prio) 479 if (rt_rq->highest_prio.curr < curr->prio)
479 resched_task(curr); 480 resched_curr(rq);
480 } 481 }
481} 482}
482 483
@@ -566,7 +567,7 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
566 return; 567 return;
567 568
568 enqueue_top_rt_rq(rt_rq); 569 enqueue_top_rt_rq(rt_rq);
569 resched_task(rq->curr); 570 resched_curr(rq);
570} 571}
571 572
572static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 573static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
@@ -740,6 +741,9 @@ balanced:
740 rt_rq->rt_throttled = 0; 741 rt_rq->rt_throttled = 0;
741 raw_spin_unlock(&rt_rq->rt_runtime_lock); 742 raw_spin_unlock(&rt_rq->rt_runtime_lock);
742 raw_spin_unlock(&rt_b->rt_runtime_lock); 743 raw_spin_unlock(&rt_b->rt_runtime_lock);
744
745 /* Make rt_rq available for pick_next_task() */
746 sched_rt_rq_enqueue(rt_rq);
743 } 747 }
744} 748}
745 749
@@ -948,7 +952,7 @@ static void update_curr_rt(struct rq *rq)
948 raw_spin_lock(&rt_rq->rt_runtime_lock); 952 raw_spin_lock(&rt_rq->rt_runtime_lock);
949 rt_rq->rt_time += delta_exec; 953 rt_rq->rt_time += delta_exec;
950 if (sched_rt_runtime_exceeded(rt_rq)) 954 if (sched_rt_runtime_exceeded(rt_rq))
951 resched_task(curr); 955 resched_curr(rq);
952 raw_spin_unlock(&rt_rq->rt_runtime_lock); 956 raw_spin_unlock(&rt_rq->rt_runtime_lock);
953 } 957 }
954 } 958 }
@@ -1363,7 +1367,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1363 * to try and push current away: 1367 * to try and push current away:
1364 */ 1368 */
1365 requeue_task_rt(rq, p, 1); 1369 requeue_task_rt(rq, p, 1);
1366 resched_task(rq->curr); 1370 resched_curr(rq);
1367} 1371}
1368 1372
1369#endif /* CONFIG_SMP */ 1373#endif /* CONFIG_SMP */
@@ -1374,7 +1378,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1374static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) 1378static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1375{ 1379{
1376 if (p->prio < rq->curr->prio) { 1380 if (p->prio < rq->curr->prio) {
1377 resched_task(rq->curr); 1381 resched_curr(rq);
1378 return; 1382 return;
1379 } 1383 }
1380 1384
@@ -1690,7 +1694,7 @@ retry:
1690 * just reschedule current. 1694 * just reschedule current.
1691 */ 1695 */
1692 if (unlikely(next_task->prio < rq->curr->prio)) { 1696 if (unlikely(next_task->prio < rq->curr->prio)) {
1693 resched_task(rq->curr); 1697 resched_curr(rq);
1694 return 0; 1698 return 0;
1695 } 1699 }
1696 1700
@@ -1737,7 +1741,7 @@ retry:
1737 activate_task(lowest_rq, next_task, 0); 1741 activate_task(lowest_rq, next_task, 0);
1738 ret = 1; 1742 ret = 1;
1739 1743
1740 resched_task(lowest_rq->curr); 1744 resched_curr(lowest_rq);
1741 1745
1742 double_unlock_balance(rq, lowest_rq); 1746 double_unlock_balance(rq, lowest_rq);
1743 1747
@@ -1936,7 +1940,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1936 return; 1940 return;
1937 1941
1938 if (pull_rt_task(rq)) 1942 if (pull_rt_task(rq))
1939 resched_task(rq->curr); 1943 resched_curr(rq);
1940} 1944}
1941 1945
1942void __init init_sched_rt_class(void) 1946void __init init_sched_rt_class(void)
@@ -1974,7 +1978,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1974 check_resched = 0; 1978 check_resched = 0;
1975#endif /* CONFIG_SMP */ 1979#endif /* CONFIG_SMP */
1976 if (check_resched && p->prio < rq->curr->prio) 1980 if (check_resched && p->prio < rq->curr->prio)
1977 resched_task(rq->curr); 1981 resched_curr(rq);
1978 } 1982 }
1979} 1983}
1980 1984
@@ -2003,11 +2007,11 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2003 * Only reschedule if p is still on the same runqueue. 2007 * Only reschedule if p is still on the same runqueue.
2004 */ 2008 */
2005 if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) 2009 if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
2006 resched_task(p); 2010 resched_curr(rq);
2007#else 2011#else
2008 /* For UP simply resched on drop of prio */ 2012 /* For UP simply resched on drop of prio */
2009 if (oldprio < p->prio) 2013 if (oldprio < p->prio)
2010 resched_task(p); 2014 resched_curr(rq);
2011#endif /* CONFIG_SMP */ 2015#endif /* CONFIG_SMP */
2012 } else { 2016 } else {
2013 /* 2017 /*
@@ -2016,7 +2020,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2016 * then reschedule. 2020 * then reschedule.
2017 */ 2021 */
2018 if (p->prio < rq->curr->prio) 2022 if (p->prio < rq->curr->prio)
2019 resched_task(rq->curr); 2023 resched_curr(rq);
2020 } 2024 }
2021} 2025}
2022 2026
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 31cc02ebc54e..579712f4e9d5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -477,6 +477,9 @@ struct root_domain {
477 cpumask_var_t span; 477 cpumask_var_t span;
478 cpumask_var_t online; 478 cpumask_var_t online;
479 479
480 /* Indicate more than one runnable task for any CPU */
481 bool overload;
482
480 /* 483 /*
481 * The bit corresponding to a CPU gets set here if such CPU has more 484 * The bit corresponding to a CPU gets set here if such CPU has more
482 * than one runnable -deadline task (as it is below for RT tasks). 485 * than one runnable -deadline task (as it is below for RT tasks).
@@ -884,20 +887,10 @@ enum {
884#undef SCHED_FEAT 887#undef SCHED_FEAT
885 888
886#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) 889#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
887static __always_inline bool static_branch__true(struct static_key *key)
888{
889 return static_key_true(key); /* Not out of line branch. */
890}
891
892static __always_inline bool static_branch__false(struct static_key *key)
893{
894 return static_key_false(key); /* Out of line branch. */
895}
896
897#define SCHED_FEAT(name, enabled) \ 890#define SCHED_FEAT(name, enabled) \
898static __always_inline bool static_branch_##name(struct static_key *key) \ 891static __always_inline bool static_branch_##name(struct static_key *key) \
899{ \ 892{ \
900 return static_branch__##enabled(key); \ 893 return static_key_##enabled(key); \
901} 894}
902 895
903#include "features.h" 896#include "features.h"
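
The sched.h hunk above drops the static_branch__true()/static_branch__false() wrappers and instead has SCHED_FEAT() paste its "enabled" argument straight into static_key_true()/static_key_false(). A self-contained sketch of the same token-pasting trick, using dummy predicates in place of the real static-key API (the two feature names are taken from features.h; everything else is illustrative):

#include <stdbool.h>
#include <stdio.h>

struct static_key { bool value; };

/* Dummy stand-ins for the kernel's static_key_true()/static_key_false(). */
static inline bool static_key_true(struct static_key *key)  { return key->value; }
static inline bool static_key_false(struct static_key *key) { return key->value; }

/* One macro generates one inline wrapper per feature; the second argument
 * ("true" or "false") is pasted directly into the helper name, so no extra
 * level of function indirection is needed. */
#define SCHED_FEAT(name, enabled)					\
static inline bool static_branch_##name(struct static_key *key)	\
{									\
	return static_key_##enabled(key);				\
}

SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
SCHED_FEAT(HRTICK, false)

#undef SCHED_FEAT

int main(void)
{
	struct static_key on = { true }, off = { false };

	printf("%d %d\n", static_branch_GENTLE_FAIR_SLEEPERS(&on),
	       static_branch_HRTICK(&off));
	return 0;
}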
@@ -1196,7 +1189,7 @@ extern void init_sched_rt_class(void);
1196extern void init_sched_fair_class(void); 1189extern void init_sched_fair_class(void);
1197extern void init_sched_dl_class(void); 1190extern void init_sched_dl_class(void);
1198 1191
1199extern void resched_task(struct task_struct *p); 1192extern void resched_curr(struct rq *rq);
1200extern void resched_cpu(int cpu); 1193extern void resched_cpu(int cpu);
1201 1194
1202extern struct rt_bandwidth def_rt_bandwidth; 1195extern struct rt_bandwidth def_rt_bandwidth;
@@ -1218,15 +1211,26 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
1218 1211
1219 rq->nr_running = prev_nr + count; 1212 rq->nr_running = prev_nr + count;
1220 1213
1221#ifdef CONFIG_NO_HZ_FULL
1222 if (prev_nr < 2 && rq->nr_running >= 2) { 1214 if (prev_nr < 2 && rq->nr_running >= 2) {
1215#ifdef CONFIG_SMP
1216 if (!rq->rd->overload)
1217 rq->rd->overload = true;
1218#endif
1219
1220#ifdef CONFIG_NO_HZ_FULL
1223 if (tick_nohz_full_cpu(rq->cpu)) { 1221 if (tick_nohz_full_cpu(rq->cpu)) {
1224 /* Order rq->nr_running write against the IPI */ 1222 /*
1225 smp_wmb(); 1223 * Tick is needed if more than one task runs on a CPU.
1226 smp_send_reschedule(rq->cpu); 1224 * Send the target an IPI to kick it out of nohz mode.
1225 *
1226 * We assume that IPI implies full memory barrier and the
1227 * new value of rq->nr_running is visible on reception
1228 * from the target.
1229 */
1230 tick_nohz_full_kick_cpu(rq->cpu);
1227 } 1231 }
1228 }
1229#endif 1232#endif
1233 }
1230} 1234}
1231 1235
1232static inline void sub_nr_running(struct rq *rq, unsigned count) 1236static inline void sub_nr_running(struct rq *rq, unsigned count)
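
The reworked add_nr_running() above flags the root domain as overloaded the moment any runqueue's nr_running crosses from one to two, and only then considers the nohz-full kick. A simplified user-space sketch of that transition check (hypothetical "toy" types; the real code additionally sends the IPI via tick_nohz_full_kick_cpu()):

#include <stdbool.h>
#include <stdio.h>

struct toy_root_domain { bool overload; };
struct toy_rq {
	unsigned int nr_running;
	struct toy_root_domain *rd;
};

static void toy_add_nr_running(struct toy_rq *rq, unsigned int count)
{
	unsigned int prev_nr = rq->nr_running;

	rq->nr_running = prev_nr + count;

	/* Only the 1 -> 2+ transition matters: that is the moment this CPU
	 * stops being trivially balanced, so the root domain is flagged and
	 * idle CPUs know there is work worth pulling. */
	if (prev_nr < 2 && rq->nr_running >= 2) {
		if (!rq->rd->overload)
			rq->rd->overload = true;
		/* The kernel would also kick a nohz-full CPU here so it
		 * restarts its tick and can preempt between the two tasks. */
	}
}

int main(void)
{
	struct toy_root_domain rd = { false };
	struct toy_rq rq = { 0, &rd };

	toy_add_nr_running(&rq, 1);
	printf("overload=%d\n", rd.overload);	/* 0 */
	toy_add_nr_running(&rq, 1);
	printf("overload=%d\n", rd.overload);	/* 1 */
	return 0;
}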
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 0ffa20ae657b..15cab1a4f84e 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -319,14 +319,14 @@ EXPORT_SYMBOL(wake_bit_function);
319 */ 319 */
320int __sched 320int __sched
321__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, 321__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
322 int (*action)(void *), unsigned mode) 322 wait_bit_action_f *action, unsigned mode)
323{ 323{
324 int ret = 0; 324 int ret = 0;
325 325
326 do { 326 do {
327 prepare_to_wait(wq, &q->wait, mode); 327 prepare_to_wait(wq, &q->wait, mode);
328 if (test_bit(q->key.bit_nr, q->key.flags)) 328 if (test_bit(q->key.bit_nr, q->key.flags))
329 ret = (*action)(q->key.flags); 329 ret = (*action)(&q->key);
330 } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); 330 } while (test_bit(q->key.bit_nr, q->key.flags) && !ret);
331 finish_wait(wq, &q->wait); 331 finish_wait(wq, &q->wait);
332 return ret; 332 return ret;
@@ -334,7 +334,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
334EXPORT_SYMBOL(__wait_on_bit); 334EXPORT_SYMBOL(__wait_on_bit);
335 335
336int __sched out_of_line_wait_on_bit(void *word, int bit, 336int __sched out_of_line_wait_on_bit(void *word, int bit,
337 int (*action)(void *), unsigned mode) 337 wait_bit_action_f *action, unsigned mode)
338{ 338{
339 wait_queue_head_t *wq = bit_waitqueue(word, bit); 339 wait_queue_head_t *wq = bit_waitqueue(word, bit);
340 DEFINE_WAIT_BIT(wait, word, bit); 340 DEFINE_WAIT_BIT(wait, word, bit);
@@ -345,7 +345,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit);
345 345
346int __sched 346int __sched
347__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, 347__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
348 int (*action)(void *), unsigned mode) 348 wait_bit_action_f *action, unsigned mode)
349{ 349{
350 do { 350 do {
351 int ret; 351 int ret;
@@ -353,7 +353,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
353 prepare_to_wait_exclusive(wq, &q->wait, mode); 353 prepare_to_wait_exclusive(wq, &q->wait, mode);
354 if (!test_bit(q->key.bit_nr, q->key.flags)) 354 if (!test_bit(q->key.bit_nr, q->key.flags))
355 continue; 355 continue;
356 ret = action(q->key.flags); 356 ret = action(&q->key);
357 if (!ret) 357 if (!ret)
358 continue; 358 continue;
359 abort_exclusive_wait(wq, &q->wait, mode, &q->key); 359 abort_exclusive_wait(wq, &q->wait, mode, &q->key);
@@ -365,7 +365,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
365EXPORT_SYMBOL(__wait_on_bit_lock); 365EXPORT_SYMBOL(__wait_on_bit_lock);
366 366
367int __sched out_of_line_wait_on_bit_lock(void *word, int bit, 367int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
368 int (*action)(void *), unsigned mode) 368 wait_bit_action_f *action, unsigned mode)
369{ 369{
370 wait_queue_head_t *wq = bit_waitqueue(word, bit); 370 wait_queue_head_t *wq = bit_waitqueue(word, bit);
371 DEFINE_WAIT_BIT(wait, word, bit); 371 DEFINE_WAIT_BIT(wait, word, bit);
@@ -502,3 +502,21 @@ void wake_up_atomic_t(atomic_t *p)
502 __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); 502 __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
503} 503}
504EXPORT_SYMBOL(wake_up_atomic_t); 504EXPORT_SYMBOL(wake_up_atomic_t);
505
506__sched int bit_wait(struct wait_bit_key *word)
507{
508 if (signal_pending_state(current->state, current))
509 return 1;
510 schedule();
511 return 0;
512}
513EXPORT_SYMBOL(bit_wait);
514
515__sched int bit_wait_io(struct wait_bit_key *word)
516{
517 if (signal_pending_state(current->state, current))
518 return 1;
519 io_schedule();
520 return 0;
521}
522EXPORT_SYMBOL(bit_wait_io);
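
The wait.c changes switch the action callbacks from int (*)(void *) to wait_bit_action_f, which receives the whole struct wait_bit_key, and add the generic bit_wait()/bit_wait_io() actions so most callers no longer need a private one. A user-space sketch of the same consolidation idea, with deliberately simplified "toy" types rather than the kernel API:

#include <stdio.h>

/* Simplified stand-in for struct wait_bit_key: the action now sees the
 * whole key (word and bit number), not just a bare flags pointer. */
struct toy_wait_bit_key {
	unsigned long *word;
	int bit_nr;
};

typedef int (toy_wait_bit_action_f)(struct toy_wait_bit_key *key);

/* One generic action replaces a per-caller "sleep until woken" function.
 * The kernel version would schedule() here and report a pending signal. */
static int toy_bit_wait(struct toy_wait_bit_key *key)
{
	(void)key;
	return 0;
}

static int toy_wait_on_bit(unsigned long *word, int bit,
			   toy_wait_bit_action_f *action)
{
	struct toy_wait_bit_key key = { word, bit };
	int ret = 0;

	while (!ret && (*word & (1UL << bit)))
		ret = action(&key);	/* callback gets the key, not the word */
	return ret;
}

int main(void)
{
	unsigned long flags = 0;	/* bit already clear, so no waiting */

	printf("ret=%d\n", toy_wait_on_bit(&flags, 3, toy_bit_wait));
	return 0;
}

Because every action now takes the same key structure, extra per-wait state (such as a timeout) can be added to the key once instead of growing every callback's argument list.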
diff --git a/kernel/smp.c b/kernel/smp.c
index 80c33f8de14f..487653b5844f 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -3,6 +3,7 @@
3 * 3 *
4 * (C) Jens Axboe <jens.axboe@oracle.com> 2008 4 * (C) Jens Axboe <jens.axboe@oracle.com> 2008
5 */ 5 */
6#include <linux/irq_work.h>
6#include <linux/rcupdate.h> 7#include <linux/rcupdate.h>
7#include <linux/rculist.h> 8#include <linux/rculist.h>
8#include <linux/kernel.h> 9#include <linux/kernel.h>
@@ -251,6 +252,14 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
251 csd->func(csd->info); 252 csd->func(csd->info);
252 csd_unlock(csd); 253 csd_unlock(csd);
253 } 254 }
255
256 /*
257 * Handle irq works queued remotely by irq_work_queue_on().
258 * Smp functions above are typically synchronous so they
259 * better run first since some other CPUs may be busy waiting
260 * for them.
261 */
262 irq_work_run();
254} 263}
255 264
256/* 265/*
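
The smp.c hunk runs remotely queued irq works from the same IPI handler that flushes the smp-call-function queue, but only after the synchronous csd callbacks, since other CPUs may be spinning on those. A toy user-space sketch of that "synchronous work first, deferred work second" ordering (hypothetical names, no real IPIs involved):

#include <stdio.h>

typedef void (*toy_callback_t)(void *info);

struct toy_csd { toy_callback_t func; void *info; int done; };

/* Synchronous cross-CPU calls: another CPU may be busy-waiting on 'done'. */
static void toy_flush_call_queue(struct toy_csd *queue, int n)
{
	for (int i = 0; i < n; i++) {
		queue[i].func(queue[i].info);
		queue[i].done = 1;	/* release the waiter before anything else */
	}
}

/* Deferred irq-work style callbacks: nobody is spinning on these. */
static void toy_run_irq_work(toy_callback_t *work, void **info, int n)
{
	for (int i = 0; i < n; i++)
		work[i](info[i]);
}

static void say(void *info) { printf("%s\n", (const char *)info); }

int main(void)
{
	struct toy_csd csds[] = { { say, "synchronous csd callback", 0 } };
	toy_callback_t works[] = { say };
	void *infos[] = { "deferred irq work" };

	/* Mirror the ordering in flush_smp_call_function_queue(): finish the
	 * synchronous callbacks before touching the deferred work list. */
	toy_flush_call_queue(csds, 1);
	toy_run_irq_work(works, infos, 1);
	return 0;
}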
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f784d83e29f1..99aa6ee3908f 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -225,13 +225,15 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
225}; 225};
226 226
227/* 227/*
228 * Kick the current CPU if it's full dynticks in order to force it to 228 * Kick the CPU if it's full dynticks in order to force it to
229 * re-evaluate its dependency on the tick and restart it if necessary. 229 * re-evaluate its dependency on the tick and restart it if necessary.
230 */ 230 */
231void tick_nohz_full_kick(void) 231void tick_nohz_full_kick_cpu(int cpu)
232{ 232{
233 if (tick_nohz_full_cpu(smp_processor_id())) 233 if (!tick_nohz_full_cpu(cpu))
234 irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); 234 return;
235
236 irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
235} 237}
236 238
237static void nohz_full_kick_ipi(void *info) 239static void nohz_full_kick_ipi(void *info)