| author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-08-04 19:23:30 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-08-04 19:23:30 -0400 |
| commit | 98959948a7ba33cf8c708626e0d2a1456397e1c6 (patch) | |
| tree | 8ba9b6c2679a06e89f23bdd7018e9bb0249e3bda /kernel | |
| parent | ef35ad26f8ff44d2c93e29952cdb336bda729d9d (diff) | |
| parent | cd3bd4e628a6d57d66afe77835fe8d93ae3e41f8 (diff) | |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
- Move the nohz kick code out of the scheduler tick to a dedicated IPI,
from Frederic Weisbecker.
This necessitated quite some background infrastructure rework,
including:
* Clean up some irq-work internals
* Implement remote irq-work
* Implement nohz kick on top of remote irq-work
* Move full dynticks timer enqueue notification to new kick
* Move multi-task notification to new kick
* Remove unnecessary barriers on multi-task notification
- Remove proliferation of wait_on_bit() action functions and allow
wait_on_bit_action() functions to support a timeout. (Neil Brown)
- Another round of sched/numa improvements, cleanups and fixes. (Rik
van Riel)
- Implement fast idling of CPUs when the system is partially loaded,
for better scalability. (Tim Chen)
- Restructure and fix the CPU hotplug handling code that may leave
cfs_rq and rt_rq's throttled when tasks are migrated away from a dead
cpu. (Kirill Tkhai)
- Robustify the sched topology setup code. (Peter Zijlstra)
- Improve sched_feat() handling wrt. static_keys (Jason Baron)
- Misc fixes.
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (37 commits)
sched/fair: Fix 'make xmldocs' warning caused by missing description
sched: Use macro for magic number of -1 for setparam
sched: Robustify topology setup
sched: Fix sched_setparam() policy == -1 logic
sched: Allow wait_on_bit_action() functions to support a timeout
sched: Remove proliferation of wait_on_bit() action functions
sched/numa: Revert "Use effective_load() to balance NUMA loads"
sched: Fix static_key race with sched_feat()
sched: Remove extra static_key*() function indirection
sched/rt: Fix replenish_dl_entity() comments to match the current upstream code
sched: Transform resched_task() into resched_curr()
sched/deadline: Kill task_struct->pi_top_task
sched: Rework check_for_tasks()
sched/rt: Enqueue just unthrottled rt_rq back on the stack in __disable_runtime()
sched/fair: Disable runtime_enabled on dying rq
sched/numa: Change scan period code to match intent
sched/numa: Rework best node setting in task_numa_migrate()
sched/numa: Examine a task move when examining a task swap
sched/numa: Simplify task_numa_compare()
sched/numa: Use effective_load() to balance NUMA loads
...
Diffstat (limited to 'kernel')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | kernel/cpu.c | 33 |
| -rw-r--r-- | kernel/fork.c | 1 |
| -rw-r--r-- | kernel/irq_work.c | 110 |
| -rw-r--r-- | kernel/ptrace.c | 8 |
| -rw-r--r-- | kernel/sched/core.c | 119 |
| -rw-r--r-- | kernel/sched/deadline.c | 18 |
| -rw-r--r-- | kernel/sched/fair.c | 244 |
| -rw-r--r-- | kernel/sched/idle.c | 4 |
| -rw-r--r-- | kernel/sched/idle_task.c | 2 |
| -rw-r--r-- | kernel/sched/rt.c | 30 |
| -rw-r--r-- | kernel/sched/sched.h | 38 |
| -rw-r--r-- | kernel/sched/wait.c | 30 |
| -rw-r--r-- | kernel/smp.c | 9 |
| -rw-r--r-- | kernel/time/tick-sched.c | 10 |
14 files changed, 382 insertions, 274 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a343bde710b1..81e2a388a0f6 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -274,21 +274,28 @@ void clear_tasks_mm_cpumask(int cpu) | |||
274 | rcu_read_unlock(); | 274 | rcu_read_unlock(); |
275 | } | 275 | } |
276 | 276 | ||
277 | static inline void check_for_tasks(int cpu) | 277 | static inline void check_for_tasks(int dead_cpu) |
278 | { | 278 | { |
279 | struct task_struct *p; | 279 | struct task_struct *g, *p; |
280 | cputime_t utime, stime; | ||
281 | 280 | ||
282 | write_lock_irq(&tasklist_lock); | 281 | read_lock_irq(&tasklist_lock); |
283 | for_each_process(p) { | 282 | do_each_thread(g, p) { |
284 | task_cputime(p, &utime, &stime); | 283 | if (!p->on_rq) |
285 | if (task_cpu(p) == cpu && p->state == TASK_RUNNING && | 284 | continue; |
286 | (utime || stime)) | 285 | /* |
287 | pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n", | 286 | * We do the check with unlocked task_rq(p)->lock. |
288 | p->comm, task_pid_nr(p), cpu, | 287 | * Order the reading to do not warn about a task, |
289 | p->state, p->flags); | 288 | * which was running on this cpu in the past, and |
290 | } | 289 | * it's just been woken on another cpu. |
291 | write_unlock_irq(&tasklist_lock); | 290 | */ |
291 | rmb(); | ||
292 | if (task_cpu(p) != dead_cpu) | ||
293 | continue; | ||
294 | |||
295 | pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n", | ||
296 | p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags); | ||
297 | } while_each_thread(g, p); | ||
298 | read_unlock_irq(&tasklist_lock); | ||
292 | } | 299 | } |
293 | 300 | ||
294 | struct take_cpu_down_param { | 301 | struct take_cpu_down_param { |
diff --git a/kernel/fork.c b/kernel/fork.c
index 6a13c46cd87d..962885edbe53 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1095,7 +1095,6 @@ static void rt_mutex_init_task(struct task_struct *p) | |||
1095 | p->pi_waiters = RB_ROOT; | 1095 | p->pi_waiters = RB_ROOT; |
1096 | p->pi_waiters_leftmost = NULL; | 1096 | p->pi_waiters_leftmost = NULL; |
1097 | p->pi_blocked_on = NULL; | 1097 | p->pi_blocked_on = NULL; |
1098 | p->pi_top_task = NULL; | ||
1099 | #endif | 1098 | #endif |
1100 | } | 1099 | } |
1101 | 1100 | ||
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index a82170e2fa78..e6bcbe756663 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -16,11 +16,12 @@ | |||
16 | #include <linux/tick.h> | 16 | #include <linux/tick.h> |
17 | #include <linux/cpu.h> | 17 | #include <linux/cpu.h> |
18 | #include <linux/notifier.h> | 18 | #include <linux/notifier.h> |
19 | #include <linux/smp.h> | ||
19 | #include <asm/processor.h> | 20 | #include <asm/processor.h> |
20 | 21 | ||
21 | 22 | ||
22 | static DEFINE_PER_CPU(struct llist_head, irq_work_list); | 23 | static DEFINE_PER_CPU(struct llist_head, raised_list); |
23 | static DEFINE_PER_CPU(int, irq_work_raised); | 24 | static DEFINE_PER_CPU(struct llist_head, lazy_list); |
24 | 25 | ||
25 | /* | 26 | /* |
26 | * Claim the entry so that no one else will poke at it. | 27 | * Claim the entry so that no one else will poke at it. |
@@ -55,12 +56,34 @@ void __weak arch_irq_work_raise(void) | |||
55 | */ | 56 | */ |
56 | } | 57 | } |
57 | 58 | ||
59 | #ifdef CONFIG_SMP | ||
58 | /* | 60 | /* |
59 | * Enqueue the irq_work @entry unless it's already pending | 61 | * Enqueue the irq_work @work on @cpu unless it's already pending |
60 | * somewhere. | 62 | * somewhere. |
61 | * | 63 | * |
62 | * Can be re-enqueued while the callback is still in progress. | 64 | * Can be re-enqueued while the callback is still in progress. |
63 | */ | 65 | */ |
66 | bool irq_work_queue_on(struct irq_work *work, int cpu) | ||
67 | { | ||
68 | /* All work should have been flushed before going offline */ | ||
69 | WARN_ON_ONCE(cpu_is_offline(cpu)); | ||
70 | |||
71 | /* Arch remote IPI send/receive backend aren't NMI safe */ | ||
72 | WARN_ON_ONCE(in_nmi()); | ||
73 | |||
74 | /* Only queue if not already pending */ | ||
75 | if (!irq_work_claim(work)) | ||
76 | return false; | ||
77 | |||
78 | if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) | ||
79 | arch_send_call_function_single_ipi(cpu); | ||
80 | |||
81 | return true; | ||
82 | } | ||
83 | EXPORT_SYMBOL_GPL(irq_work_queue_on); | ||
84 | #endif | ||
85 | |||
86 | /* Enqueue the irq work @work on the current CPU */ | ||
64 | bool irq_work_queue(struct irq_work *work) | 87 | bool irq_work_queue(struct irq_work *work) |
65 | { | 88 | { |
66 | /* Only queue if not already pending */ | 89 | /* Only queue if not already pending */ |
@@ -70,15 +93,13 @@ bool irq_work_queue(struct irq_work *work) | |||
70 | /* Queue the entry and raise the IPI if needed. */ | 93 | /* Queue the entry and raise the IPI if needed. */ |
71 | preempt_disable(); | 94 | preempt_disable(); |
72 | 95 | ||
73 | llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); | 96 | /* If the work is "lazy", handle it from next tick if any */ |
74 | 97 | if (work->flags & IRQ_WORK_LAZY) { | |
75 | /* | 98 | if (llist_add(&work->llnode, &__get_cpu_var(lazy_list)) && |
76 | * If the work is not "lazy" or the tick is stopped, raise the irq | 99 | tick_nohz_tick_stopped()) |
77 | * work interrupt (if supported by the arch), otherwise, just wait | 100 | arch_irq_work_raise(); |
78 | * for the next tick. | 101 | } else { |
79 | */ | 102 | if (llist_add(&work->llnode, &__get_cpu_var(raised_list))) |
80 | if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) { | ||
81 | if (!this_cpu_cmpxchg(irq_work_raised, 0, 1)) | ||
82 | arch_irq_work_raise(); | 103 | arch_irq_work_raise(); |
83 | } | 104 | } |
84 | 105 | ||
@@ -90,10 +111,11 @@ EXPORT_SYMBOL_GPL(irq_work_queue); | |||
90 | 111 | ||
91 | bool irq_work_needs_cpu(void) | 112 | bool irq_work_needs_cpu(void) |
92 | { | 113 | { |
93 | struct llist_head *this_list; | 114 | struct llist_head *raised, *lazy; |
94 | 115 | ||
95 | this_list = &__get_cpu_var(irq_work_list); | 116 | raised = &__get_cpu_var(raised_list); |
96 | if (llist_empty(this_list)) | 117 | lazy = &__get_cpu_var(lazy_list); |
118 | if (llist_empty(raised) && llist_empty(lazy)) | ||
97 | return false; | 119 | return false; |
98 | 120 | ||
99 | /* All work should have been flushed before going offline */ | 121 | /* All work should have been flushed before going offline */ |
@@ -102,28 +124,18 @@ bool irq_work_needs_cpu(void) | |||
102 | return true; | 124 | return true; |
103 | } | 125 | } |
104 | 126 | ||
105 | static void __irq_work_run(void) | 127 | static void irq_work_run_list(struct llist_head *list) |
106 | { | 128 | { |
107 | unsigned long flags; | 129 | unsigned long flags; |
108 | struct irq_work *work; | 130 | struct irq_work *work; |
109 | struct llist_head *this_list; | ||
110 | struct llist_node *llnode; | 131 | struct llist_node *llnode; |
111 | 132 | ||
133 | BUG_ON(!irqs_disabled()); | ||
112 | 134 | ||
113 | /* | 135 | if (llist_empty(list)) |
114 | * Reset the "raised" state right before we check the list because | ||
115 | * an NMI may enqueue after we find the list empty from the runner. | ||
116 | */ | ||
117 | __this_cpu_write(irq_work_raised, 0); | ||
118 | barrier(); | ||
119 | |||
120 | this_list = &__get_cpu_var(irq_work_list); | ||
121 | if (llist_empty(this_list)) | ||
122 | return; | 136 | return; |
123 | 137 | ||
124 | BUG_ON(!irqs_disabled()); | 138 | llnode = llist_del_all(list); |
125 | |||
126 | llnode = llist_del_all(this_list); | ||
127 | while (llnode != NULL) { | 139 | while (llnode != NULL) { |
128 | work = llist_entry(llnode, struct irq_work, llnode); | 140 | work = llist_entry(llnode, struct irq_work, llnode); |
129 | 141 | ||
@@ -149,13 +161,13 @@ static void __irq_work_run(void) | |||
149 | } | 161 | } |
150 | 162 | ||
151 | /* | 163 | /* |
152 | * Run the irq_work entries on this cpu. Requires to be ran from hardirq | 164 | * hotplug calls this through: |
153 | * context with local IRQs disabled. | 165 | * hotplug_cfd() -> flush_smp_call_function_queue() |
154 | */ | 166 | */ |
155 | void irq_work_run(void) | 167 | void irq_work_run(void) |
156 | { | 168 | { |
157 | BUG_ON(!in_irq()); | 169 | irq_work_run_list(&__get_cpu_var(raised_list)); |
158 | __irq_work_run(); | 170 | irq_work_run_list(&__get_cpu_var(lazy_list)); |
159 | } | 171 | } |
160 | EXPORT_SYMBOL_GPL(irq_work_run); | 172 | EXPORT_SYMBOL_GPL(irq_work_run); |
161 | 173 | ||
@@ -171,35 +183,3 @@ void irq_work_sync(struct irq_work *work) | |||
171 | cpu_relax(); | 183 | cpu_relax(); |
172 | } | 184 | } |
173 | EXPORT_SYMBOL_GPL(irq_work_sync); | 185 | EXPORT_SYMBOL_GPL(irq_work_sync); |
174 | |||
175 | #ifdef CONFIG_HOTPLUG_CPU | ||
176 | static int irq_work_cpu_notify(struct notifier_block *self, | ||
177 | unsigned long action, void *hcpu) | ||
178 | { | ||
179 | long cpu = (long)hcpu; | ||
180 | |||
181 | switch (action) { | ||
182 | case CPU_DYING: | ||
183 | /* Called from stop_machine */ | ||
184 | if (WARN_ON_ONCE(cpu != smp_processor_id())) | ||
185 | break; | ||
186 | __irq_work_run(); | ||
187 | break; | ||
188 | default: | ||
189 | break; | ||
190 | } | ||
191 | return NOTIFY_OK; | ||
192 | } | ||
193 | |||
194 | static struct notifier_block cpu_notify; | ||
195 | |||
196 | static __init int irq_work_init_cpu_notifier(void) | ||
197 | { | ||
198 | cpu_notify.notifier_call = irq_work_cpu_notify; | ||
199 | cpu_notify.priority = 0; | ||
200 | register_cpu_notifier(&cpu_notify); | ||
201 | return 0; | ||
202 | } | ||
203 | device_initcall(irq_work_init_cpu_notifier); | ||
204 | |||
205 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
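The irq_work_queue_on() introduced above is the primitive the nohz full kick is rebuilt on: it adds the work to the target CPU's raised_list and sends the call-function-single IPI if that list was empty. A minimal usage sketch follows; the callback and function names are hypothetical, and only the irq_work_queue_on() signature and its constraints are taken from the diff above.

    #include <linux/irq_work.h>
    #include <linux/printk.h>
    #include <linux/smp.h>

    /* Hypothetical callback: runs in IRQ context on the target CPU. */
    static void my_work_func(struct irq_work *work)
    {
            pr_info("irq work ran on CPU %d\n", smp_processor_id());
    }

    static struct irq_work my_work = { .func = my_work_func };

    static void kick_remote_cpu(int cpu)
    {
            /*
             * The target must be online and the caller must not be in NMI
             * context, per the WARN_ON_ONCE() checks above.  A false return
             * means the work was already pending somewhere.
             */
            if (!irq_work_queue_on(&my_work, cpu))
                    pr_debug("irq work already pending\n");
    }

Further down, wake_up_full_nohz_cpu() switches to tick_nohz_full_kick_cpu(), which is built on this same remote-queue path to make a full-dynticks CPU re-evaluate its tick from irq_exit().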
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index adf98622cb32..54e75226c2c4 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -28,12 +28,6 @@ | |||
28 | #include <linux/compat.h> | 28 | #include <linux/compat.h> |
29 | 29 | ||
30 | 30 | ||
31 | static int ptrace_trapping_sleep_fn(void *flags) | ||
32 | { | ||
33 | schedule(); | ||
34 | return 0; | ||
35 | } | ||
36 | |||
37 | /* | 31 | /* |
38 | * ptrace a task: make the debugger its new parent and | 32 | * ptrace a task: make the debugger its new parent and |
39 | * move it to the ptrace list. | 33 | * move it to the ptrace list. |
@@ -371,7 +365,7 @@ unlock_creds: | |||
371 | out: | 365 | out: |
372 | if (!retval) { | 366 | if (!retval) { |
373 | wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, | 367 | wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, |
374 | ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE); | 368 | TASK_UNINTERRUPTIBLE); |
375 | proc_ptrace_connector(task, PTRACE_ATTACH); | 369 | proc_ptrace_connector(task, PTRACE_ATTACH); |
376 | } | 370 | } |
377 | 371 | ||
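As the hunk above shows, ptrace_trapping_sleep_fn() goes away and wait_on_bit() now takes just the word, the bit number and the sleep state; the plain schedule()-based sleep that the removed helper provided by hand is the default behaviour of the new form. A minimal sketch of the three-argument call, with a hypothetical flags word and bit (only the call signature itself is taken from this diff):

    #include <linux/sched.h>
    #include <linux/wait.h>

    #define MY_BUSY_BIT 0  /* hypothetical bit in @flags */

    /*
     * Sleep until MY_BUSY_BIT is cleared by its owner.  With
     * TASK_UNINTERRUPTIBLE the return value is 0; with an interruptible
     * state, non-zero means the wait was interrupted by a signal.
     */
    static int wait_until_idle(unsigned long *flags)
    {
            return wait_on_bit(flags, MY_BUSY_BIT, TASK_UNINTERRUPTIBLE);
    }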
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 126f7e3f04e7..1211575a2208 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -139,6 +139,8 @@ void update_rq_clock(struct rq *rq) | |||
139 | return; | 139 | return; |
140 | 140 | ||
141 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; | 141 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; |
142 | if (delta < 0) | ||
143 | return; | ||
142 | rq->clock += delta; | 144 | rq->clock += delta; |
143 | update_rq_clock_task(rq, delta); | 145 | update_rq_clock_task(rq, delta); |
144 | } | 146 | } |
@@ -243,6 +245,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
243 | char buf[64]; | 245 | char buf[64]; |
244 | char *cmp; | 246 | char *cmp; |
245 | int i; | 247 | int i; |
248 | struct inode *inode; | ||
246 | 249 | ||
247 | if (cnt > 63) | 250 | if (cnt > 63) |
248 | cnt = 63; | 251 | cnt = 63; |
@@ -253,7 +256,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
253 | buf[cnt] = 0; | 256 | buf[cnt] = 0; |
254 | cmp = strstrip(buf); | 257 | cmp = strstrip(buf); |
255 | 258 | ||
259 | /* Ensure the static_key remains in a consistent state */ | ||
260 | inode = file_inode(filp); | ||
261 | mutex_lock(&inode->i_mutex); | ||
256 | i = sched_feat_set(cmp); | 262 | i = sched_feat_set(cmp); |
263 | mutex_unlock(&inode->i_mutex); | ||
257 | if (i == __SCHED_FEAT_NR) | 264 | if (i == __SCHED_FEAT_NR) |
258 | return -EINVAL; | 265 | return -EINVAL; |
259 | 266 | ||
@@ -587,30 +594,31 @@ static bool set_nr_if_polling(struct task_struct *p) | |||
587 | #endif | 594 | #endif |
588 | 595 | ||
589 | /* | 596 | /* |
590 | * resched_task - mark a task 'to be rescheduled now'. | 597 | * resched_curr - mark rq's current task 'to be rescheduled now'. |
591 | * | 598 | * |
592 | * On UP this means the setting of the need_resched flag, on SMP it | 599 | * On UP this means the setting of the need_resched flag, on SMP it |
593 | * might also involve a cross-CPU call to trigger the scheduler on | 600 | * might also involve a cross-CPU call to trigger the scheduler on |
594 | * the target CPU. | 601 | * the target CPU. |
595 | */ | 602 | */ |
596 | void resched_task(struct task_struct *p) | 603 | void resched_curr(struct rq *rq) |
597 | { | 604 | { |
605 | struct task_struct *curr = rq->curr; | ||
598 | int cpu; | 606 | int cpu; |
599 | 607 | ||
600 | lockdep_assert_held(&task_rq(p)->lock); | 608 | lockdep_assert_held(&rq->lock); |
601 | 609 | ||
602 | if (test_tsk_need_resched(p)) | 610 | if (test_tsk_need_resched(curr)) |
603 | return; | 611 | return; |
604 | 612 | ||
605 | cpu = task_cpu(p); | 613 | cpu = cpu_of(rq); |
606 | 614 | ||
607 | if (cpu == smp_processor_id()) { | 615 | if (cpu == smp_processor_id()) { |
608 | set_tsk_need_resched(p); | 616 | set_tsk_need_resched(curr); |
609 | set_preempt_need_resched(); | 617 | set_preempt_need_resched(); |
610 | return; | 618 | return; |
611 | } | 619 | } |
612 | 620 | ||
613 | if (set_nr_and_not_polling(p)) | 621 | if (set_nr_and_not_polling(curr)) |
614 | smp_send_reschedule(cpu); | 622 | smp_send_reschedule(cpu); |
615 | else | 623 | else |
616 | trace_sched_wake_idle_without_ipi(cpu); | 624 | trace_sched_wake_idle_without_ipi(cpu); |
@@ -623,7 +631,7 @@ void resched_cpu(int cpu) | |||
623 | 631 | ||
624 | if (!raw_spin_trylock_irqsave(&rq->lock, flags)) | 632 | if (!raw_spin_trylock_irqsave(&rq->lock, flags)) |
625 | return; | 633 | return; |
626 | resched_task(cpu_curr(cpu)); | 634 | resched_curr(rq); |
627 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 635 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
628 | } | 636 | } |
629 | 637 | ||
@@ -684,10 +692,16 @@ static void wake_up_idle_cpu(int cpu) | |||
684 | 692 | ||
685 | static bool wake_up_full_nohz_cpu(int cpu) | 693 | static bool wake_up_full_nohz_cpu(int cpu) |
686 | { | 694 | { |
695 | /* | ||
696 | * We just need the target to call irq_exit() and re-evaluate | ||
697 | * the next tick. The nohz full kick at least implies that. | ||
698 | * If needed we can still optimize that later with an | ||
699 | * empty IRQ. | ||
700 | */ | ||
687 | if (tick_nohz_full_cpu(cpu)) { | 701 | if (tick_nohz_full_cpu(cpu)) { |
688 | if (cpu != smp_processor_id() || | 702 | if (cpu != smp_processor_id() || |
689 | tick_nohz_tick_stopped()) | 703 | tick_nohz_tick_stopped()) |
690 | smp_send_reschedule(cpu); | 704 | tick_nohz_full_kick_cpu(cpu); |
691 | return true; | 705 | return true; |
692 | } | 706 | } |
693 | 707 | ||
@@ -730,18 +744,15 @@ static inline bool got_nohz_idle_kick(void) | |||
730 | #ifdef CONFIG_NO_HZ_FULL | 744 | #ifdef CONFIG_NO_HZ_FULL |
731 | bool sched_can_stop_tick(void) | 745 | bool sched_can_stop_tick(void) |
732 | { | 746 | { |
733 | struct rq *rq; | 747 | /* |
734 | 748 | * More than one running task need preemption. | |
735 | rq = this_rq(); | 749 | * nr_running update is assumed to be visible |
736 | 750 | * after IPI is sent from wakers. | |
737 | /* Make sure rq->nr_running update is visible after the IPI */ | 751 | */ |
738 | smp_rmb(); | 752 | if (this_rq()->nr_running > 1) |
739 | 753 | return false; | |
740 | /* More than one running task need preemption */ | ||
741 | if (rq->nr_running > 1) | ||
742 | return false; | ||
743 | 754 | ||
744 | return true; | 755 | return true; |
745 | } | 756 | } |
746 | #endif /* CONFIG_NO_HZ_FULL */ | 757 | #endif /* CONFIG_NO_HZ_FULL */ |
747 | 758 | ||
@@ -1022,7 +1033,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
1022 | if (class == rq->curr->sched_class) | 1033 | if (class == rq->curr->sched_class) |
1023 | break; | 1034 | break; |
1024 | if (class == p->sched_class) { | 1035 | if (class == p->sched_class) { |
1025 | resched_task(rq->curr); | 1036 | resched_curr(rq); |
1026 | break; | 1037 | break; |
1027 | } | 1038 | } |
1028 | } | 1039 | } |
@@ -1568,9 +1579,7 @@ void scheduler_ipi(void) | |||
1568 | */ | 1579 | */ |
1569 | preempt_fold_need_resched(); | 1580 | preempt_fold_need_resched(); |
1570 | 1581 | ||
1571 | if (llist_empty(&this_rq()->wake_list) | 1582 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) |
1572 | && !tick_nohz_full_cpu(smp_processor_id()) | ||
1573 | && !got_nohz_idle_kick()) | ||
1574 | return; | 1583 | return; |
1575 | 1584 | ||
1576 | /* | 1585 | /* |
@@ -1587,7 +1596,6 @@ void scheduler_ipi(void) | |||
1587 | * somewhat pessimize the simple resched case. | 1596 | * somewhat pessimize the simple resched case. |
1588 | */ | 1597 | */ |
1589 | irq_enter(); | 1598 | irq_enter(); |
1590 | tick_nohz_full_check(); | ||
1591 | sched_ttwu_pending(); | 1599 | sched_ttwu_pending(); |
1592 | 1600 | ||
1593 | /* | 1601 | /* |
@@ -2431,7 +2439,12 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) | |||
2431 | { | 2439 | { |
2432 | u64 ns = 0; | 2440 | u64 ns = 0; |
2433 | 2441 | ||
2434 | if (task_current(rq, p)) { | 2442 | /* |
2443 | * Must be ->curr _and_ ->on_rq. If dequeued, we would | ||
2444 | * project cycles that may never be accounted to this | ||
2445 | * thread, breaking clock_gettime(). | ||
2446 | */ | ||
2447 | if (task_current(rq, p) && p->on_rq) { | ||
2435 | update_rq_clock(rq); | 2448 | update_rq_clock(rq); |
2436 | ns = rq_clock_task(rq) - p->se.exec_start; | 2449 | ns = rq_clock_task(rq) - p->se.exec_start; |
2437 | if ((s64)ns < 0) | 2450 | if ((s64)ns < 0) |
@@ -2474,8 +2487,10 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
2474 | * If we race with it leaving cpu, we'll take a lock. So we're correct. | 2487 | * If we race with it leaving cpu, we'll take a lock. So we're correct. |
2475 | * If we race with it entering cpu, unaccounted time is 0. This is | 2488 | * If we race with it entering cpu, unaccounted time is 0. This is |
2476 | * indistinguishable from the read occurring a few cycles earlier. | 2489 | * indistinguishable from the read occurring a few cycles earlier. |
2490 | * If we see ->on_cpu without ->on_rq, the task is leaving, and has | ||
2491 | * been accounted, so we're correct here as well. | ||
2477 | */ | 2492 | */ |
2478 | if (!p->on_cpu) | 2493 | if (!p->on_cpu || !p->on_rq) |
2479 | return p->se.sum_exec_runtime; | 2494 | return p->se.sum_exec_runtime; |
2480 | #endif | 2495 | #endif |
2481 | 2496 | ||
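The new ->on_rq checks above exist because, as the added comment says, projecting cycles for a task that is ->curr but already dequeued would break clock_gettime(): task_sched_runtime() is what serves the POSIX per-thread CPU clock through the posix-cpu-timers code. A userspace illustration of the interface being protected (standard POSIX usage, not part of this patch):

    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            struct timespec ts;

            /* CPU time consumed by the calling thread; on Linux this read
             * ends up in task_sched_runtime(). */
            if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0)
                    printf("thread cpu time: %ld.%09ld s\n",
                           (long)ts.tv_sec, ts.tv_nsec);
            return 0;
    }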
@@ -2971,7 +2986,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
2971 | } | 2986 | } |
2972 | 2987 | ||
2973 | trace_sched_pi_setprio(p, prio); | 2988 | trace_sched_pi_setprio(p, prio); |
2974 | p->pi_top_task = rt_mutex_get_top_task(p); | ||
2975 | oldprio = p->prio; | 2989 | oldprio = p->prio; |
2976 | prev_class = p->sched_class; | 2990 | prev_class = p->sched_class; |
2977 | on_rq = p->on_rq; | 2991 | on_rq = p->on_rq; |
@@ -2991,8 +3005,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
2991 | * running task | 3005 | * running task |
2992 | */ | 3006 | */ |
2993 | if (dl_prio(prio)) { | 3007 | if (dl_prio(prio)) { |
2994 | if (!dl_prio(p->normal_prio) || (p->pi_top_task && | 3008 | struct task_struct *pi_task = rt_mutex_get_top_task(p); |
2995 | dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { | 3009 | if (!dl_prio(p->normal_prio) || |
3010 | (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { | ||
2996 | p->dl.dl_boosted = 1; | 3011 | p->dl.dl_boosted = 1; |
2997 | p->dl.dl_throttled = 0; | 3012 | p->dl.dl_throttled = 0; |
2998 | enqueue_flag = ENQUEUE_REPLENISH; | 3013 | enqueue_flag = ENQUEUE_REPLENISH; |
@@ -3064,7 +3079,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
3064 | * lowered its priority, then reschedule its CPU: | 3079 | * lowered its priority, then reschedule its CPU: |
3065 | */ | 3080 | */ |
3066 | if (delta < 0 || (delta > 0 && task_running(rq, p))) | 3081 | if (delta < 0 || (delta > 0 && task_running(rq, p))) |
3067 | resched_task(rq->curr); | 3082 | resched_curr(rq); |
3068 | } | 3083 | } |
3069 | out_unlock: | 3084 | out_unlock: |
3070 | task_rq_unlock(rq, p, &flags); | 3085 | task_rq_unlock(rq, p, &flags); |
@@ -3203,12 +3218,18 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) | |||
3203 | dl_se->dl_yielded = 0; | 3218 | dl_se->dl_yielded = 0; |
3204 | } | 3219 | } |
3205 | 3220 | ||
3221 | /* | ||
3222 | * sched_setparam() passes in -1 for its policy, to let the functions | ||
3223 | * it calls know not to change it. | ||
3224 | */ | ||
3225 | #define SETPARAM_POLICY -1 | ||
3226 | |||
3206 | static void __setscheduler_params(struct task_struct *p, | 3227 | static void __setscheduler_params(struct task_struct *p, |
3207 | const struct sched_attr *attr) | 3228 | const struct sched_attr *attr) |
3208 | { | 3229 | { |
3209 | int policy = attr->sched_policy; | 3230 | int policy = attr->sched_policy; |
3210 | 3231 | ||
3211 | if (policy == -1) /* setparam */ | 3232 | if (policy == SETPARAM_POLICY) |
3212 | policy = p->policy; | 3233 | policy = p->policy; |
3213 | 3234 | ||
3214 | p->policy = policy; | 3235 | p->policy = policy; |
@@ -3557,10 +3578,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy, | |||
3557 | .sched_nice = PRIO_TO_NICE(p->static_prio), | 3578 | .sched_nice = PRIO_TO_NICE(p->static_prio), |
3558 | }; | 3579 | }; |
3559 | 3580 | ||
3560 | /* | 3581 | /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ |
3561 | * Fixup the legacy SCHED_RESET_ON_FORK hack | 3582 | if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { |
3562 | */ | ||
3563 | if (policy & SCHED_RESET_ON_FORK) { | ||
3564 | attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; | 3583 | attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; |
3565 | policy &= ~SCHED_RESET_ON_FORK; | 3584 | policy &= ~SCHED_RESET_ON_FORK; |
3566 | attr.sched_policy = policy; | 3585 | attr.sched_policy = policy; |
@@ -3730,7 +3749,7 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, | |||
3730 | */ | 3749 | */ |
3731 | SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) | 3750 | SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) |
3732 | { | 3751 | { |
3733 | return do_sched_setscheduler(pid, -1, param); | 3752 | return do_sched_setscheduler(pid, SETPARAM_POLICY, param); |
3734 | } | 3753 | } |
3735 | 3754 | ||
3736 | /** | 3755 | /** |
@@ -4285,7 +4304,7 @@ again: | |||
4285 | * fairness. | 4304 | * fairness. |
4286 | */ | 4305 | */ |
4287 | if (preempt && rq != p_rq) | 4306 | if (preempt && rq != p_rq) |
4288 | resched_task(p_rq->curr); | 4307 | resched_curr(p_rq); |
4289 | } | 4308 | } |
4290 | 4309 | ||
4291 | out_unlock: | 4310 | out_unlock: |
@@ -6465,6 +6484,20 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
6465 | sched_domain_level_max = max(sched_domain_level_max, sd->level); | 6484 | sched_domain_level_max = max(sched_domain_level_max, sd->level); |
6466 | child->parent = sd; | 6485 | child->parent = sd; |
6467 | sd->child = child; | 6486 | sd->child = child; |
6487 | |||
6488 | if (!cpumask_subset(sched_domain_span(child), | ||
6489 | sched_domain_span(sd))) { | ||
6490 | pr_err("BUG: arch topology borken\n"); | ||
6491 | #ifdef CONFIG_SCHED_DEBUG | ||
6492 | pr_err(" the %s domain not a subset of the %s domain\n", | ||
6493 | child->name, sd->name); | ||
6494 | #endif | ||
6495 | /* Fixup, ensure @sd has at least @child cpus. */ | ||
6496 | cpumask_or(sched_domain_span(sd), | ||
6497 | sched_domain_span(sd), | ||
6498 | sched_domain_span(child)); | ||
6499 | } | ||
6500 | |||
6468 | } | 6501 | } |
6469 | set_domain_attribute(sd, attr); | 6502 | set_domain_attribute(sd, attr); |
6470 | 6503 | ||
@@ -7092,7 +7125,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p) | |||
7092 | __setscheduler(rq, p, &attr); | 7125 | __setscheduler(rq, p, &attr); |
7093 | if (on_rq) { | 7126 | if (on_rq) { |
7094 | enqueue_task(rq, p, 0); | 7127 | enqueue_task(rq, p, 0); |
7095 | resched_task(rq->curr); | 7128 | resched_curr(rq); |
7096 | } | 7129 | } |
7097 | 7130 | ||
7098 | check_class_changed(rq, p, prev_class, old_prio); | 7131 | check_class_changed(rq, p, prev_class, old_prio); |
@@ -7803,6 +7836,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | |||
7803 | if (period > max_cfs_quota_period) | 7836 | if (period > max_cfs_quota_period) |
7804 | return -EINVAL; | 7837 | return -EINVAL; |
7805 | 7838 | ||
7839 | /* | ||
7840 | * Prevent race between setting of cfs_rq->runtime_enabled and | ||
7841 | * unthrottle_offline_cfs_rqs(). | ||
7842 | */ | ||
7843 | get_online_cpus(); | ||
7806 | mutex_lock(&cfs_constraints_mutex); | 7844 | mutex_lock(&cfs_constraints_mutex); |
7807 | ret = __cfs_schedulable(tg, period, quota); | 7845 | ret = __cfs_schedulable(tg, period, quota); |
7808 | if (ret) | 7846 | if (ret) |
@@ -7828,7 +7866,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | |||
7828 | } | 7866 | } |
7829 | raw_spin_unlock_irq(&cfs_b->lock); | 7867 | raw_spin_unlock_irq(&cfs_b->lock); |
7830 | 7868 | ||
7831 | for_each_possible_cpu(i) { | 7869 | for_each_online_cpu(i) { |
7832 | struct cfs_rq *cfs_rq = tg->cfs_rq[i]; | 7870 | struct cfs_rq *cfs_rq = tg->cfs_rq[i]; |
7833 | struct rq *rq = cfs_rq->rq; | 7871 | struct rq *rq = cfs_rq->rq; |
7834 | 7872 | ||
@@ -7844,6 +7882,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | |||
7844 | cfs_bandwidth_usage_dec(); | 7882 | cfs_bandwidth_usage_dec(); |
7845 | out_unlock: | 7883 | out_unlock: |
7846 | mutex_unlock(&cfs_constraints_mutex); | 7884 | mutex_unlock(&cfs_constraints_mutex); |
7885 | put_online_cpus(); | ||
7847 | 7886 | ||
7848 | return ret; | 7887 | return ret; |
7849 | } | 7888 | } |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fc4f98b1258f..255ce138b652 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -306,7 +306,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, | |||
306 | * the overrunning entity can't interfere with other entity in the system and | 306 | * the overrunning entity can't interfere with other entity in the system and |
307 | * can't make them miss their deadlines. Reasons why this kind of overruns | 307 | * can't make them miss their deadlines. Reasons why this kind of overruns |
308 | * could happen are, typically, a entity voluntarily trying to overcome its | 308 | * could happen are, typically, a entity voluntarily trying to overcome its |
309 | * runtime, or it just underestimated it during sched_setscheduler_ex(). | 309 | * runtime, or it just underestimated it during sched_setattr(). |
310 | */ | 310 | */ |
311 | static void replenish_dl_entity(struct sched_dl_entity *dl_se, | 311 | static void replenish_dl_entity(struct sched_dl_entity *dl_se, |
312 | struct sched_dl_entity *pi_se) | 312 | struct sched_dl_entity *pi_se) |
@@ -535,7 +535,7 @@ again: | |||
535 | if (task_has_dl_policy(rq->curr)) | 535 | if (task_has_dl_policy(rq->curr)) |
536 | check_preempt_curr_dl(rq, p, 0); | 536 | check_preempt_curr_dl(rq, p, 0); |
537 | else | 537 | else |
538 | resched_task(rq->curr); | 538 | resched_curr(rq); |
539 | #ifdef CONFIG_SMP | 539 | #ifdef CONFIG_SMP |
540 | /* | 540 | /* |
541 | * Queueing this task back might have overloaded rq, | 541 | * Queueing this task back might have overloaded rq, |
@@ -634,7 +634,7 @@ static void update_curr_dl(struct rq *rq) | |||
634 | enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); | 634 | enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); |
635 | 635 | ||
636 | if (!is_leftmost(curr, &rq->dl)) | 636 | if (!is_leftmost(curr, &rq->dl)) |
637 | resched_task(curr); | 637 | resched_curr(rq); |
638 | } | 638 | } |
639 | 639 | ||
640 | /* | 640 | /* |
@@ -964,7 +964,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) | |||
964 | cpudl_find(&rq->rd->cpudl, p, NULL) != -1) | 964 | cpudl_find(&rq->rd->cpudl, p, NULL) != -1) |
965 | return; | 965 | return; |
966 | 966 | ||
967 | resched_task(rq->curr); | 967 | resched_curr(rq); |
968 | } | 968 | } |
969 | 969 | ||
970 | static int pull_dl_task(struct rq *this_rq); | 970 | static int pull_dl_task(struct rq *this_rq); |
@@ -979,7 +979,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, | |||
979 | int flags) | 979 | int flags) |
980 | { | 980 | { |
981 | if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { | 981 | if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { |
982 | resched_task(rq->curr); | 982 | resched_curr(rq); |
983 | return; | 983 | return; |
984 | } | 984 | } |
985 | 985 | ||
@@ -1333,7 +1333,7 @@ retry: | |||
1333 | if (dl_task(rq->curr) && | 1333 | if (dl_task(rq->curr) && |
1334 | dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && | 1334 | dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && |
1335 | rq->curr->nr_cpus_allowed > 1) { | 1335 | rq->curr->nr_cpus_allowed > 1) { |
1336 | resched_task(rq->curr); | 1336 | resched_curr(rq); |
1337 | return 0; | 1337 | return 0; |
1338 | } | 1338 | } |
1339 | 1339 | ||
@@ -1373,7 +1373,7 @@ retry: | |||
1373 | set_task_cpu(next_task, later_rq->cpu); | 1373 | set_task_cpu(next_task, later_rq->cpu); |
1374 | activate_task(later_rq, next_task, 0); | 1374 | activate_task(later_rq, next_task, 0); |
1375 | 1375 | ||
1376 | resched_task(later_rq->curr); | 1376 | resched_curr(later_rq); |
1377 | 1377 | ||
1378 | double_unlock_balance(rq, later_rq); | 1378 | double_unlock_balance(rq, later_rq); |
1379 | 1379 | ||
@@ -1632,14 +1632,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, | |||
1632 | */ | 1632 | */ |
1633 | if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && | 1633 | if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && |
1634 | rq->curr == p) | 1634 | rq->curr == p) |
1635 | resched_task(p); | 1635 | resched_curr(rq); |
1636 | #else | 1636 | #else |
1637 | /* | 1637 | /* |
1638 | * Again, we don't know if p has a earlier | 1638 | * Again, we don't know if p has a earlier |
1639 | * or later deadline, so let's blindly set a | 1639 | * or later deadline, so let's blindly set a |
1640 | * (maybe not needed) rescheduling point. | 1640 | * (maybe not needed) rescheduling point. |
1641 | */ | 1641 | */ |
1642 | resched_task(p); | 1642 | resched_curr(rq); |
1643 | #endif /* CONFIG_SMP */ | 1643 | #endif /* CONFIG_SMP */ |
1644 | } else | 1644 | } else |
1645 | switched_to_dl(rq, p); | 1645 | switched_to_dl(rq, p); |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fea7d3335e1f..bfa3c86d0d68 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1062,7 +1062,6 @@ static void update_numa_stats(struct numa_stats *ns, int nid) | |||
1062 | if (!cpus) | 1062 | if (!cpus) |
1063 | return; | 1063 | return; |
1064 | 1064 | ||
1065 | ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity; | ||
1066 | ns->task_capacity = | 1065 | ns->task_capacity = |
1067 | DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); | 1066 | DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); |
1068 | ns->has_free_capacity = (ns->nr_running < ns->task_capacity); | 1067 | ns->has_free_capacity = (ns->nr_running < ns->task_capacity); |
@@ -1096,18 +1095,30 @@ static void task_numa_assign(struct task_numa_env *env, | |||
1096 | env->best_cpu = env->dst_cpu; | 1095 | env->best_cpu = env->dst_cpu; |
1097 | } | 1096 | } |
1098 | 1097 | ||
1099 | static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, | 1098 | static bool load_too_imbalanced(long src_load, long dst_load, |
1100 | long src_load, long dst_load, | ||
1101 | struct task_numa_env *env) | 1099 | struct task_numa_env *env) |
1102 | { | 1100 | { |
1103 | long imb, old_imb; | 1101 | long imb, old_imb; |
1102 | long orig_src_load, orig_dst_load; | ||
1103 | long src_capacity, dst_capacity; | ||
1104 | |||
1105 | /* | ||
1106 | * The load is corrected for the CPU capacity available on each node. | ||
1107 | * | ||
1108 | * src_load dst_load | ||
1109 | * ------------ vs --------- | ||
1110 | * src_capacity dst_capacity | ||
1111 | */ | ||
1112 | src_capacity = env->src_stats.compute_capacity; | ||
1113 | dst_capacity = env->dst_stats.compute_capacity; | ||
1104 | 1114 | ||
1105 | /* We care about the slope of the imbalance, not the direction. */ | 1115 | /* We care about the slope of the imbalance, not the direction. */ |
1106 | if (dst_load < src_load) | 1116 | if (dst_load < src_load) |
1107 | swap(dst_load, src_load); | 1117 | swap(dst_load, src_load); |
1108 | 1118 | ||
1109 | /* Is the difference below the threshold? */ | 1119 | /* Is the difference below the threshold? */ |
1110 | imb = dst_load * 100 - src_load * env->imbalance_pct; | 1120 | imb = dst_load * src_capacity * 100 - |
1121 | src_load * dst_capacity * env->imbalance_pct; | ||
1111 | if (imb <= 0) | 1122 | if (imb <= 0) |
1112 | return false; | 1123 | return false; |
1113 | 1124 | ||
@@ -1115,10 +1126,14 @@ static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, | |||
1115 | * The imbalance is above the allowed threshold. | 1126 | * The imbalance is above the allowed threshold. |
1116 | * Compare it with the old imbalance. | 1127 | * Compare it with the old imbalance. |
1117 | */ | 1128 | */ |
1129 | orig_src_load = env->src_stats.load; | ||
1130 | orig_dst_load = env->dst_stats.load; | ||
1131 | |||
1118 | if (orig_dst_load < orig_src_load) | 1132 | if (orig_dst_load < orig_src_load) |
1119 | swap(orig_dst_load, orig_src_load); | 1133 | swap(orig_dst_load, orig_src_load); |
1120 | 1134 | ||
1121 | old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; | 1135 | old_imb = orig_dst_load * src_capacity * 100 - |
1136 | orig_src_load * dst_capacity * env->imbalance_pct; | ||
1122 | 1137 | ||
1123 | /* Would this change make things worse? */ | 1138 | /* Would this change make things worse? */ |
1124 | return (imb > old_imb); | 1139 | return (imb > old_imb); |
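A worked example of the capacity-corrected check above, with made-up numbers: say src_load = 800 on a node whose compute_capacity is 1024, dst_load = 1200 on a node whose compute_capacity is 2048, and imbalance_pct = 125. The corrected form gives imb = 1200 * 1024 * 100 - 800 * 2048 * 125 = 122,880,000 - 204,800,000 < 0, so load_too_imbalanced() reports the placement as within the threshold. The old comparison without the capacity terms (1200 * 100 - 800 * 125 = 20,000 > 0) would have flagged it, even though, relative to capacity, the destination node (1200/2048, about 0.59) is less loaded than the source (800/1024, about 0.78).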
@@ -1136,10 +1151,10 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1136 | struct rq *src_rq = cpu_rq(env->src_cpu); | 1151 | struct rq *src_rq = cpu_rq(env->src_cpu); |
1137 | struct rq *dst_rq = cpu_rq(env->dst_cpu); | 1152 | struct rq *dst_rq = cpu_rq(env->dst_cpu); |
1138 | struct task_struct *cur; | 1153 | struct task_struct *cur; |
1139 | long orig_src_load, src_load; | 1154 | long src_load, dst_load; |
1140 | long orig_dst_load, dst_load; | ||
1141 | long load; | 1155 | long load; |
1142 | long imp = (groupimp > 0) ? groupimp : taskimp; | 1156 | long imp = env->p->numa_group ? groupimp : taskimp; |
1157 | long moveimp = imp; | ||
1143 | 1158 | ||
1144 | rcu_read_lock(); | 1159 | rcu_read_lock(); |
1145 | cur = ACCESS_ONCE(dst_rq->curr); | 1160 | cur = ACCESS_ONCE(dst_rq->curr); |
@@ -1177,11 +1192,6 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1177 | * itself (not part of a group), use the task weight | 1192 | * itself (not part of a group), use the task weight |
1178 | * instead. | 1193 | * instead. |
1179 | */ | 1194 | */ |
1180 | if (env->p->numa_group) | ||
1181 | imp = groupimp; | ||
1182 | else | ||
1183 | imp = taskimp; | ||
1184 | |||
1185 | if (cur->numa_group) | 1195 | if (cur->numa_group) |
1186 | imp += group_weight(cur, env->src_nid) - | 1196 | imp += group_weight(cur, env->src_nid) - |
1187 | group_weight(cur, env->dst_nid); | 1197 | group_weight(cur, env->dst_nid); |
@@ -1191,7 +1201,7 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1191 | } | 1201 | } |
1192 | } | 1202 | } |
1193 | 1203 | ||
1194 | if (imp < env->best_imp) | 1204 | if (imp <= env->best_imp && moveimp <= env->best_imp) |
1195 | goto unlock; | 1205 | goto unlock; |
1196 | 1206 | ||
1197 | if (!cur) { | 1207 | if (!cur) { |
@@ -1204,20 +1214,34 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1204 | } | 1214 | } |
1205 | 1215 | ||
1206 | /* Balance doesn't matter much if we're running a task per cpu */ | 1216 | /* Balance doesn't matter much if we're running a task per cpu */ |
1207 | if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) | 1217 | if (imp > env->best_imp && src_rq->nr_running == 1 && |
1218 | dst_rq->nr_running == 1) | ||
1208 | goto assign; | 1219 | goto assign; |
1209 | 1220 | ||
1210 | /* | 1221 | /* |
1211 | * In the overloaded case, try and keep the load balanced. | 1222 | * In the overloaded case, try and keep the load balanced. |
1212 | */ | 1223 | */ |
1213 | balance: | 1224 | balance: |
1214 | orig_dst_load = env->dst_stats.load; | ||
1215 | orig_src_load = env->src_stats.load; | ||
1216 | |||
1217 | /* XXX missing capacity terms */ | ||
1218 | load = task_h_load(env->p); | 1225 | load = task_h_load(env->p); |
1219 | dst_load = orig_dst_load + load; | 1226 | dst_load = env->dst_stats.load + load; |
1220 | src_load = orig_src_load - load; | 1227 | src_load = env->src_stats.load - load; |
1228 | |||
1229 | if (moveimp > imp && moveimp > env->best_imp) { | ||
1230 | /* | ||
1231 | * If the improvement from just moving env->p direction is | ||
1232 | * better than swapping tasks around, check if a move is | ||
1233 | * possible. Store a slightly smaller score than moveimp, | ||
1234 | * so an actually idle CPU will win. | ||
1235 | */ | ||
1236 | if (!load_too_imbalanced(src_load, dst_load, env)) { | ||
1237 | imp = moveimp - 1; | ||
1238 | cur = NULL; | ||
1239 | goto assign; | ||
1240 | } | ||
1241 | } | ||
1242 | |||
1243 | if (imp <= env->best_imp) | ||
1244 | goto unlock; | ||
1221 | 1245 | ||
1222 | if (cur) { | 1246 | if (cur) { |
1223 | load = task_h_load(cur); | 1247 | load = task_h_load(cur); |
@@ -1225,8 +1249,7 @@ balance: | |||
1225 | src_load += load; | 1249 | src_load += load; |
1226 | } | 1250 | } |
1227 | 1251 | ||
1228 | if (load_too_imbalanced(orig_src_load, orig_dst_load, | 1252 | if (load_too_imbalanced(src_load, dst_load, env)) |
1229 | src_load, dst_load, env)) | ||
1230 | goto unlock; | 1253 | goto unlock; |
1231 | 1254 | ||
1232 | assign: | 1255 | assign: |
@@ -1302,9 +1325,8 @@ static int task_numa_migrate(struct task_struct *p) | |||
1302 | groupimp = group_weight(p, env.dst_nid) - groupweight; | 1325 | groupimp = group_weight(p, env.dst_nid) - groupweight; |
1303 | update_numa_stats(&env.dst_stats, env.dst_nid); | 1326 | update_numa_stats(&env.dst_stats, env.dst_nid); |
1304 | 1327 | ||
1305 | /* If the preferred nid has free capacity, try to use it. */ | 1328 | /* Try to find a spot on the preferred nid. */ |
1306 | if (env.dst_stats.has_free_capacity) | 1329 | task_numa_find_cpu(&env, taskimp, groupimp); |
1307 | task_numa_find_cpu(&env, taskimp, groupimp); | ||
1308 | 1330 | ||
1309 | /* No space available on the preferred nid. Look elsewhere. */ | 1331 | /* No space available on the preferred nid. Look elsewhere. */ |
1310 | if (env.best_cpu == -1) { | 1332 | if (env.best_cpu == -1) { |
@@ -1324,10 +1346,6 @@ static int task_numa_migrate(struct task_struct *p) | |||
1324 | } | 1346 | } |
1325 | } | 1347 | } |
1326 | 1348 | ||
1327 | /* No better CPU than the current one was found. */ | ||
1328 | if (env.best_cpu == -1) | ||
1329 | return -EAGAIN; | ||
1330 | |||
1331 | /* | 1349 | /* |
1332 | * If the task is part of a workload that spans multiple NUMA nodes, | 1350 | * If the task is part of a workload that spans multiple NUMA nodes, |
1333 | * and is migrating into one of the workload's active nodes, remember | 1351 | * and is migrating into one of the workload's active nodes, remember |
@@ -1336,8 +1354,19 @@ static int task_numa_migrate(struct task_struct *p) | |||
1336 | * A task that migrated to a second choice node will be better off | 1354 | * A task that migrated to a second choice node will be better off |
1337 | * trying for a better one later. Do not set the preferred node here. | 1355 | * trying for a better one later. Do not set the preferred node here. |
1338 | */ | 1356 | */ |
1339 | if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) | 1357 | if (p->numa_group) { |
1340 | sched_setnuma(p, env.dst_nid); | 1358 | if (env.best_cpu == -1) |
1359 | nid = env.src_nid; | ||
1360 | else | ||
1361 | nid = env.dst_nid; | ||
1362 | |||
1363 | if (node_isset(nid, p->numa_group->active_nodes)) | ||
1364 | sched_setnuma(p, env.dst_nid); | ||
1365 | } | ||
1366 | |||
1367 | /* No better CPU than the current one was found. */ | ||
1368 | if (env.best_cpu == -1) | ||
1369 | return -EAGAIN; | ||
1341 | 1370 | ||
1342 | /* | 1371 | /* |
1343 | * Reset the scan period if the task is being rescheduled on an | 1372 | * Reset the scan period if the task is being rescheduled on an |
@@ -1415,12 +1444,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group) | |||
1415 | /* | 1444 | /* |
1416 | * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS | 1445 | * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS |
1417 | * increments. The more local the fault statistics are, the higher the scan | 1446 | * increments. The more local the fault statistics are, the higher the scan |
1418 | * period will be for the next scan window. If local/remote ratio is below | 1447 | * period will be for the next scan window. If local/(local+remote) ratio is |
1419 | * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the | 1448 | * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) |
1420 | * scan period will decrease | 1449 | * the scan period will decrease. Aim for 70% local accesses. |
1421 | */ | 1450 | */ |
1422 | #define NUMA_PERIOD_SLOTS 10 | 1451 | #define NUMA_PERIOD_SLOTS 10 |
1423 | #define NUMA_PERIOD_THRESHOLD 3 | 1452 | #define NUMA_PERIOD_THRESHOLD 7 |
1424 | 1453 | ||
1425 | /* | 1454 | /* |
1426 | * Increase the scan period (slow down scanning) if the majority of | 1455 | * Increase the scan period (slow down scanning) if the majority of |
@@ -1595,30 +1624,17 @@ static void task_numa_placement(struct task_struct *p) | |||
1595 | 1624 | ||
1596 | if (p->numa_group) { | 1625 | if (p->numa_group) { |
1597 | update_numa_active_node_mask(p->numa_group); | 1626 | update_numa_active_node_mask(p->numa_group); |
1598 | /* | ||
1599 | * If the preferred task and group nids are different, | ||
1600 | * iterate over the nodes again to find the best place. | ||
1601 | */ | ||
1602 | if (max_nid != max_group_nid) { | ||
1603 | unsigned long weight, max_weight = 0; | ||
1604 | |||
1605 | for_each_online_node(nid) { | ||
1606 | weight = task_weight(p, nid) + group_weight(p, nid); | ||
1607 | if (weight > max_weight) { | ||
1608 | max_weight = weight; | ||
1609 | max_nid = nid; | ||
1610 | } | ||
1611 | } | ||
1612 | } | ||
1613 | |||
1614 | spin_unlock_irq(group_lock); | 1627 | spin_unlock_irq(group_lock); |
1628 | max_nid = max_group_nid; | ||
1615 | } | 1629 | } |
1616 | 1630 | ||
1617 | /* Preferred node as the node with the most faults */ | 1631 | if (max_faults) { |
1618 | if (max_faults && max_nid != p->numa_preferred_nid) { | 1632 | /* Set the new preferred node */ |
1619 | /* Update the preferred nid and migrate task if possible */ | 1633 | if (max_nid != p->numa_preferred_nid) |
1620 | sched_setnuma(p, max_nid); | 1634 | sched_setnuma(p, max_nid); |
1621 | numa_migrate_preferred(p); | 1635 | |
1636 | if (task_node(p) != p->numa_preferred_nid) | ||
1637 | numa_migrate_preferred(p); | ||
1622 | } | 1638 | } |
1623 | } | 1639 | } |
1624 | 1640 | ||
@@ -2899,7 +2915,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | |||
2899 | ideal_runtime = sched_slice(cfs_rq, curr); | 2915 | ideal_runtime = sched_slice(cfs_rq, curr); |
2900 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; | 2916 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; |
2901 | if (delta_exec > ideal_runtime) { | 2917 | if (delta_exec > ideal_runtime) { |
2902 | resched_task(rq_of(cfs_rq)->curr); | 2918 | resched_curr(rq_of(cfs_rq)); |
2903 | /* | 2919 | /* |
2904 | * The current task ran long enough, ensure it doesn't get | 2920 | * The current task ran long enough, ensure it doesn't get |
2905 | * re-elected due to buddy favours. | 2921 | * re-elected due to buddy favours. |
@@ -2923,7 +2939,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | |||
2923 | return; | 2939 | return; |
2924 | 2940 | ||
2925 | if (delta > ideal_runtime) | 2941 | if (delta > ideal_runtime) |
2926 | resched_task(rq_of(cfs_rq)->curr); | 2942 | resched_curr(rq_of(cfs_rq)); |
2927 | } | 2943 | } |
2928 | 2944 | ||
2929 | static void | 2945 | static void |
@@ -3063,7 +3079,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
3063 | * validating it and just reschedule. | 3079 | * validating it and just reschedule. |
3064 | */ | 3080 | */ |
3065 | if (queued) { | 3081 | if (queued) { |
3066 | resched_task(rq_of(cfs_rq)->curr); | 3082 | resched_curr(rq_of(cfs_rq)); |
3067 | return; | 3083 | return; |
3068 | } | 3084 | } |
3069 | /* | 3085 | /* |
@@ -3254,7 +3270,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) | |||
3254 | * hierarchy can be throttled | 3270 | * hierarchy can be throttled |
3255 | */ | 3271 | */ |
3256 | if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) | 3272 | if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) |
3257 | resched_task(rq_of(cfs_rq)->curr); | 3273 | resched_curr(rq_of(cfs_rq)); |
3258 | } | 3274 | } |
3259 | 3275 | ||
3260 | static __always_inline | 3276 | static __always_inline |
@@ -3360,7 +3376,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | |||
3360 | cfs_rq->throttled = 1; | 3376 | cfs_rq->throttled = 1; |
3361 | cfs_rq->throttled_clock = rq_clock(rq); | 3377 | cfs_rq->throttled_clock = rq_clock(rq); |
3362 | raw_spin_lock(&cfs_b->lock); | 3378 | raw_spin_lock(&cfs_b->lock); |
3363 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); | 3379 | /* |
3380 | * Add to the _head_ of the list, so that an already-started | ||
3381 | * distribute_cfs_runtime will not see us | ||
3382 | */ | ||
3383 | list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); | ||
3364 | if (!cfs_b->timer_active) | 3384 | if (!cfs_b->timer_active) |
3365 | __start_cfs_bandwidth(cfs_b, false); | 3385 | __start_cfs_bandwidth(cfs_b, false); |
3366 | raw_spin_unlock(&cfs_b->lock); | 3386 | raw_spin_unlock(&cfs_b->lock); |
@@ -3410,14 +3430,15 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) | |||
3410 | 3430 | ||
3411 | /* determine whether we need to wake up potentially idle cpu */ | 3431 | /* determine whether we need to wake up potentially idle cpu */ |
3412 | if (rq->curr == rq->idle && rq->cfs.nr_running) | 3432 | if (rq->curr == rq->idle && rq->cfs.nr_running) |
3413 | resched_task(rq->curr); | 3433 | resched_curr(rq); |
3414 | } | 3434 | } |
3415 | 3435 | ||
3416 | static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, | 3436 | static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, |
3417 | u64 remaining, u64 expires) | 3437 | u64 remaining, u64 expires) |
3418 | { | 3438 | { |
3419 | struct cfs_rq *cfs_rq; | 3439 | struct cfs_rq *cfs_rq; |
3420 | u64 runtime = remaining; | 3440 | u64 runtime; |
3441 | u64 starting_runtime = remaining; | ||
3421 | 3442 | ||
3422 | rcu_read_lock(); | 3443 | rcu_read_lock(); |
3423 | list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, | 3444 | list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, |
@@ -3448,7 +3469,7 @@ next: | |||
3448 | } | 3469 | } |
3449 | rcu_read_unlock(); | 3470 | rcu_read_unlock(); |
3450 | 3471 | ||
3451 | return remaining; | 3472 | return starting_runtime - remaining; |
3452 | } | 3473 | } |
3453 | 3474 | ||
3454 | /* | 3475 | /* |
@@ -3494,22 +3515,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) | |||
3494 | /* account preceding periods in which throttling occurred */ | 3515 | /* account preceding periods in which throttling occurred */ |
3495 | cfs_b->nr_throttled += overrun; | 3516 | cfs_b->nr_throttled += overrun; |
3496 | 3517 | ||
3497 | /* | ||
3498 | * There are throttled entities so we must first use the new bandwidth | ||
3499 | * to unthrottle them before making it generally available. This | ||
3500 | * ensures that all existing debts will be paid before a new cfs_rq is | ||
3501 | * allowed to run. | ||
3502 | */ | ||
3503 | runtime = cfs_b->runtime; | ||
3504 | runtime_expires = cfs_b->runtime_expires; | 3518 | runtime_expires = cfs_b->runtime_expires; |
3505 | cfs_b->runtime = 0; | ||
3506 | 3519 | ||
3507 | /* | 3520 | /* |
3508 | * This check is repeated as we are holding onto the new bandwidth | 3521 | * This check is repeated as we are holding onto the new bandwidth while |
3509 | * while we unthrottle. This can potentially race with an unthrottled | 3522 | * we unthrottle. This can potentially race with an unthrottled group |
3510 | * group trying to acquire new bandwidth from the global pool. | 3523 | * trying to acquire new bandwidth from the global pool. This can result |
3524 | * in us over-using our runtime if it is all used during this loop, but | ||
3525 | * only by limited amounts in that extreme case. | ||
3511 | */ | 3526 | */ |
3512 | while (throttled && runtime > 0) { | 3527 | while (throttled && cfs_b->runtime > 0) { |
3528 | runtime = cfs_b->runtime; | ||
3513 | raw_spin_unlock(&cfs_b->lock); | 3529 | raw_spin_unlock(&cfs_b->lock); |
3514 | /* we can't nest cfs_b->lock while distributing bandwidth */ | 3530 | /* we can't nest cfs_b->lock while distributing bandwidth */ |
3515 | runtime = distribute_cfs_runtime(cfs_b, runtime, | 3531 | runtime = distribute_cfs_runtime(cfs_b, runtime, |
@@ -3517,10 +3533,10 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) | |||
3517 | raw_spin_lock(&cfs_b->lock); | 3533 | raw_spin_lock(&cfs_b->lock); |
3518 | 3534 | ||
3519 | throttled = !list_empty(&cfs_b->throttled_cfs_rq); | 3535 | throttled = !list_empty(&cfs_b->throttled_cfs_rq); |
3536 | |||
3537 | cfs_b->runtime -= min(runtime, cfs_b->runtime); | ||
3520 | } | 3538 | } |
3521 | 3539 | ||
3522 | /* return (any) remaining runtime */ | ||
3523 | cfs_b->runtime = runtime; | ||
3524 | /* | 3540 | /* |
3525 | * While we are ensured activity in the period following an | 3541 | * While we are ensured activity in the period following an |
3526 | * unthrottle, this also covers the case in which the new bandwidth is | 3542 | * unthrottle, this also covers the case in which the new bandwidth is |
@@ -3631,10 +3647,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) | |||
3631 | return; | 3647 | return; |
3632 | } | 3648 | } |
3633 | 3649 | ||
3634 | if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { | 3650 | if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) |
3635 | runtime = cfs_b->runtime; | 3651 | runtime = cfs_b->runtime; |
3636 | cfs_b->runtime = 0; | 3652 | |
3637 | } | ||
3638 | expires = cfs_b->runtime_expires; | 3653 | expires = cfs_b->runtime_expires; |
3639 | raw_spin_unlock(&cfs_b->lock); | 3654 | raw_spin_unlock(&cfs_b->lock); |
3640 | 3655 | ||
@@ -3645,7 +3660,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) | |||
3645 | 3660 | ||
3646 | raw_spin_lock(&cfs_b->lock); | 3661 | raw_spin_lock(&cfs_b->lock); |
3647 | if (expires == cfs_b->runtime_expires) | 3662 | if (expires == cfs_b->runtime_expires) |
3648 | cfs_b->runtime = runtime; | 3663 | cfs_b->runtime -= min(runtime, cfs_b->runtime); |
3649 | raw_spin_unlock(&cfs_b->lock); | 3664 | raw_spin_unlock(&cfs_b->lock); |
3650 | } | 3665 | } |
3651 | 3666 | ||
@@ -3775,6 +3790,19 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | |||
3775 | hrtimer_cancel(&cfs_b->slack_timer); | 3790 | hrtimer_cancel(&cfs_b->slack_timer); |
3776 | } | 3791 | } |
3777 | 3792 | ||
3793 | static void __maybe_unused update_runtime_enabled(struct rq *rq) | ||
3794 | { | ||
3795 | struct cfs_rq *cfs_rq; | ||
3796 | |||
3797 | for_each_leaf_cfs_rq(rq, cfs_rq) { | ||
3798 | struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; | ||
3799 | |||
3800 | raw_spin_lock(&cfs_b->lock); | ||
3801 | cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; | ||
3802 | raw_spin_unlock(&cfs_b->lock); | ||
3803 | } | ||
3804 | } | ||
3805 | |||
3778 | static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) | 3806 | static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) |
3779 | { | 3807 | { |
3780 | struct cfs_rq *cfs_rq; | 3808 | struct cfs_rq *cfs_rq; |
@@ -3788,6 +3816,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) | |||
3788 | * there's some valid quota amount | 3816 | * there's some valid quota amount |
3789 | */ | 3817 | */ |
3790 | cfs_rq->runtime_remaining = 1; | 3818 | cfs_rq->runtime_remaining = 1; |
3819 | /* | ||
3820 | * Offline rq is schedulable till cpu is completely disabled | ||
3821 | * in take_cpu_down(), so we prevent new cfs throttling here. | ||
3822 | */ | ||
3823 | cfs_rq->runtime_enabled = 0; | ||
3824 | |||
3791 | if (cfs_rq_throttled(cfs_rq)) | 3825 | if (cfs_rq_throttled(cfs_rq)) |
3792 | unthrottle_cfs_rq(cfs_rq); | 3826 | unthrottle_cfs_rq(cfs_rq); |
3793 | } | 3827 | } |
@@ -3831,6 +3865,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | |||
3831 | return NULL; | 3865 | return NULL; |
3832 | } | 3866 | } |
3833 | static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | 3867 | static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} |
3868 | static inline void update_runtime_enabled(struct rq *rq) {} | ||
3834 | static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} | 3869 | static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} |
3835 | 3870 | ||
3836 | #endif /* CONFIG_CFS_BANDWIDTH */ | 3871 | #endif /* CONFIG_CFS_BANDWIDTH */ |
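Together with the change to unthrottle_offline_cfs_rqs() above, the new update_runtime_enabled() forms the hotplug pair for CFS bandwidth: going offline clears cfs_rq->runtime_enabled so a runqueue that is being torn down cannot be throttled again, and coming back online re-derives the flag from the group's quota (the rq_online_fair() hunk further down is the caller). A small stand-alone model of that pairing; the struct layout and function names below are stand-ins, not the kernel's:

    #include <stdbool.h>
    #include <stdio.h>

    #define RUNTIME_INF (~0ULL)

    struct cfs_rq_model { bool runtime_enabled; bool throttled; };

    static void model_rq_offline(struct cfs_rq_model *cfs)
    {
            cfs->runtime_enabled = false;   /* no new throttling while dying */
            cfs->throttled = false;         /* and release anything throttled */
    }

    static void model_rq_online(struct cfs_rq_model *cfs, unsigned long long quota)
    {
            cfs->runtime_enabled = (quota != RUNTIME_INF);
    }

    int main(void)
    {
            struct cfs_rq_model cfs = { true, true };

            model_rq_offline(&cfs);
            printf("offline: enabled=%d throttled=%d\n",
                   cfs.runtime_enabled, cfs.throttled);
            model_rq_online(&cfs, 100000);
            printf("online:  enabled=%d\n", cfs.runtime_enabled);
            return 0;
    }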
@@ -3854,7 +3889,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | |||
3854 | 3889 | ||
3855 | if (delta < 0) { | 3890 | if (delta < 0) { |
3856 | if (rq->curr == p) | 3891 | if (rq->curr == p) |
3857 | resched_task(p); | 3892 | resched_curr(rq); |
3858 | return; | 3893 | return; |
3859 | } | 3894 | } |
3860 | 3895 | ||
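This is the first of many mechanical substitutions in this diff: resched_task(p) took a task and looked up its runqueue internally, while the new resched_curr(rq) takes the runqueue and marks whatever is running on it. Almost every caller already passed rq->curr (here p == rq->curr is checked just above), so the interface now matches the usage. A toy model of the two call shapes, user-space and purely illustrative:

    #include <stdio.h>

    struct task { const char *comm; int need_resched; };
    struct rq   { struct task *curr; };

    /* old interface: caller names the task, almost always rq->curr */
    static void resched_task(struct task *p) { p->need_resched = 1; }

    /* new interface: caller names the runqueue, current is implied */
    static void resched_curr(struct rq *rq) { rq->curr->need_resched = 1; }

    int main(void)
    {
            struct task t = { "demo", 0 };
            struct rq rq = { &t };

            resched_task(rq.curr);   /* old call pattern */
            resched_curr(&rq);       /* new call pattern  */
            printf("%s need_resched=%d\n", t.comm, t.need_resched);
            return 0;
    }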
@@ -4723,7 +4758,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
4723 | return; | 4758 | return; |
4724 | 4759 | ||
4725 | preempt: | 4760 | preempt: |
4726 | resched_task(curr); | 4761 | resched_curr(rq); |
4727 | /* | 4762 | /* |
4728 | * Only set the backward buddy when the current task is still | 4763 | * Only set the backward buddy when the current task is still |
4729 | * on the rq. This can happen when a wakeup gets interleaved | 4764 | * on the rq. This can happen when a wakeup gets interleaved |
@@ -5094,8 +5129,7 @@ static void move_task(struct task_struct *p, struct lb_env *env) | |||
5094 | /* | 5129 | /* |
5095 | * Is this task likely cache-hot: | 5130 | * Is this task likely cache-hot: |
5096 | */ | 5131 | */ |
5097 | static int | 5132 | static int task_hot(struct task_struct *p, struct lb_env *env) |
5098 | task_hot(struct task_struct *p, u64 now) | ||
5099 | { | 5133 | { |
5100 | s64 delta; | 5134 | s64 delta; |
5101 | 5135 | ||
@@ -5108,7 +5142,7 @@ task_hot(struct task_struct *p, u64 now) | |||
5108 | /* | 5142 | /* |
5109 | * Buddy candidates are cache hot: | 5143 | * Buddy candidates are cache hot: |
5110 | */ | 5144 | */ |
5111 | if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && | 5145 | if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running && |
5112 | (&p->se == cfs_rq_of(&p->se)->next || | 5146 | (&p->se == cfs_rq_of(&p->se)->next || |
5113 | &p->se == cfs_rq_of(&p->se)->last)) | 5147 | &p->se == cfs_rq_of(&p->se)->last)) |
5114 | return 1; | 5148 | return 1; |
@@ -5118,7 +5152,7 @@ task_hot(struct task_struct *p, u64 now) | |||
5118 | if (sysctl_sched_migration_cost == 0) | 5152 | if (sysctl_sched_migration_cost == 0) |
5119 | return 0; | 5153 | return 0; |
5120 | 5154 | ||
5121 | delta = now - p->se.exec_start; | 5155 | delta = rq_clock_task(env->src_rq) - p->se.exec_start; |
5122 | 5156 | ||
5123 | return delta < (s64)sysctl_sched_migration_cost; | 5157 | return delta < (s64)sysctl_sched_migration_cost; |
5124 | } | 5158 | } |
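task_hot() now receives the whole load-balancing environment instead of a precomputed timestamp: the buddy check reads env->dst_rq->nr_running rather than this_rq(), which can differ from the destination when balancing runs on another CPU's behalf, and the staleness check fetches the source runqueue's task clock itself. A reduced user-space version of the staleness test, with made-up clock values and a stand-in for lb_env:

    #include <stdio.h>

    struct rq_model  { unsigned long nr_running; unsigned long long clock_task; };
    struct env_model { struct rq_model *src_rq, *dst_rq; };
    struct task_model{ unsigned long long exec_start; };

    static const long long migration_cost = 500000;   /* stand-in sysctl, ns */

    static int task_hot(struct task_model *p, struct env_model *env)
    {
            /* clock of the rq the task sits on, read inside the helper now */
            long long delta = (long long)(env->src_rq->clock_task - p->exec_start);

            return delta < migration_cost;
    }

    int main(void)
    {
            struct rq_model src = { 2, 1000000 }, dst = { 0, 0 };
            struct env_model env = { &src, &dst };
            struct task_model p = { 700000 };

            printf("hot=%d\n", task_hot(&p, &env));   /* delta 300000 < cost */
            return 0;
    }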
@@ -5272,7 +5306,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
5272 | * 2) task is cache cold, or | 5306 | * 2) task is cache cold, or |
5273 | * 3) too many balance attempts have failed. | 5307 | * 3) too many balance attempts have failed. |
5274 | */ | 5308 | */ |
5275 | tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq)); | 5309 | tsk_cache_hot = task_hot(p, env); |
5276 | if (!tsk_cache_hot) | 5310 | if (!tsk_cache_hot) |
5277 | tsk_cache_hot = migrate_degrades_locality(p, env); | 5311 | tsk_cache_hot = migrate_degrades_locality(p, env); |
5278 | 5312 | ||
@@ -5864,10 +5898,12 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro | |||
5864 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | 5898 | * @load_idx: Load index of sched_domain of this_cpu for load calc. |
5865 | * @local_group: Does group contain this_cpu. | 5899 | * @local_group: Does group contain this_cpu. |
5866 | * @sgs: variable to hold the statistics for this group. | 5900 | * @sgs: variable to hold the statistics for this group. |
5901 | * @overload: Indicate more than one runnable task for any CPU. | ||
5867 | */ | 5902 | */ |
5868 | static inline void update_sg_lb_stats(struct lb_env *env, | 5903 | static inline void update_sg_lb_stats(struct lb_env *env, |
5869 | struct sched_group *group, int load_idx, | 5904 | struct sched_group *group, int load_idx, |
5870 | int local_group, struct sg_lb_stats *sgs) | 5905 | int local_group, struct sg_lb_stats *sgs, |
5906 | bool *overload) | ||
5871 | { | 5907 | { |
5872 | unsigned long load; | 5908 | unsigned long load; |
5873 | int i; | 5909 | int i; |
@@ -5885,6 +5921,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
5885 | 5921 | ||
5886 | sgs->group_load += load; | 5922 | sgs->group_load += load; |
5887 | sgs->sum_nr_running += rq->nr_running; | 5923 | sgs->sum_nr_running += rq->nr_running; |
5924 | |||
5925 | if (rq->nr_running > 1) | ||
5926 | *overload = true; | ||
5927 | |||
5888 | #ifdef CONFIG_NUMA_BALANCING | 5928 | #ifdef CONFIG_NUMA_BALANCING |
5889 | sgs->nr_numa_running += rq->nr_numa_running; | 5929 | sgs->nr_numa_running += rq->nr_numa_running; |
5890 | sgs->nr_preferred_running += rq->nr_preferred_running; | 5930 | sgs->nr_preferred_running += rq->nr_preferred_running; |
@@ -5995,6 +6035,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
5995 | struct sched_group *sg = env->sd->groups; | 6035 | struct sched_group *sg = env->sd->groups; |
5996 | struct sg_lb_stats tmp_sgs; | 6036 | struct sg_lb_stats tmp_sgs; |
5997 | int load_idx, prefer_sibling = 0; | 6037 | int load_idx, prefer_sibling = 0; |
6038 | bool overload = false; | ||
5998 | 6039 | ||
5999 | if (child && child->flags & SD_PREFER_SIBLING) | 6040 | if (child && child->flags & SD_PREFER_SIBLING) |
6000 | prefer_sibling = 1; | 6041 | prefer_sibling = 1; |
@@ -6015,7 +6056,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
6015 | update_group_capacity(env->sd, env->dst_cpu); | 6056 | update_group_capacity(env->sd, env->dst_cpu); |
6016 | } | 6057 | } |
6017 | 6058 | ||
6018 | update_sg_lb_stats(env, sg, load_idx, local_group, sgs); | 6059 | update_sg_lb_stats(env, sg, load_idx, local_group, sgs, |
6060 | &overload); | ||
6019 | 6061 | ||
6020 | if (local_group) | 6062 | if (local_group) |
6021 | goto next_group; | 6063 | goto next_group; |
@@ -6049,6 +6091,13 @@ next_group: | |||
6049 | 6091 | ||
6050 | if (env->sd->flags & SD_NUMA) | 6092 | if (env->sd->flags & SD_NUMA) |
6051 | env->fbq_type = fbq_classify_group(&sds->busiest_stat); | 6093 | env->fbq_type = fbq_classify_group(&sds->busiest_stat); |
6094 | |||
6095 | if (!env->sd->parent) { | ||
6096 | /* update overload indicator if we are at root domain */ | ||
6097 | if (env->dst_rq->rd->overload != overload) | ||
6098 | env->dst_rq->rd->overload = overload; | ||
6099 | } | ||
6100 | |||
6052 | } | 6101 | } |
6053 | 6102 | ||
6054 | /** | 6103 | /** |
@@ -6767,7 +6816,8 @@ static int idle_balance(struct rq *this_rq) | |||
6767 | */ | 6816 | */ |
6768 | this_rq->idle_stamp = rq_clock(this_rq); | 6817 | this_rq->idle_stamp = rq_clock(this_rq); |
6769 | 6818 | ||
6770 | if (this_rq->avg_idle < sysctl_sched_migration_cost) { | 6819 | if (this_rq->avg_idle < sysctl_sched_migration_cost || |
6820 | !this_rq->rd->overload) { | ||
6771 | rcu_read_lock(); | 6821 | rcu_read_lock(); |
6772 | sd = rcu_dereference_check_sched_domain(this_rq->sd); | 6822 | sd = rcu_dereference_check_sched_domain(this_rq->sd); |
6773 | if (sd) | 6823 | if (sd) |
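These hunks implement the "fast idling when partially loaded" item from the merge log: update_sg_lb_stats() flags any runqueue carrying more than one runnable task, update_sd_lb_stats() publishes that into rq->rd->overload only at the root domain, and idle_balance() now bails out early when the root domain reports no overload, so a freshly idle CPU skips a pointless newidle balance. A toy of the bail-out condition, with invented values:

    #include <stdbool.h>
    #include <stdio.h>

    struct root_domain { bool overload; };
    struct rq_model    { unsigned long long avg_idle; struct root_domain *rd; };

    static const unsigned long long migration_cost = 500000;   /* stand-in */

    static bool should_try_newidle_balance(struct rq_model *rq)
    {
            /* too short an idle window, or nobody has anything to give away */
            if (rq->avg_idle < migration_cost || !rq->rd->overload)
                    return false;
            return true;
    }

    int main(void)
    {
            struct root_domain rd = { .overload = false };
            struct rq_model rq = { .avg_idle = 10000000, .rd = &rd };

            printf("%d\n", should_try_newidle_balance(&rq));  /* 0: skip */
            rd.overload = true;
            printf("%d\n", should_try_newidle_balance(&rq));  /* 1: balance */
            return 0;
    }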
@@ -7325,6 +7375,8 @@ void trigger_load_balance(struct rq *rq) | |||
7325 | static void rq_online_fair(struct rq *rq) | 7375 | static void rq_online_fair(struct rq *rq) |
7326 | { | 7376 | { |
7327 | update_sysctl(); | 7377 | update_sysctl(); |
7378 | |||
7379 | update_runtime_enabled(rq); | ||
7328 | } | 7380 | } |
7329 | 7381 | ||
7330 | static void rq_offline_fair(struct rq *rq) | 7382 | static void rq_offline_fair(struct rq *rq) |
@@ -7398,7 +7450,7 @@ static void task_fork_fair(struct task_struct *p) | |||
7398 | * 'current' within the tree based on its new key value. | 7450 | * 'current' within the tree based on its new key value. |
7399 | */ | 7451 | */ |
7400 | swap(curr->vruntime, se->vruntime); | 7452 | swap(curr->vruntime, se->vruntime); |
7401 | resched_task(rq->curr); | 7453 | resched_curr(rq); |
7402 | } | 7454 | } |
7403 | 7455 | ||
7404 | se->vruntime -= cfs_rq->min_vruntime; | 7456 | se->vruntime -= cfs_rq->min_vruntime; |
@@ -7423,7 +7475,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) | |||
7423 | */ | 7475 | */ |
7424 | if (rq->curr == p) { | 7476 | if (rq->curr == p) { |
7425 | if (p->prio > oldprio) | 7477 | if (p->prio > oldprio) |
7426 | resched_task(rq->curr); | 7478 | resched_curr(rq); |
7427 | } else | 7479 | } else |
7428 | check_preempt_curr(rq, p, 0); | 7480 | check_preempt_curr(rq, p, 0); |
7429 | } | 7481 | } |
@@ -7486,7 +7538,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) | |||
7486 | * if we can still preempt the current task. | 7538 | * if we can still preempt the current task. |
7487 | */ | 7539 | */ |
7488 | if (rq->curr == p) | 7540 | if (rq->curr == p) |
7489 | resched_task(rq->curr); | 7541 | resched_curr(rq); |
7490 | else | 7542 | else |
7491 | check_preempt_curr(rq, p, 0); | 7543 | check_preempt_curr(rq, p, 0); |
7492 | } | 7544 | } |
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index cf009fb0bc25..9f1608f99819 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
@@ -79,7 +79,7 @@ static void cpuidle_idle_call(void) | |||
79 | struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); | 79 | struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); |
80 | struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); | 80 | struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); |
81 | int next_state, entered_state; | 81 | int next_state, entered_state; |
82 | bool broadcast; | 82 | unsigned int broadcast; |
83 | 83 | ||
84 | /* | 84 | /* |
85 | * Check if the idle task must be rescheduled. If it is the | 85 | * Check if the idle task must be rescheduled. If it is the |
@@ -135,7 +135,7 @@ use_default: | |||
135 | goto exit_idle; | 135 | goto exit_idle; |
136 | } | 136 | } |
137 | 137 | ||
138 | broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); | 138 | broadcast = drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP; |
139 | 139 | ||
140 | /* | 140 | /* |
141 | * Tell the time framework to switch to a broadcast timer | 141 | * Tell the time framework to switch to a broadcast timer |
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 879f2b75266a..67ad4e7f506a 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
@@ -20,7 +20,7 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) | |||
20 | */ | 20 | */ |
21 | static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) | 21 | static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) |
22 | { | 22 | { |
23 | resched_task(rq->idle); | 23 | resched_curr(rq); |
24 | } | 24 | } |
25 | 25 | ||
26 | static struct task_struct * | 26 | static struct task_struct * |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a49083192c64..5f6edca4fafd 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -463,9 +463,10 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se); | |||
463 | static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | 463 | static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) |
464 | { | 464 | { |
465 | struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; | 465 | struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; |
466 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
466 | struct sched_rt_entity *rt_se; | 467 | struct sched_rt_entity *rt_se; |
467 | 468 | ||
468 | int cpu = cpu_of(rq_of_rt_rq(rt_rq)); | 469 | int cpu = cpu_of(rq); |
469 | 470 | ||
470 | rt_se = rt_rq->tg->rt_se[cpu]; | 471 | rt_se = rt_rq->tg->rt_se[cpu]; |
471 | 472 | ||
@@ -476,7 +477,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | |||
476 | enqueue_rt_entity(rt_se, false); | 477 | enqueue_rt_entity(rt_se, false); |
477 | 478 | ||
478 | if (rt_rq->highest_prio.curr < curr->prio) | 479 | if (rt_rq->highest_prio.curr < curr->prio) |
479 | resched_task(curr); | 480 | resched_curr(rq); |
480 | } | 481 | } |
481 | } | 482 | } |
482 | 483 | ||
@@ -566,7 +567,7 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | |||
566 | return; | 567 | return; |
567 | 568 | ||
568 | enqueue_top_rt_rq(rt_rq); | 569 | enqueue_top_rt_rq(rt_rq); |
569 | resched_task(rq->curr); | 570 | resched_curr(rq); |
570 | } | 571 | } |
571 | 572 | ||
572 | static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) | 573 | static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) |
@@ -740,6 +741,9 @@ balanced: | |||
740 | rt_rq->rt_throttled = 0; | 741 | rt_rq->rt_throttled = 0; |
741 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | 742 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
742 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 743 | raw_spin_unlock(&rt_b->rt_runtime_lock); |
744 | |||
745 | /* Make rt_rq available for pick_next_task() */ | ||
746 | sched_rt_rq_enqueue(rt_rq); | ||
743 | } | 747 | } |
744 | } | 748 | } |
745 | 749 | ||
@@ -948,7 +952,7 @@ static void update_curr_rt(struct rq *rq) | |||
948 | raw_spin_lock(&rt_rq->rt_runtime_lock); | 952 | raw_spin_lock(&rt_rq->rt_runtime_lock); |
949 | rt_rq->rt_time += delta_exec; | 953 | rt_rq->rt_time += delta_exec; |
950 | if (sched_rt_runtime_exceeded(rt_rq)) | 954 | if (sched_rt_runtime_exceeded(rt_rq)) |
951 | resched_task(curr); | 955 | resched_curr(rq); |
952 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | 956 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
953 | } | 957 | } |
954 | } | 958 | } |
@@ -1363,7 +1367,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | |||
1363 | * to try and push current away: | 1367 | * to try and push current away: |
1364 | */ | 1368 | */ |
1365 | requeue_task_rt(rq, p, 1); | 1369 | requeue_task_rt(rq, p, 1); |
1366 | resched_task(rq->curr); | 1370 | resched_curr(rq); |
1367 | } | 1371 | } |
1368 | 1372 | ||
1369 | #endif /* CONFIG_SMP */ | 1373 | #endif /* CONFIG_SMP */ |
@@ -1374,7 +1378,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | |||
1374 | static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) | 1378 | static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) |
1375 | { | 1379 | { |
1376 | if (p->prio < rq->curr->prio) { | 1380 | if (p->prio < rq->curr->prio) { |
1377 | resched_task(rq->curr); | 1381 | resched_curr(rq); |
1378 | return; | 1382 | return; |
1379 | } | 1383 | } |
1380 | 1384 | ||
@@ -1690,7 +1694,7 @@ retry: | |||
1690 | * just reschedule current. | 1694 | * just reschedule current. |
1691 | */ | 1695 | */ |
1692 | if (unlikely(next_task->prio < rq->curr->prio)) { | 1696 | if (unlikely(next_task->prio < rq->curr->prio)) { |
1693 | resched_task(rq->curr); | 1697 | resched_curr(rq); |
1694 | return 0; | 1698 | return 0; |
1695 | } | 1699 | } |
1696 | 1700 | ||
@@ -1737,7 +1741,7 @@ retry: | |||
1737 | activate_task(lowest_rq, next_task, 0); | 1741 | activate_task(lowest_rq, next_task, 0); |
1738 | ret = 1; | 1742 | ret = 1; |
1739 | 1743 | ||
1740 | resched_task(lowest_rq->curr); | 1744 | resched_curr(lowest_rq); |
1741 | 1745 | ||
1742 | double_unlock_balance(rq, lowest_rq); | 1746 | double_unlock_balance(rq, lowest_rq); |
1743 | 1747 | ||
@@ -1936,7 +1940,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) | |||
1936 | return; | 1940 | return; |
1937 | 1941 | ||
1938 | if (pull_rt_task(rq)) | 1942 | if (pull_rt_task(rq)) |
1939 | resched_task(rq->curr); | 1943 | resched_curr(rq); |
1940 | } | 1944 | } |
1941 | 1945 | ||
1942 | void __init init_sched_rt_class(void) | 1946 | void __init init_sched_rt_class(void) |
@@ -1974,7 +1978,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
1974 | check_resched = 0; | 1978 | check_resched = 0; |
1975 | #endif /* CONFIG_SMP */ | 1979 | #endif /* CONFIG_SMP */ |
1976 | if (check_resched && p->prio < rq->curr->prio) | 1980 | if (check_resched && p->prio < rq->curr->prio) |
1977 | resched_task(rq->curr); | 1981 | resched_curr(rq); |
1978 | } | 1982 | } |
1979 | } | 1983 | } |
1980 | 1984 | ||
@@ -2003,11 +2007,11 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) | |||
2003 | * Only reschedule if p is still on the same runqueue. | 2007 | * Only reschedule if p is still on the same runqueue. |
2004 | */ | 2008 | */ |
2005 | if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) | 2009 | if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) |
2006 | resched_task(p); | 2010 | resched_curr(rq); |
2007 | #else | 2011 | #else |
2008 | /* For UP simply resched on drop of prio */ | 2012 | /* For UP simply resched on drop of prio */ |
2009 | if (oldprio < p->prio) | 2013 | if (oldprio < p->prio) |
2010 | resched_task(p); | 2014 | resched_curr(rq); |
2011 | #endif /* CONFIG_SMP */ | 2015 | #endif /* CONFIG_SMP */ |
2012 | } else { | 2016 | } else { |
2013 | /* | 2017 | /* |
@@ -2016,7 +2020,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) | |||
2016 | * then reschedule. | 2020 | * then reschedule. |
2017 | */ | 2021 | */ |
2018 | if (p->prio < rq->curr->prio) | 2022 | if (p->prio < rq->curr->prio) |
2019 | resched_task(rq->curr); | 2023 | resched_curr(rq); |
2020 | } | 2024 | } |
2021 | } | 2025 | } |
2022 | 2026 | ||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 31cc02ebc54e..579712f4e9d5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -477,6 +477,9 @@ struct root_domain { | |||
477 | cpumask_var_t span; | 477 | cpumask_var_t span; |
478 | cpumask_var_t online; | 478 | cpumask_var_t online; |
479 | 479 | ||
480 | /* Indicate more than one runnable task for any CPU */ | ||
481 | bool overload; | ||
482 | |||
480 | /* | 483 | /* |
481 | * The bit corresponding to a CPU gets set here if such CPU has more | 484 | * The bit corresponding to a CPU gets set here if such CPU has more |
482 | * than one runnable -deadline task (as it is below for RT tasks). | 485 | * than one runnable -deadline task (as it is below for RT tasks). |
@@ -884,20 +887,10 @@ enum { | |||
884 | #undef SCHED_FEAT | 887 | #undef SCHED_FEAT |
885 | 888 | ||
886 | #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) | 889 | #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) |
887 | static __always_inline bool static_branch__true(struct static_key *key) | ||
888 | { | ||
889 | return static_key_true(key); /* Not out of line branch. */ | ||
890 | } | ||
891 | |||
892 | static __always_inline bool static_branch__false(struct static_key *key) | ||
893 | { | ||
894 | return static_key_false(key); /* Out of line branch. */ | ||
895 | } | ||
896 | |||
897 | #define SCHED_FEAT(name, enabled) \ | 890 | #define SCHED_FEAT(name, enabled) \ |
898 | static __always_inline bool static_branch_##name(struct static_key *key) \ | 891 | static __always_inline bool static_branch_##name(struct static_key *key) \ |
899 | { \ | 892 | { \ |
900 | return static_branch__##enabled(key); \ | 893 | return static_key_##enabled(key); \ |
901 | } | 894 | } |
902 | 895 | ||
903 | #include "features.h" | 896 | #include "features.h" |
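The static_branch__true()/static_branch__false() wrappers only forwarded to static_key_true()/static_key_false(), so SCHED_FEAT() now pastes the feature's true/false token straight into the static_key_* name and drops one layer of indirection. A user-space sketch of the token-pasting pattern; the stubbed static_key_true/false here merely model the real jump-label helpers:

    #include <stdbool.h>
    #include <stdio.h>

    struct static_key { bool on; };

    /* stand-ins: the real helpers emit in-line vs. out-of-line branches */
    static bool static_key_true(struct static_key *k)  { return k->on; }
    static bool static_key_false(struct static_key *k) { return k->on; }

    #define SCHED_FEAT(name, enabled)                                 \
            static bool static_branch_##name(struct static_key *key)  \
            { return static_key_##enabled(key); }

    SCHED_FEAT(DEMO_ON, true)    /* expands to static_key_true(key)  */
    SCHED_FEAT(DEMO_OFF, false)  /* expands to static_key_false(key) */

    int main(void)
    {
            struct static_key key = { true };

            printf("on=%d off=%d\n",
                   static_branch_DEMO_ON(&key), static_branch_DEMO_OFF(&key));
            return 0;
    }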
@@ -1196,7 +1189,7 @@ extern void init_sched_rt_class(void); | |||
1196 | extern void init_sched_fair_class(void); | 1189 | extern void init_sched_fair_class(void); |
1197 | extern void init_sched_dl_class(void); | 1190 | extern void init_sched_dl_class(void); |
1198 | 1191 | ||
1199 | extern void resched_task(struct task_struct *p); | 1192 | extern void resched_curr(struct rq *rq); |
1200 | extern void resched_cpu(int cpu); | 1193 | extern void resched_cpu(int cpu); |
1201 | 1194 | ||
1202 | extern struct rt_bandwidth def_rt_bandwidth; | 1195 | extern struct rt_bandwidth def_rt_bandwidth; |
@@ -1218,15 +1211,26 @@ static inline void add_nr_running(struct rq *rq, unsigned count) | |||
1218 | 1211 | ||
1219 | rq->nr_running = prev_nr + count; | 1212 | rq->nr_running = prev_nr + count; |
1220 | 1213 | ||
1221 | #ifdef CONFIG_NO_HZ_FULL | ||
1222 | if (prev_nr < 2 && rq->nr_running >= 2) { | 1214 | if (prev_nr < 2 && rq->nr_running >= 2) { |
1215 | #ifdef CONFIG_SMP | ||
1216 | if (!rq->rd->overload) | ||
1217 | rq->rd->overload = true; | ||
1218 | #endif | ||
1219 | |||
1220 | #ifdef CONFIG_NO_HZ_FULL | ||
1223 | if (tick_nohz_full_cpu(rq->cpu)) { | 1221 | if (tick_nohz_full_cpu(rq->cpu)) { |
1224 | /* Order rq->nr_running write against the IPI */ | 1222 | /* |
1225 | smp_wmb(); | 1223 | * Tick is needed if more than one task runs on a CPU. |
1226 | smp_send_reschedule(rq->cpu); | 1224 | * Send the target an IPI to kick it out of nohz mode. |
1225 | * | ||
1226 | * We assume that IPI implies full memory barrier and the | ||
1227 | * new value of rq->nr_running is visible on reception | ||
1228 | * from the target. | ||
1229 | */ | ||
1230 | tick_nohz_full_kick_cpu(rq->cpu); | ||
1227 | } | 1231 | } |
1228 | } | ||
1229 | #endif | 1232 | #endif |
1233 | } | ||
1230 | } | 1234 | } |
1231 | 1235 | ||
1232 | static inline void sub_nr_running(struct rq *rq, unsigned count) | 1236 | static inline void sub_nr_running(struct rq *rq, unsigned count) |
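add_nr_running() is where both new signals originate: when a runqueue goes from one runnable task to two it sets rq->rd->overload (consumed by the idle_balance() gate shown earlier), and under NO_HZ_FULL it now kicks the target out of tickless mode through tick_nohz_full_kick_cpu() instead of the old open-coded smp_wmb() plus reschedule IPI, relying on the IPI's implied full barrier as the new comment states. A toy of the 1 -> 2 transition, with the kick reduced to a printf:

    #include <stdbool.h>
    #include <stdio.h>

    struct rd_model { bool overload; };
    struct rq_model { unsigned nr_running; struct rd_model *rd; int cpu; };

    static void add_nr_running(struct rq_model *rq, unsigned count)
    {
            unsigned prev = rq->nr_running;

            rq->nr_running = prev + count;
            if (prev < 2 && rq->nr_running >= 2) {
                    rq->rd->overload = true;                      /* SMP hint  */
                    printf("kick cpu %d out of nohz\n", rq->cpu); /* NO_HZ_FULL */
            }
    }

    int main(void)
    {
            struct rd_model rd = { false };
            struct rq_model rq = { 1, &rd, 3 };

            add_nr_running(&rq, 1);
            printf("overload=%d\n", rd.overload);
            return 0;
    }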
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 0ffa20ae657b..15cab1a4f84e 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c | |||
@@ -319,14 +319,14 @@ EXPORT_SYMBOL(wake_bit_function); | |||
319 | */ | 319 | */ |
320 | int __sched | 320 | int __sched |
321 | __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, | 321 | __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, |
322 | int (*action)(void *), unsigned mode) | 322 | wait_bit_action_f *action, unsigned mode) |
323 | { | 323 | { |
324 | int ret = 0; | 324 | int ret = 0; |
325 | 325 | ||
326 | do { | 326 | do { |
327 | prepare_to_wait(wq, &q->wait, mode); | 327 | prepare_to_wait(wq, &q->wait, mode); |
328 | if (test_bit(q->key.bit_nr, q->key.flags)) | 328 | if (test_bit(q->key.bit_nr, q->key.flags)) |
329 | ret = (*action)(q->key.flags); | 329 | ret = (*action)(&q->key); |
330 | } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); | 330 | } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); |
331 | finish_wait(wq, &q->wait); | 331 | finish_wait(wq, &q->wait); |
332 | return ret; | 332 | return ret; |
@@ -334,7 +334,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, | |||
334 | EXPORT_SYMBOL(__wait_on_bit); | 334 | EXPORT_SYMBOL(__wait_on_bit); |
335 | 335 | ||
336 | int __sched out_of_line_wait_on_bit(void *word, int bit, | 336 | int __sched out_of_line_wait_on_bit(void *word, int bit, |
337 | int (*action)(void *), unsigned mode) | 337 | wait_bit_action_f *action, unsigned mode) |
338 | { | 338 | { |
339 | wait_queue_head_t *wq = bit_waitqueue(word, bit); | 339 | wait_queue_head_t *wq = bit_waitqueue(word, bit); |
340 | DEFINE_WAIT_BIT(wait, word, bit); | 340 | DEFINE_WAIT_BIT(wait, word, bit); |
@@ -345,7 +345,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit); | |||
345 | 345 | ||
346 | int __sched | 346 | int __sched |
347 | __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, | 347 | __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, |
348 | int (*action)(void *), unsigned mode) | 348 | wait_bit_action_f *action, unsigned mode) |
349 | { | 349 | { |
350 | do { | 350 | do { |
351 | int ret; | 351 | int ret; |
@@ -353,7 +353,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, | |||
353 | prepare_to_wait_exclusive(wq, &q->wait, mode); | 353 | prepare_to_wait_exclusive(wq, &q->wait, mode); |
354 | if (!test_bit(q->key.bit_nr, q->key.flags)) | 354 | if (!test_bit(q->key.bit_nr, q->key.flags)) |
355 | continue; | 355 | continue; |
356 | ret = action(q->key.flags); | 356 | ret = action(&q->key); |
357 | if (!ret) | 357 | if (!ret) |
358 | continue; | 358 | continue; |
359 | abort_exclusive_wait(wq, &q->wait, mode, &q->key); | 359 | abort_exclusive_wait(wq, &q->wait, mode, &q->key); |
@@ -365,7 +365,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, | |||
365 | EXPORT_SYMBOL(__wait_on_bit_lock); | 365 | EXPORT_SYMBOL(__wait_on_bit_lock); |
366 | 366 | ||
367 | int __sched out_of_line_wait_on_bit_lock(void *word, int bit, | 367 | int __sched out_of_line_wait_on_bit_lock(void *word, int bit, |
368 | int (*action)(void *), unsigned mode) | 368 | wait_bit_action_f *action, unsigned mode) |
369 | { | 369 | { |
370 | wait_queue_head_t *wq = bit_waitqueue(word, bit); | 370 | wait_queue_head_t *wq = bit_waitqueue(word, bit); |
371 | DEFINE_WAIT_BIT(wait, word, bit); | 371 | DEFINE_WAIT_BIT(wait, word, bit); |
@@ -502,3 +502,21 @@ void wake_up_atomic_t(atomic_t *p) | |||
502 | __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); | 502 | __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); |
503 | } | 503 | } |
504 | EXPORT_SYMBOL(wake_up_atomic_t); | 504 | EXPORT_SYMBOL(wake_up_atomic_t); |
505 | |||
506 | __sched int bit_wait(struct wait_bit_key *word) | ||
507 | { | ||
508 | if (signal_pending_state(current->state, current)) | ||
509 | return 1; | ||
510 | schedule(); | ||
511 | return 0; | ||
512 | } | ||
513 | EXPORT_SYMBOL(bit_wait); | ||
514 | |||
515 | __sched int bit_wait_io(struct wait_bit_key *word) | ||
516 | { | ||
517 | if (signal_pending_state(current->state, current)) | ||
518 | return 1; | ||
519 | io_schedule(); | ||
520 | return 0; | ||
521 | } | ||
522 | EXPORT_SYMBOL(bit_wait_io); | ||
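The wait-on-bit callbacks change type from int (*)(void *) to wait_bit_action_f *, so an action now sees the whole wait_bit_key rather than just the flags word, and the bodies that callers used to duplicate become the shared bit_wait() and bit_wait_io() actions. Passing the key is what the merge log's "support a timeout" commit builds on, since per-wait state can ride in the key. A user-space sketch of that idea; wait_key, its timeout field and bit_wait_timeout_demo are stand-ins, not the kernel structures:

    #include <stdio.h>

    /* stand-in for struct wait_bit_key; the field names are illustrative */
    struct wait_key { void *flags; int bit_nr; unsigned long timeout; };
    typedef int wait_bit_action_f(struct wait_key *key);

    /* with the old int (*)(void *flags) type this deadline would have
     * nowhere to live; with the key it travels with the wait itself */
    static int bit_wait_timeout_demo(struct wait_key *key)
    {
            return key->timeout == 0;   /* pretend 0 means "already expired" */
    }

    int main(void)
    {
            struct wait_key key = { 0, 3, 0 };
            wait_bit_action_f *action = bit_wait_timeout_demo;

            printf("abort wait: %d\n", action(&key));
            return 0;
    }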
diff --git a/kernel/smp.c b/kernel/smp.c index 80c33f8de14f..487653b5844f 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -3,6 +3,7 @@ | |||
3 | * | 3 | * |
4 | * (C) Jens Axboe <jens.axboe@oracle.com> 2008 | 4 | * (C) Jens Axboe <jens.axboe@oracle.com> 2008 |
5 | */ | 5 | */ |
6 | #include <linux/irq_work.h> | ||
6 | #include <linux/rcupdate.h> | 7 | #include <linux/rcupdate.h> |
7 | #include <linux/rculist.h> | 8 | #include <linux/rculist.h> |
8 | #include <linux/kernel.h> | 9 | #include <linux/kernel.h> |
@@ -251,6 +252,14 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) | |||
251 | csd->func(csd->info); | 252 | csd->func(csd->info); |
252 | csd_unlock(csd); | 253 | csd_unlock(csd); |
253 | } | 254 | } |
255 | |||
256 | /* | ||
257 | * Handle irq works queued remotely by irq_work_queue_on(). | ||
258 | * Smp functions above are typically synchronous so they | ||
259 | * better run first since some other CPUs may be busy waiting | ||
260 | * for them. | ||
261 | */ | ||
262 | irq_work_run(); | ||
254 | } | 263 | } |
255 | 264 | ||
256 | /* | 265 | /* |
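With irq_work_queue_on() able to target another CPU (used by tick_nohz_full_kick_cpu() in the next file), the generic smp-call IPI handler also drains the remote irq_work list, and the new comment spells out the ordering: the synchronous smp_call_function callbacks run first because their senders may be spin-waiting on completion. A toy of that drain order with the two queues reduced to counters:

    #include <stdio.h>

    static int pending_csd = 2;        /* synchronous cross-CPU callbacks */
    static int pending_irq_work = 1;   /* work queued via irq_work_queue_on() */

    static void flush_smp_call_function_queue(void)
    {
            while (pending_csd--)        /* senders may be busy-waiting on these */
                    printf("ran csd callback\n");

            while (pending_irq_work--)   /* only then service remote irq work */
                    printf("ran remote irq_work\n");
    }

    int main(void)
    {
            flush_smp_call_function_queue();
            return 0;
    }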
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f784d83e29f1..99aa6ee3908f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -225,13 +225,15 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { | |||
225 | }; | 225 | }; |
226 | 226 | ||
227 | /* | 227 | /* |
228 | * Kick the current CPU if it's full dynticks in order to force it to | 228 | * Kick the CPU if it's full dynticks in order to force it to |
229 | * re-evaluate its dependency on the tick and restart it if necessary. | 229 | * re-evaluate its dependency on the tick and restart it if necessary. |
230 | */ | 230 | */ |
231 | void tick_nohz_full_kick(void) | 231 | void tick_nohz_full_kick_cpu(int cpu) |
232 | { | 232 | { |
233 | if (tick_nohz_full_cpu(smp_processor_id())) | 233 | if (!tick_nohz_full_cpu(cpu)) |
234 | irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); | 234 | return; |
235 | |||
236 | irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); | ||
235 | } | 237 | } |
236 | 238 | ||
237 | static void nohz_full_kick_ipi(void *info) | 239 | static void nohz_full_kick_ipi(void *info) |
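tick_nohz_full_kick() could only prod the local CPU because irq_work_queue() queues on the current CPU; the renamed tick_nohz_full_kick_cpu() uses the new irq_work_queue_on() so any CPU, here the one whose nr_running just crossed two in add_nr_running(), can be forced to re-evaluate its tick dependency. A user-space toy of the targeted kick; the irq_work_queue_on() below is a stand-in that "runs" the work immediately rather than the real remote-queue primitive:

    #include <stdio.h>

    struct irq_work { void (*func)(void); };

    static void nohz_full_kick_work_fn(void)
    {
            printf("re-evaluate tick dependency\n");
    }

    /* stand-in: the real primitive queues the work and raises an IPI */
    static void irq_work_queue_on(struct irq_work *work, int cpu)
    {
            printf("queued on cpu %d: ", cpu);
            work->func();
    }

    static void tick_nohz_full_kick_cpu(int cpu)
    {
            static struct irq_work kick_work = { nohz_full_kick_work_fn };

            /* the real code first checks tick_nohz_full_cpu(cpu) */
            irq_work_queue_on(&kick_work, cpu);
    }

    int main(void)
    {
            tick_nohz_full_kick_cpu(2);
            return 0;
    }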