author	Linus Torvalds <torvalds@linux-foundation.org>	2011-10-26 11:08:43 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-10-26 11:08:43 -0400
commit	8a4a8918ed6e4a361f4df19f199bbc2d0a89a46c (patch)
tree	d76974986aaaa8549baf2d6a106fa6cb60d64b88
parent	8686a0e200419322654a75155e2e6f80346a1297 (diff)
parent	540f41edc15473ca3b2876de72646546ae101374 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (46 commits)
  llist: Add back llist_add_batch() and llist_del_first() prototypes
  sched: Don't use tasklist_lock for debug prints
  sched: Warn on rt throttling
  sched: Unify the ->cpus_allowed mask copy
  sched: Wrap scheduler p->cpus_allowed access
  sched: Request for idle balance during nohz idle load balance
  sched: Use resched IPI to kick off the nohz idle balance
  sched: Fix idle_cpu()
  llist: Remove cpu_relax() usage in cmpxchg loops
  sched: Convert to struct llist
  llist: Add llist_next()
  irq_work: Use llist in the struct irq_work logic
  llist: Return whether list is empty before adding in llist_add()
  llist: Move cpu_relax() to after the cmpxchg()
  llist: Remove the platform-dependent NMI checks
  llist: Make some llist functions inline
  sched, tracing: Show PREEMPT_ACTIVE state in trace_sched_switch
  sched: Remove redundant test in check_preempt_tick()
  sched: Add documentation for bandwidth control
  sched: Return unused runtime on group dequeue
  ...
-rw-r--r--	Documentation/scheduler/sched-bwc.txt	| 122
-rw-r--r--	drivers/acpi/apei/Kconfig	|   1
-rw-r--r--	include/linux/irq_work.h	|  15
-rw-r--r--	include/linux/llist.h	|  77
-rw-r--r--	include/linux/sched.h	|   7
-rw-r--r--	include/trace/events/sched.h	|   9
-rw-r--r--	init/Kconfig	|  12
-rw-r--r--	kernel/irq_work.c	|  91
-rw-r--r--	kernel/sched.c	| 666
-rw-r--r--	kernel/sched_cpupri.c	|  89
-rw-r--r--	kernel/sched_cpupri.h	|   7
-rw-r--r--	kernel/sched_fair.c	| 761
-rw-r--r--	kernel/sched_features.h	|   5
-rw-r--r--	kernel/sched_rt.c	|  99
-rw-r--r--	kernel/sched_stoptask.c	|   2
-rw-r--r--	kernel/sysctl.c	|  10
-rw-r--r--	lib/Kconfig	|   3
-rw-r--r--	lib/Makefile	|   4
-rw-r--r--	lib/llist.c	|  74
-rw-r--r--	lib/smp_processor_id.c	|   2
20 files changed, 1646 insertions(+), 410 deletions(-)
diff --git a/Documentation/scheduler/sched-bwc.txt b/Documentation/scheduler/sched-bwc.txt
new file mode 100644
index 000000000000..f6b1873f68ab
--- /dev/null
+++ b/Documentation/scheduler/sched-bwc.txt
@@ -0,0 +1,122 @@
CFS Bandwidth Control
=====================

[ This document only discusses CPU bandwidth control for SCHED_NORMAL.
  The SCHED_RT case is covered in Documentation/scheduler/sched-rt-group.txt ]

CFS bandwidth control is a CONFIG_FAIR_GROUP_SCHED extension which allows the
specification of the maximum CPU bandwidth available to a group or hierarchy.

The bandwidth allowed for a group is specified using a quota and period. Within
each given "period" (microseconds), a group is allowed to consume only up to
"quota" microseconds of CPU time. When the CPU bandwidth consumption of a
group exceeds this limit (for that period), the tasks belonging to its
hierarchy will be throttled and are not allowed to run again until the next
period.

A group's unused runtime is globally tracked, being refreshed back to the
quota specified above at each period boundary. As threads consume this
bandwidth it is transferred to cpu-local "silos" on a demand basis. The amount
transferred within each of these updates is tunable and described as the
"slice".

Management
----------
Quota and period are managed within the cpu subsystem via cgroupfs.

cpu.cfs_quota_us: the total available run-time within a period (in microseconds)
cpu.cfs_period_us: the length of a period (in microseconds)
cpu.stat: exports throttling statistics [explained further below]

The default values are:
	cpu.cfs_period_us=100ms
	cpu.cfs_quota_us=-1

A value of -1 for cpu.cfs_quota_us indicates that the group does not have any
bandwidth restriction in place; such a group is described as an unconstrained
bandwidth group. This represents the traditional work-conserving behavior for
CFS.

Writing any (valid) positive value(s) will enact the specified bandwidth limit.
The minimum allowed value for either the quota or the period is 1ms. There is
also an upper bound on the period length of 1s. Additional restrictions exist
when bandwidth limits are used in a hierarchical fashion; these are explained
in more detail below.

Writing any negative value to cpu.cfs_quota_us will remove the bandwidth limit
and return the group to an unconstrained state once more.

Any updates to a group's bandwidth specification will result in it becoming
unthrottled if it is in a constrained state.

System wide settings
--------------------
For efficiency run-time is transferred between the global pool and CPU local
"silos" in a batch fashion. This greatly reduces global accounting pressure
on large systems. The amount transferred each time such an update is required
is described as the "slice".

This is tunable via procfs:
	/proc/sys/kernel/sched_cfs_bandwidth_slice_us (default=5ms)

Larger slice values will reduce transfer overheads, while smaller values allow
for more fine-grained consumption.

Statistics
----------
A group's bandwidth statistics are exported via 3 fields in cpu.stat.

cpu.stat:
- nr_periods: Number of enforcement intervals that have elapsed.
- nr_throttled: Number of times the group has been throttled/limited.
- throttled_time: The total time duration (in nanoseconds) for which entities
  of the group have been throttled.

This interface is read-only.

Hierarchical considerations
---------------------------
The interface enforces that an individual entity's bandwidth is always
attainable, that is: max(c_i) <= C. However, over-subscription in the
aggregate case is explicitly allowed to enable work-conserving semantics
within a hierarchy.
	e.g. \Sum (c_i) may exceed C
[ Where C is the parent's bandwidth, and c_i its children ]


There are two ways in which a group may become throttled:
	a. it fully consumes its own quota within a period
	b. a parent's quota is fully consumed within its period

In case b) above, even though the child may have runtime remaining, it will
not be allowed to run until the parent's runtime is refreshed.

Examples
--------
1. Limit a group to 1 CPU worth of runtime.

	If period is 250ms and quota is also 250ms, the group will get
	1 CPU worth of runtime every 250ms.

	# echo 250000 > cpu.cfs_quota_us /* quota = 250ms */
	# echo 250000 > cpu.cfs_period_us /* period = 250ms */

2. Limit a group to 2 CPUs worth of runtime on a multi-CPU machine.

	With 500ms period and 1000ms quota, the group can get 2 CPUs worth of
	runtime every 500ms.

	# echo 1000000 > cpu.cfs_quota_us /* quota = 1000ms */
	# echo 500000 > cpu.cfs_period_us /* period = 500ms */

	The larger period here allows for increased burst capacity.

3. Limit a group to 20% of 1 CPU.

	With a 50ms period, a 10ms quota is equivalent to 20% of 1 CPU.

	# echo 10000 > cpu.cfs_quota_us /* quota = 10ms */
	# echo 50000 > cpu.cfs_period_us /* period = 50ms */

	By using a small period here we are ensuring a consistent latency
	response at the expense of burst capacity.

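[ Editor's note: the sketch below is not part of the patch. It is a minimal
  user-space illustration of driving the interface documented above, assuming
  a cgroup v1 "cpu" controller mounted at /sys/fs/cgroup/cpu and an
  already-created group directory named "demo"; both paths are assumptions. ]

/* Illustrative only: cap one cgroup at 20% of a CPU and dump its stats.
 * Paths assume a v1 "cpu" controller at /sys/fs/cgroup/cpu and an
 * existing group directory called "demo". */
#include <stdio.h>
#include <stdlib.h>

static void write_val(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f || fputs(val, f) == EOF || fclose(f) == EOF) {
		perror(path);
		exit(1);
	}
}

int main(void)
{
	char line[256];
	FILE *f;

	/* 10ms of quota every 50ms period == 20% of one CPU (example 3). */
	write_val("/sys/fs/cgroup/cpu/demo/cpu.cfs_period_us", "50000");
	write_val("/sys/fs/cgroup/cpu/demo/cpu.cfs_quota_us", "10000");

	/* cpu.stat reports nr_periods, nr_throttled and throttled_time. */
	f = fopen("/sys/fs/cgroup/cpu/demo/cpu.stat", "r");
	if (!f) {
		perror("cpu.stat");
		exit(1);
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}

Run as root after "mkdir /sys/fs/cgroup/cpu/demo"; the stats printed are the
three read-only fields described in the Statistics section above.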
diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig
index e3f47872ec22..f0c1ce95a0ec 100644
--- a/drivers/acpi/apei/Kconfig
+++ b/drivers/acpi/apei/Kconfig
@@ -14,7 +14,6 @@ config ACPI_APEI_GHES
 	depends on ACPI_APEI && X86
 	select ACPI_HED
 	select IRQ_WORK
-	select LLIST
 	select GENERIC_ALLOCATOR
 	help
 	  Generic Hardware Error Source provides a way to report
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index 4fa09d4d0b71..6a9e8f5399e2 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -1,20 +1,23 @@
 #ifndef _LINUX_IRQ_WORK_H
 #define _LINUX_IRQ_WORK_H
 
+#include <linux/llist.h>
+
 struct irq_work {
-	struct irq_work *next;
+	unsigned long flags;
+	struct llist_node llnode;
 	void (*func)(struct irq_work *);
 };
 
 static inline
-void init_irq_work(struct irq_work *entry, void (*func)(struct irq_work *))
+void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *))
 {
-	entry->next = NULL;
-	entry->func = func;
+	work->flags = 0;
+	work->func = func;
 }
 
-bool irq_work_queue(struct irq_work *entry);
+bool irq_work_queue(struct irq_work *work);
 void irq_work_run(void);
-void irq_work_sync(struct irq_work *entry);
+void irq_work_sync(struct irq_work *work);
 
 #endif /* _LINUX_IRQ_WORK_H */
diff --git a/include/linux/llist.h b/include/linux/llist.h
index aa0c8b5b3cd0..7287734e08d1 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -35,10 +35,30 @@
  *
  * The basic atomic operation of this list is cmpxchg on long. On
  * architectures that don't have NMI-safe cmpxchg implementation, the
- * list can NOT be used in NMI handler. So code uses the list in NMI
- * handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
+ * list can NOT be used in NMI handlers. So code that uses the list in
+ * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
+ *
+ * Copyright 2010,2011 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
+#include <linux/kernel.h>
+#include <asm/system.h>
+#include <asm/processor.h>
+
 struct llist_head {
 	struct llist_node *first;
 };
@@ -113,14 +133,55 @@ static inline void init_llist_head(struct llist_head *list)
  * test whether the list is empty without deleting something from the
  * list.
  */
-static inline int llist_empty(const struct llist_head *head)
+static inline bool llist_empty(const struct llist_head *head)
 {
 	return ACCESS_ONCE(head->first) == NULL;
 }
 
-void llist_add(struct llist_node *new, struct llist_head *head);
-void llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
-		     struct llist_head *head);
-struct llist_node *llist_del_first(struct llist_head *head);
-struct llist_node *llist_del_all(struct llist_head *head);
+static inline struct llist_node *llist_next(struct llist_node *node)
+{
+	return node->next;
+}
+
+/**
+ * llist_add - add a new entry
+ * @new: new entry to be added
+ * @head: the head for your lock-less list
+ *
+ * Return whether list is empty before adding.
+ */
+static inline bool llist_add(struct llist_node *new, struct llist_head *head)
+{
+	struct llist_node *entry, *old_entry;
+
+	entry = head->first;
+	for (;;) {
+		old_entry = entry;
+		new->next = entry;
+		entry = cmpxchg(&head->first, old_entry, new);
+		if (entry == old_entry)
+			break;
+	}
+
+	return old_entry == NULL;
+}
+
+/**
+ * llist_del_all - delete all entries from lock-less list
+ * @head: the head of lock-less list to delete all entries
+ *
+ * If list is empty, return NULL, otherwise, delete all entries and
+ * return the pointer to the first entry. The order of entries
+ * deleted is from the newest to the oldest added one.
+ */
+static inline struct llist_node *llist_del_all(struct llist_head *head)
+{
+	return xchg(&head->first, NULL);
+}
+
+extern bool llist_add_batch(struct llist_node *new_first,
+			    struct llist_node *new_last,
+			    struct llist_head *head);
+extern struct llist_node *llist_del_first(struct llist_head *head);
+
 #endif /* LLIST_H */
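[ Editor's note: the following is not part of the patch. It is a rough
  user-space analogue of the lock-less list pattern above -- producers push
  with a cmpxchg loop, the consumer detaches the whole chain with a single
  xchg -- using C11 atomics in place of the kernel's cmpxchg()/xchg().
  Names and values are illustrative. ]

/* User-space sketch of the llist pattern: push returns whether the list
 * was empty beforehand (as llist_add() now does), pop_all detaches every
 * entry at once (as llist_del_all() does), newest entry first. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	struct node *next;
	int val;
};

static _Atomic(struct node *) head;

/* Mirrors llist_add(): compare-exchange loop, report prior emptiness. */
static bool push(struct node *n)
{
	struct node *first = atomic_load(&head);

	do {
		n->next = first;
	} while (!atomic_compare_exchange_weak(&head, &first, n));

	return first == NULL;
}

/* Mirrors llist_del_all(): one exchange takes the whole chain. */
static struct node *pop_all(void)
{
	return atomic_exchange(&head, (struct node *)NULL);
}

int main(void)
{
	struct node a = { .val = 1 }, b = { .val = 2 };

	printf("was empty: %d\n", push(&a));	/* 1 */
	printf("was empty: %d\n", push(&b));	/* 0 */

	for (struct node *n = pop_all(); n; n = n->next)
		printf("val=%d\n", n->val);	/* 2, then 1 */
	return 0;
}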
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ede8a6585e38..e8acce717d2a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -90,6 +90,7 @@ struct sched_param {
 #include <linux/task_io_accounting.h>
 #include <linux/latencytop.h>
 #include <linux/cred.h>
+#include <linux/llist.h>
 
 #include <asm/processor.h>
 
@@ -1224,7 +1225,7 @@ struct task_struct {
 	unsigned int ptrace;
 
 #ifdef CONFIG_SMP
-	struct task_struct *wake_entry;
+	struct llist_node wake_entry;
 	int on_cpu;
 #endif
 	int on_rq;
@@ -2035,6 +2036,10 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { }
 static inline void sched_autogroup_exit(struct signal_struct *sig) { }
 #endif
 
+#ifdef CONFIG_CFS_BANDWIDTH
+extern unsigned int sysctl_sched_cfs_bandwidth_slice;
+#endif
+
 #ifdef CONFIG_RT_MUTEXES
 extern int rt_mutex_getprio(struct task_struct *p);
 extern void rt_mutex_setprio(struct task_struct *p, int prio);
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index f6334782a593..959ff18b63b6 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -100,7 +100,7 @@ static inline long __trace_sched_switch_state(struct task_struct *p)
 	 * For all intents and purposes a preempted task is a running task.
 	 */
 	if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)
-		state = TASK_RUNNING;
+		state = TASK_RUNNING | TASK_STATE_MAX;
 #endif
 
 	return state;
@@ -137,13 +137,14 @@ TRACE_EVENT(sched_switch,
 		__entry->next_prio	= next->prio;
 	),
 
-	TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s ==> next_comm=%s next_pid=%d next_prio=%d",
+	TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
 		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
-		__entry->prev_state ?
-		  __print_flags(__entry->prev_state, "|",
+		__entry->prev_state & (TASK_STATE_MAX-1) ?
+		  __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|",
 				{ 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
 				{ 16, "Z" }, { 32, "X" }, { 64, "x" },
 				{ 128, "W" }) : "R",
+		__entry->prev_state & TASK_STATE_MAX ? "+" : "",
 		__entry->next_comm, __entry->next_pid, __entry->next_prio)
 );
 
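[ Editor's note: not part of the patch. A stand-alone sketch of how the
  modified TP_printk() above renders prev_state: the low bits map to the
  usual state letters and the TASK_STATE_MAX bit, which now carries
  PREEMPT_ACTIVE, is shown as a trailing "+". TASK_STATE_MAX is taken to be
  512 here, matching this kernel series; treat that constant as an
  assumption of the sketch. ]

/* Decode prev_state the way the tracepoint format string does. */
#include <stdio.h>

#define TASK_STATE_MAX 512	/* assumed value for this kernel series */

static void print_prev_state(long state)
{
	static const struct { long bit; const char *name; } flags[] = {
		{ 1, "S" }, { 2, "D" }, { 4, "T" }, { 8, "t" },
		{ 16, "Z" }, { 32, "X" }, { 64, "x" }, { 128, "W" },
	};
	long low = state & (TASK_STATE_MAX - 1);
	int printed = 0;

	if (!low)
		printf("R");			/* runnable */
	for (size_t i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
		if (low & flags[i].bit) {
			printf("%s%s", printed ? "|" : "", flags[i].name);
			printed = 1;
		}
	}
	/* preempted while runnable: TASK_RUNNING | TASK_STATE_MAX -> "R+" */
	printf("%s\n", (state & TASK_STATE_MAX) ? "+" : "");
}

int main(void)
{
	print_prev_state(0);			/* R  */
	print_prev_state(2);			/* D  */
	print_prev_state(TASK_STATE_MAX);	/* R+ */
	return 0;
}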
diff --git a/init/Kconfig b/init/Kconfig
index dc7e27bf89a8..31ba0fd0f36b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -715,6 +715,18 @@ config FAIR_GROUP_SCHED
 	depends on CGROUP_SCHED
 	default CGROUP_SCHED
 
+config CFS_BANDWIDTH
+	bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
+	depends on EXPERIMENTAL
+	depends on FAIR_GROUP_SCHED
+	default n
+	help
+	  This option allows users to define CPU bandwidth rates (limits) for
+	  tasks running within the fair group scheduler. Groups with no limit
+	  set are considered to be unconstrained and will run with no
+	  restriction.
+	  See Documentation/scheduler/sched-bwc.txt for more information.
+
 config RT_GROUP_SCHED
 	bool "Group scheduling for SCHED_RR/FIFO"
 	depends on EXPERIMENTAL
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index c58fa7da8aef..0e2cde4f380b 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -17,54 +17,34 @@
17 * claimed NULL, 3 -> {pending} : claimed to be enqueued 17 * claimed NULL, 3 -> {pending} : claimed to be enqueued
18 * pending next, 3 -> {busy} : queued, pending callback 18 * pending next, 3 -> {busy} : queued, pending callback
19 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed 19 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
20 *
21 * We use the lower two bits of the next pointer to keep PENDING and BUSY
22 * flags.
23 */ 20 */
24 21
25#define IRQ_WORK_PENDING 1UL 22#define IRQ_WORK_PENDING 1UL
26#define IRQ_WORK_BUSY 2UL 23#define IRQ_WORK_BUSY 2UL
27#define IRQ_WORK_FLAGS 3UL 24#define IRQ_WORK_FLAGS 3UL
28 25
29static inline bool irq_work_is_set(struct irq_work *entry, int flags) 26static DEFINE_PER_CPU(struct llist_head, irq_work_list);
30{
31 return (unsigned long)entry->next & flags;
32}
33
34static inline struct irq_work *irq_work_next(struct irq_work *entry)
35{
36 unsigned long next = (unsigned long)entry->next;
37 next &= ~IRQ_WORK_FLAGS;
38 return (struct irq_work *)next;
39}
40
41static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
42{
43 unsigned long next = (unsigned long)entry;
44 next |= flags;
45 return (struct irq_work *)next;
46}
47
48static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
49 27
50/* 28/*
51 * Claim the entry so that no one else will poke at it. 29 * Claim the entry so that no one else will poke at it.
52 */ 30 */
53static bool irq_work_claim(struct irq_work *entry) 31static bool irq_work_claim(struct irq_work *work)
54{ 32{
55 struct irq_work *next, *nflags; 33 unsigned long flags, nflags;
56 34
57 do { 35 for (;;) {
58 next = entry->next; 36 flags = work->flags;
59 if ((unsigned long)next & IRQ_WORK_PENDING) 37 if (flags & IRQ_WORK_PENDING)
60 return false; 38 return false;
61 nflags = next_flags(next, IRQ_WORK_FLAGS); 39 nflags = flags | IRQ_WORK_FLAGS;
62 } while (cmpxchg(&entry->next, next, nflags) != next); 40 if (cmpxchg(&work->flags, flags, nflags) == flags)
41 break;
42 cpu_relax();
43 }
63 44
64 return true; 45 return true;
65} 46}
66 47
67
68void __weak arch_irq_work_raise(void) 48void __weak arch_irq_work_raise(void)
69{ 49{
70 /* 50 /*
@@ -75,20 +55,15 @@ void __weak arch_irq_work_raise(void)
75/* 55/*
76 * Queue the entry and raise the IPI if needed. 56 * Queue the entry and raise the IPI if needed.
77 */ 57 */
78static void __irq_work_queue(struct irq_work *entry) 58static void __irq_work_queue(struct irq_work *work)
79{ 59{
80 struct irq_work *next; 60 bool empty;
81 61
82 preempt_disable(); 62 preempt_disable();
83 63
84 do { 64 empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
85 next = __this_cpu_read(irq_work_list);
86 /* Can assign non-atomic because we keep the flags set. */
87 entry->next = next_flags(next, IRQ_WORK_FLAGS);
88 } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
89
90 /* The list was empty, raise self-interrupt to start processing. */ 65 /* The list was empty, raise self-interrupt to start processing. */
91 if (!irq_work_next(entry)) 66 if (empty)
92 arch_irq_work_raise(); 67 arch_irq_work_raise();
93 68
94 preempt_enable(); 69 preempt_enable();
@@ -100,16 +75,16 @@ static void __irq_work_queue(struct irq_work *entry)
100 * 75 *
101 * Can be re-enqueued while the callback is still in progress. 76 * Can be re-enqueued while the callback is still in progress.
102 */ 77 */
103bool irq_work_queue(struct irq_work *entry) 78bool irq_work_queue(struct irq_work *work)
104{ 79{
105 if (!irq_work_claim(entry)) { 80 if (!irq_work_claim(work)) {
106 /* 81 /*
107 * Already enqueued, can't do! 82 * Already enqueued, can't do!
108 */ 83 */
109 return false; 84 return false;
110 } 85 }
111 86
112 __irq_work_queue(entry); 87 __irq_work_queue(work);
113 return true; 88 return true;
114} 89}
115EXPORT_SYMBOL_GPL(irq_work_queue); 90EXPORT_SYMBOL_GPL(irq_work_queue);
@@ -120,34 +95,34 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
120 */ 95 */
121void irq_work_run(void) 96void irq_work_run(void)
122{ 97{
123 struct irq_work *list; 98 struct irq_work *work;
99 struct llist_head *this_list;
100 struct llist_node *llnode;
124 101
125 if (this_cpu_read(irq_work_list) == NULL) 102 this_list = &__get_cpu_var(irq_work_list);
103 if (llist_empty(this_list))
126 return; 104 return;
127 105
128 BUG_ON(!in_irq()); 106 BUG_ON(!in_irq());
129 BUG_ON(!irqs_disabled()); 107 BUG_ON(!irqs_disabled());
130 108
131 list = this_cpu_xchg(irq_work_list, NULL); 109 llnode = llist_del_all(this_list);
132 110 while (llnode != NULL) {
133 while (list != NULL) { 111 work = llist_entry(llnode, struct irq_work, llnode);
134 struct irq_work *entry = list;
135 112
136 list = irq_work_next(list); 113 llnode = llist_next(llnode);
137 114
138 /* 115 /*
139 * Clear the PENDING bit, after this point the @entry 116 * Clear the PENDING bit, after this point the @work
140 * can be re-used. 117 * can be re-used.
141 */ 118 */
142 entry->next = next_flags(NULL, IRQ_WORK_BUSY); 119 work->flags = IRQ_WORK_BUSY;
143 entry->func(entry); 120 work->func(work);
144 /* 121 /*
145 * Clear the BUSY bit and return to the free state if 122 * Clear the BUSY bit and return to the free state if
146 * no-one else claimed it meanwhile. 123 * no-one else claimed it meanwhile.
147 */ 124 */
148 (void)cmpxchg(&entry->next, 125 (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0);
149 next_flags(NULL, IRQ_WORK_BUSY),
150 NULL);
151 } 126 }
152} 127}
153EXPORT_SYMBOL_GPL(irq_work_run); 128EXPORT_SYMBOL_GPL(irq_work_run);
@@ -156,11 +131,11 @@ EXPORT_SYMBOL_GPL(irq_work_run);
156 * Synchronize against the irq_work @entry, ensures the entry is not 131 * Synchronize against the irq_work @entry, ensures the entry is not
157 * currently in use. 132 * currently in use.
158 */ 133 */
159void irq_work_sync(struct irq_work *entry) 134void irq_work_sync(struct irq_work *work)
160{ 135{
161 WARN_ON_ONCE(irqs_disabled()); 136 WARN_ON_ONCE(irqs_disabled());
162 137
163 while (irq_work_is_set(entry, IRQ_WORK_BUSY)) 138 while (work->flags & IRQ_WORK_BUSY)
164 cpu_relax(); 139 cpu_relax();
165} 140}
166EXPORT_SYMBOL_GPL(irq_work_sync); 141EXPORT_SYMBOL_GPL(irq_work_sync);
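[ Editor's note: not part of the patch. A user-space sketch of the
  claim/run/complete cycle described in the state-machine comment at the top
  of kernel/irq_work.c, with the PENDING and BUSY bits kept in a separate
  flags word (as the patch now does) rather than in the low bits of the next
  pointer. C11 atomics stand in for the kernel's cmpxchg(). ]

/* free -> claimed (PENDING|BUSY) -> busy (BUSY only, callback running)
 * -> free again, unless the entry was re-claimed meanwhile. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define IRQ_WORK_PENDING	1UL
#define IRQ_WORK_BUSY		2UL
#define IRQ_WORK_FLAGS		3UL

struct irq_work {
	_Atomic unsigned long flags;
	void (*func)(struct irq_work *);
};

/* Mirrors irq_work_claim(): succeed only if not already pending. */
static bool claim(struct irq_work *work)
{
	unsigned long flags = atomic_load(&work->flags);

	for (;;) {
		if (flags & IRQ_WORK_PENDING)
			return false;
		if (atomic_compare_exchange_weak(&work->flags, &flags,
						 flags | IRQ_WORK_FLAGS))
			return true;
	}
}

/* Mirrors the run path: drop PENDING before the callback, then try to
 * return to the free state unless someone re-claimed the entry. */
static void run(struct irq_work *work)
{
	unsigned long busy = IRQ_WORK_BUSY;

	atomic_store(&work->flags, IRQ_WORK_BUSY);
	work->func(work);
	atomic_compare_exchange_strong(&work->flags, &busy, 0UL);
}

static void hello(struct irq_work *w) { (void)w; puts("callback"); }

int main(void)
{
	struct irq_work w = { .flags = 0, .func = hello };

	printf("claimed: %d\n", claim(&w));	/* 1 */
	printf("claimed: %d\n", claim(&w));	/* 0: already pending */
	run(&w);
	printf("claimed: %d\n", claim(&w));	/* 1: free again */
	return 0;
}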
diff --git a/kernel/sched.c b/kernel/sched.c
index 03ad0113801a..d87c6e5d4e8c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -196,10 +196,28 @@ static inline int rt_bandwidth_enabled(void)
196 return sysctl_sched_rt_runtime >= 0; 196 return sysctl_sched_rt_runtime >= 0;
197} 197}
198 198
199static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 199static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
200{ 200{
201 ktime_t now; 201 unsigned long delta;
202 ktime_t soft, hard, now;
203
204 for (;;) {
205 if (hrtimer_active(period_timer))
206 break;
207
208 now = hrtimer_cb_get_time(period_timer);
209 hrtimer_forward(period_timer, now, period);
202 210
211 soft = hrtimer_get_softexpires(period_timer);
212 hard = hrtimer_get_expires(period_timer);
213 delta = ktime_to_ns(ktime_sub(hard, soft));
214 __hrtimer_start_range_ns(period_timer, soft, delta,
215 HRTIMER_MODE_ABS_PINNED, 0);
216 }
217}
218
219static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
220{
203 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 221 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
204 return; 222 return;
205 223
@@ -207,22 +225,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
207 return; 225 return;
208 226
209 raw_spin_lock(&rt_b->rt_runtime_lock); 227 raw_spin_lock(&rt_b->rt_runtime_lock);
210 for (;;) { 228 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
211 unsigned long delta;
212 ktime_t soft, hard;
213
214 if (hrtimer_active(&rt_b->rt_period_timer))
215 break;
216
217 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
218 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
219
220 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
221 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
222 delta = ktime_to_ns(ktime_sub(hard, soft));
223 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
224 HRTIMER_MODE_ABS_PINNED, 0);
225 }
226 raw_spin_unlock(&rt_b->rt_runtime_lock); 229 raw_spin_unlock(&rt_b->rt_runtime_lock);
227} 230}
228 231
@@ -247,6 +250,24 @@ struct cfs_rq;
247 250
248static LIST_HEAD(task_groups); 251static LIST_HEAD(task_groups);
249 252
253struct cfs_bandwidth {
254#ifdef CONFIG_CFS_BANDWIDTH
255 raw_spinlock_t lock;
256 ktime_t period;
257 u64 quota, runtime;
258 s64 hierarchal_quota;
259 u64 runtime_expires;
260
261 int idle, timer_active;
262 struct hrtimer period_timer, slack_timer;
263 struct list_head throttled_cfs_rq;
264
265 /* statistics */
266 int nr_periods, nr_throttled;
267 u64 throttled_time;
268#endif
269};
270
250/* task group related information */ 271/* task group related information */
251struct task_group { 272struct task_group {
252 struct cgroup_subsys_state css; 273 struct cgroup_subsys_state css;
@@ -278,6 +299,8 @@ struct task_group {
278#ifdef CONFIG_SCHED_AUTOGROUP 299#ifdef CONFIG_SCHED_AUTOGROUP
279 struct autogroup *autogroup; 300 struct autogroup *autogroup;
280#endif 301#endif
302
303 struct cfs_bandwidth cfs_bandwidth;
281}; 304};
282 305
283/* task_group_lock serializes the addition/removal of task groups */ 306/* task_group_lock serializes the addition/removal of task groups */
@@ -311,7 +334,7 @@ struct task_group root_task_group;
311/* CFS-related fields in a runqueue */ 334/* CFS-related fields in a runqueue */
312struct cfs_rq { 335struct cfs_rq {
313 struct load_weight load; 336 struct load_weight load;
314 unsigned long nr_running; 337 unsigned long nr_running, h_nr_running;
315 338
316 u64 exec_clock; 339 u64 exec_clock;
317 u64 min_vruntime; 340 u64 min_vruntime;
@@ -377,9 +400,120 @@ struct cfs_rq {
377 400
378 unsigned long load_contribution; 401 unsigned long load_contribution;
379#endif 402#endif
403#ifdef CONFIG_CFS_BANDWIDTH
404 int runtime_enabled;
405 u64 runtime_expires;
406 s64 runtime_remaining;
407
408 u64 throttled_timestamp;
409 int throttled, throttle_count;
410 struct list_head throttled_list;
411#endif
380#endif 412#endif
381}; 413};
382 414
415#ifdef CONFIG_FAIR_GROUP_SCHED
416#ifdef CONFIG_CFS_BANDWIDTH
417static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
418{
419 return &tg->cfs_bandwidth;
420}
421
422static inline u64 default_cfs_period(void);
423static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
424static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
425
426static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
427{
428 struct cfs_bandwidth *cfs_b =
429 container_of(timer, struct cfs_bandwidth, slack_timer);
430 do_sched_cfs_slack_timer(cfs_b);
431
432 return HRTIMER_NORESTART;
433}
434
435static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
436{
437 struct cfs_bandwidth *cfs_b =
438 container_of(timer, struct cfs_bandwidth, period_timer);
439 ktime_t now;
440 int overrun;
441 int idle = 0;
442
443 for (;;) {
444 now = hrtimer_cb_get_time(timer);
445 overrun = hrtimer_forward(timer, now, cfs_b->period);
446
447 if (!overrun)
448 break;
449
450 idle = do_sched_cfs_period_timer(cfs_b, overrun);
451 }
452
453 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
454}
455
456static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
457{
458 raw_spin_lock_init(&cfs_b->lock);
459 cfs_b->runtime = 0;
460 cfs_b->quota = RUNTIME_INF;
461 cfs_b->period = ns_to_ktime(default_cfs_period());
462
463 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
464 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
465 cfs_b->period_timer.function = sched_cfs_period_timer;
466 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
467 cfs_b->slack_timer.function = sched_cfs_slack_timer;
468}
469
470static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
471{
472 cfs_rq->runtime_enabled = 0;
473 INIT_LIST_HEAD(&cfs_rq->throttled_list);
474}
475
476/* requires cfs_b->lock, may release to reprogram timer */
477static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
478{
479 /*
480 * The timer may be active because we're trying to set a new bandwidth
481 * period or because we're racing with the tear-down path
482 * (timer_active==0 becomes visible before the hrtimer call-back
483 * terminates). In either case we ensure that it's re-programmed
484 */
485 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
486 raw_spin_unlock(&cfs_b->lock);
487 /* ensure cfs_b->lock is available while we wait */
488 hrtimer_cancel(&cfs_b->period_timer);
489
490 raw_spin_lock(&cfs_b->lock);
491 /* if someone else restarted the timer then we're done */
492 if (cfs_b->timer_active)
493 return;
494 }
495
496 cfs_b->timer_active = 1;
497 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
498}
499
500static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
501{
502 hrtimer_cancel(&cfs_b->period_timer);
503 hrtimer_cancel(&cfs_b->slack_timer);
504}
505#else
506static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
507static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
508static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
509
510static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
511{
512 return NULL;
513}
514#endif /* CONFIG_CFS_BANDWIDTH */
515#endif /* CONFIG_FAIR_GROUP_SCHED */
516
383/* Real-Time classes' related field in a runqueue: */ 517/* Real-Time classes' related field in a runqueue: */
384struct rt_rq { 518struct rt_rq {
385 struct rt_prio_array active; 519 struct rt_prio_array active;
@@ -510,7 +644,7 @@ struct rq {
510 644
511 unsigned long cpu_power; 645 unsigned long cpu_power;
512 646
513 unsigned char idle_at_tick; 647 unsigned char idle_balance;
514 /* For active balancing */ 648 /* For active balancing */
515 int post_schedule; 649 int post_schedule;
516 int active_balance; 650 int active_balance;
@@ -520,8 +654,6 @@ struct rq {
520 int cpu; 654 int cpu;
521 int online; 655 int online;
522 656
523 unsigned long avg_load_per_task;
524
525 u64 rt_avg; 657 u64 rt_avg;
526 u64 age_stamp; 658 u64 age_stamp;
527 u64 idle_stamp; 659 u64 idle_stamp;
@@ -570,7 +702,7 @@ struct rq {
570#endif 702#endif
571 703
572#ifdef CONFIG_SMP 704#ifdef CONFIG_SMP
573 struct task_struct *wake_list; 705 struct llist_head wake_list;
574#endif 706#endif
575}; 707};
576 708
@@ -1272,6 +1404,18 @@ void wake_up_idle_cpu(int cpu)
1272 smp_send_reschedule(cpu); 1404 smp_send_reschedule(cpu);
1273} 1405}
1274 1406
1407static inline bool got_nohz_idle_kick(void)
1408{
1409 return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick;
1410}
1411
1412#else /* CONFIG_NO_HZ */
1413
1414static inline bool got_nohz_idle_kick(void)
1415{
1416 return false;
1417}
1418
1275#endif /* CONFIG_NO_HZ */ 1419#endif /* CONFIG_NO_HZ */
1276 1420
1277static u64 sched_avg_period(void) 1421static u64 sched_avg_period(void)
@@ -1471,24 +1615,28 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1471 update_load_sub(&rq->load, load); 1615 update_load_sub(&rq->load, load);
1472} 1616}
1473 1617
1474#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) 1618#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1619 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1475typedef int (*tg_visitor)(struct task_group *, void *); 1620typedef int (*tg_visitor)(struct task_group *, void *);
1476 1621
1477/* 1622/*
1478 * Iterate the full tree, calling @down when first entering a node and @up when 1623 * Iterate task_group tree rooted at *from, calling @down when first entering a
1479 * leaving it for the final time. 1624 * node and @up when leaving it for the final time.
1625 *
1626 * Caller must hold rcu_lock or sufficient equivalent.
1480 */ 1627 */
1481static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) 1628static int walk_tg_tree_from(struct task_group *from,
1629 tg_visitor down, tg_visitor up, void *data)
1482{ 1630{
1483 struct task_group *parent, *child; 1631 struct task_group *parent, *child;
1484 int ret; 1632 int ret;
1485 1633
1486 rcu_read_lock(); 1634 parent = from;
1487 parent = &root_task_group; 1635
1488down: 1636down:
1489 ret = (*down)(parent, data); 1637 ret = (*down)(parent, data);
1490 if (ret) 1638 if (ret)
1491 goto out_unlock; 1639 goto out;
1492 list_for_each_entry_rcu(child, &parent->children, siblings) { 1640 list_for_each_entry_rcu(child, &parent->children, siblings) {
1493 parent = child; 1641 parent = child;
1494 goto down; 1642 goto down;
@@ -1497,19 +1645,29 @@ up:
1497 continue; 1645 continue;
1498 } 1646 }
1499 ret = (*up)(parent, data); 1647 ret = (*up)(parent, data);
1500 if (ret) 1648 if (ret || parent == from)
1501 goto out_unlock; 1649 goto out;
1502 1650
1503 child = parent; 1651 child = parent;
1504 parent = parent->parent; 1652 parent = parent->parent;
1505 if (parent) 1653 if (parent)
1506 goto up; 1654 goto up;
1507out_unlock: 1655out:
1508 rcu_read_unlock();
1509
1510 return ret; 1656 return ret;
1511} 1657}
1512 1658
1659/*
1660 * Iterate the full tree, calling @down when first entering a node and @up when
1661 * leaving it for the final time.
1662 *
1663 * Caller must hold rcu_lock or sufficient equivalent.
1664 */
1665
1666static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1667{
1668 return walk_tg_tree_from(&root_task_group, down, up, data);
1669}
1670
1513static int tg_nop(struct task_group *tg, void *data) 1671static int tg_nop(struct task_group *tg, void *data)
1514{ 1672{
1515 return 0; 1673 return 0;
@@ -1569,11 +1727,9 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1569 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 1727 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1570 1728
1571 if (nr_running) 1729 if (nr_running)
1572 rq->avg_load_per_task = rq->load.weight / nr_running; 1730 return rq->load.weight / nr_running;
1573 else
1574 rq->avg_load_per_task = 0;
1575 1731
1576 return rq->avg_load_per_task; 1732 return 0;
1577} 1733}
1578 1734
1579#ifdef CONFIG_PREEMPT 1735#ifdef CONFIG_PREEMPT
@@ -1806,7 +1962,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1806 rq->nr_uninterruptible--; 1962 rq->nr_uninterruptible--;
1807 1963
1808 enqueue_task(rq, p, flags); 1964 enqueue_task(rq, p, flags);
1809 inc_nr_running(rq);
1810} 1965}
1811 1966
1812/* 1967/*
@@ -1818,7 +1973,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1818 rq->nr_uninterruptible++; 1973 rq->nr_uninterruptible++;
1819 1974
1820 dequeue_task(rq, p, flags); 1975 dequeue_task(rq, p, flags);
1821 dec_nr_running(rq);
1822} 1976}
1823 1977
1824#ifdef CONFIG_IRQ_TIME_ACCOUNTING 1978#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -2390,11 +2544,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2390 2544
2391 /* Look for allowed, online CPU in same node. */ 2545 /* Look for allowed, online CPU in same node. */
2392 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) 2546 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2393 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 2547 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
2394 return dest_cpu; 2548 return dest_cpu;
2395 2549
2396 /* Any allowed, online CPU? */ 2550 /* Any allowed, online CPU? */
2397 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); 2551 dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
2398 if (dest_cpu < nr_cpu_ids) 2552 if (dest_cpu < nr_cpu_ids)
2399 return dest_cpu; 2553 return dest_cpu;
2400 2554
@@ -2431,7 +2585,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2431 * [ this allows ->select_task() to simply return task_cpu(p) and 2585 * [ this allows ->select_task() to simply return task_cpu(p) and
2432 * not worry about this generic constraint ] 2586 * not worry about this generic constraint ]
2433 */ 2587 */
2434 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || 2588 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
2435 !cpu_online(cpu))) 2589 !cpu_online(cpu)))
2436 cpu = select_fallback_rq(task_cpu(p), p); 2590 cpu = select_fallback_rq(task_cpu(p), p);
2437 2591
@@ -2556,42 +2710,26 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
2556} 2710}
2557 2711
2558#ifdef CONFIG_SMP 2712#ifdef CONFIG_SMP
2559static void sched_ttwu_do_pending(struct task_struct *list) 2713static void sched_ttwu_pending(void)
2560{ 2714{
2561 struct rq *rq = this_rq(); 2715 struct rq *rq = this_rq();
2716 struct llist_node *llist = llist_del_all(&rq->wake_list);
2717 struct task_struct *p;
2562 2718
2563 raw_spin_lock(&rq->lock); 2719 raw_spin_lock(&rq->lock);
2564 2720
2565 while (list) { 2721 while (llist) {
2566 struct task_struct *p = list; 2722 p = llist_entry(llist, struct task_struct, wake_entry);
2567 list = list->wake_entry; 2723 llist = llist_next(llist);
2568 ttwu_do_activate(rq, p, 0); 2724 ttwu_do_activate(rq, p, 0);
2569 } 2725 }
2570 2726
2571 raw_spin_unlock(&rq->lock); 2727 raw_spin_unlock(&rq->lock);
2572} 2728}
2573 2729
2574#ifdef CONFIG_HOTPLUG_CPU
2575
2576static void sched_ttwu_pending(void)
2577{
2578 struct rq *rq = this_rq();
2579 struct task_struct *list = xchg(&rq->wake_list, NULL);
2580
2581 if (!list)
2582 return;
2583
2584 sched_ttwu_do_pending(list);
2585}
2586
2587#endif /* CONFIG_HOTPLUG_CPU */
2588
2589void scheduler_ipi(void) 2730void scheduler_ipi(void)
2590{ 2731{
2591 struct rq *rq = this_rq(); 2732 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
2592 struct task_struct *list = xchg(&rq->wake_list, NULL);
2593
2594 if (!list)
2595 return; 2733 return;
2596 2734
2597 /* 2735 /*
@@ -2608,25 +2746,21 @@ void scheduler_ipi(void)
2608 * somewhat pessimize the simple resched case. 2746 * somewhat pessimize the simple resched case.
2609 */ 2747 */
2610 irq_enter(); 2748 irq_enter();
2611 sched_ttwu_do_pending(list); 2749 sched_ttwu_pending();
2750
2751 /*
2752 * Check if someone kicked us for doing the nohz idle load balance.
2753 */
2754 if (unlikely(got_nohz_idle_kick() && !need_resched())) {
2755 this_rq()->idle_balance = 1;
2756 raise_softirq_irqoff(SCHED_SOFTIRQ);
2757 }
2612 irq_exit(); 2758 irq_exit();
2613} 2759}
2614 2760
2615static void ttwu_queue_remote(struct task_struct *p, int cpu) 2761static void ttwu_queue_remote(struct task_struct *p, int cpu)
2616{ 2762{
2617 struct rq *rq = cpu_rq(cpu); 2763 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
2618 struct task_struct *next = rq->wake_list;
2619
2620 for (;;) {
2621 struct task_struct *old = next;
2622
2623 p->wake_entry = next;
2624 next = cmpxchg(&rq->wake_list, old, p);
2625 if (next == old)
2626 break;
2627 }
2628
2629 if (!next)
2630 smp_send_reschedule(cpu); 2764 smp_send_reschedule(cpu);
2631} 2765}
2632 2766
@@ -2848,19 +2982,23 @@ void sched_fork(struct task_struct *p)
2848 p->state = TASK_RUNNING; 2982 p->state = TASK_RUNNING;
2849 2983
2850 /* 2984 /*
2985 * Make sure we do not leak PI boosting priority to the child.
2986 */
2987 p->prio = current->normal_prio;
2988
2989 /*
2851 * Revert to default priority/policy on fork if requested. 2990 * Revert to default priority/policy on fork if requested.
2852 */ 2991 */
2853 if (unlikely(p->sched_reset_on_fork)) { 2992 if (unlikely(p->sched_reset_on_fork)) {
2854 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { 2993 if (task_has_rt_policy(p)) {
2855 p->policy = SCHED_NORMAL; 2994 p->policy = SCHED_NORMAL;
2856 p->normal_prio = p->static_prio;
2857 }
2858
2859 if (PRIO_TO_NICE(p->static_prio) < 0) {
2860 p->static_prio = NICE_TO_PRIO(0); 2995 p->static_prio = NICE_TO_PRIO(0);
2861 p->normal_prio = p->static_prio; 2996 p->rt_priority = 0;
2862 set_load_weight(p); 2997 } else if (PRIO_TO_NICE(p->static_prio) < 0)
2863 } 2998 p->static_prio = NICE_TO_PRIO(0);
2999
3000 p->prio = p->normal_prio = __normal_prio(p);
3001 set_load_weight(p);
2864 3002
2865 /* 3003 /*
2866 * We don't need the reset flag anymore after the fork. It has 3004 * We don't need the reset flag anymore after the fork. It has
@@ -2869,11 +3007,6 @@ void sched_fork(struct task_struct *p)
2869 p->sched_reset_on_fork = 0; 3007 p->sched_reset_on_fork = 0;
2870 } 3008 }
2871 3009
2872 /*
2873 * Make sure we do not leak PI boosting priority to the child.
2874 */
2875 p->prio = current->normal_prio;
2876
2877 if (!rt_prio(p->prio)) 3010 if (!rt_prio(p->prio))
2878 p->sched_class = &fair_sched_class; 3011 p->sched_class = &fair_sched_class;
2879 3012
@@ -4116,7 +4249,7 @@ void scheduler_tick(void)
4116 perf_event_task_tick(); 4249 perf_event_task_tick();
4117 4250
4118#ifdef CONFIG_SMP 4251#ifdef CONFIG_SMP
4119 rq->idle_at_tick = idle_cpu(cpu); 4252 rq->idle_balance = idle_cpu(cpu);
4120 trigger_load_balance(rq, cpu); 4253 trigger_load_balance(rq, cpu);
4121#endif 4254#endif
4122} 4255}
@@ -4240,7 +4373,7 @@ pick_next_task(struct rq *rq)
4240 * Optimization: we know that if all tasks are in 4373 * Optimization: we know that if all tasks are in
4241 * the fair class we can call that function directly: 4374 * the fair class we can call that function directly:
4242 */ 4375 */
4243 if (likely(rq->nr_running == rq->cfs.nr_running)) { 4376 if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
4244 p = fair_sched_class.pick_next_task(rq); 4377 p = fair_sched_class.pick_next_task(rq);
4245 if (likely(p)) 4378 if (likely(p))
4246 return p; 4379 return p;
@@ -5026,7 +5159,20 @@ EXPORT_SYMBOL(task_nice);
5026 */ 5159 */
5027int idle_cpu(int cpu) 5160int idle_cpu(int cpu)
5028{ 5161{
5029 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 5162 struct rq *rq = cpu_rq(cpu);
5163
5164 if (rq->curr != rq->idle)
5165 return 0;
5166
5167 if (rq->nr_running)
5168 return 0;
5169
5170#ifdef CONFIG_SMP
5171 if (!llist_empty(&rq->wake_list))
5172 return 0;
5173#endif
5174
5175 return 1;
5030} 5176}
5031 5177
5032/** 5178/**
@@ -5876,7 +6022,7 @@ void show_state_filter(unsigned long state_filter)
5876 printk(KERN_INFO 6022 printk(KERN_INFO
5877 " task PC stack pid father\n"); 6023 " task PC stack pid father\n");
5878#endif 6024#endif
5879 read_lock(&tasklist_lock); 6025 rcu_read_lock();
5880 do_each_thread(g, p) { 6026 do_each_thread(g, p) {
5881 /* 6027 /*
5882 * reset the NMI-timeout, listing all files on a slow 6028 * reset the NMI-timeout, listing all files on a slow
@@ -5892,7 +6038,7 @@ void show_state_filter(unsigned long state_filter)
5892#ifdef CONFIG_SCHED_DEBUG 6038#ifdef CONFIG_SCHED_DEBUG
5893 sysrq_sched_debug_show(); 6039 sysrq_sched_debug_show();
5894#endif 6040#endif
5895 read_unlock(&tasklist_lock); 6041 rcu_read_unlock();
5896 /* 6042 /*
5897 * Only show locks if all tasks are dumped: 6043 * Only show locks if all tasks are dumped:
5898 */ 6044 */
@@ -6007,10 +6153,9 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
6007{ 6153{
6008 if (p->sched_class && p->sched_class->set_cpus_allowed) 6154 if (p->sched_class && p->sched_class->set_cpus_allowed)
6009 p->sched_class->set_cpus_allowed(p, new_mask); 6155 p->sched_class->set_cpus_allowed(p, new_mask);
6010 else { 6156
6011 cpumask_copy(&p->cpus_allowed, new_mask); 6157 cpumask_copy(&p->cpus_allowed, new_mask);
6012 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 6158 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
6013 }
6014} 6159}
6015 6160
6016/* 6161/*
@@ -6108,7 +6253,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6108 if (task_cpu(p) != src_cpu) 6253 if (task_cpu(p) != src_cpu)
6109 goto done; 6254 goto done;
6110 /* Affinity changed (again). */ 6255 /* Affinity changed (again). */
6111 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 6256 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
6112 goto fail; 6257 goto fail;
6113 6258
6114 /* 6259 /*
@@ -6189,6 +6334,30 @@ static void calc_global_load_remove(struct rq *rq)
6189 rq->calc_load_active = 0; 6334 rq->calc_load_active = 0;
6190} 6335}
6191 6336
6337#ifdef CONFIG_CFS_BANDWIDTH
6338static void unthrottle_offline_cfs_rqs(struct rq *rq)
6339{
6340 struct cfs_rq *cfs_rq;
6341
6342 for_each_leaf_cfs_rq(rq, cfs_rq) {
6343 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6344
6345 if (!cfs_rq->runtime_enabled)
6346 continue;
6347
6348 /*
6349 * clock_task is not advancing so we just need to make sure
6350 * there's some valid quota amount
6351 */
6352 cfs_rq->runtime_remaining = cfs_b->quota;
6353 if (cfs_rq_throttled(cfs_rq))
6354 unthrottle_cfs_rq(cfs_rq);
6355 }
6356}
6357#else
6358static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
6359#endif
6360
6192/* 6361/*
6193 * Migrate all tasks from the rq, sleeping tasks will be migrated by 6362 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6194 * try_to_wake_up()->select_task_rq(). 6363 * try_to_wake_up()->select_task_rq().
@@ -6214,6 +6383,9 @@ static void migrate_tasks(unsigned int dead_cpu)
6214 */ 6383 */
6215 rq->stop = NULL; 6384 rq->stop = NULL;
6216 6385
6386 /* Ensure any throttled groups are reachable by pick_next_task */
6387 unthrottle_offline_cfs_rqs(rq);
6388
6217 for ( ; ; ) { 6389 for ( ; ; ) {
6218 /* 6390 /*
6219 * There's this thread running, bail when that's the only 6391 * There's this thread running, bail when that's the only
@@ -7957,6 +8129,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7957 /* allow initial update_cfs_load() to truncate */ 8129 /* allow initial update_cfs_load() to truncate */
7958 cfs_rq->load_stamp = 1; 8130 cfs_rq->load_stamp = 1;
7959#endif 8131#endif
8132 init_cfs_rq_runtime(cfs_rq);
7960 8133
7961 tg->cfs_rq[cpu] = cfs_rq; 8134 tg->cfs_rq[cpu] = cfs_rq;
7962 tg->se[cpu] = se; 8135 tg->se[cpu] = se;
@@ -8096,6 +8269,7 @@ void __init sched_init(void)
8096 * We achieve this by letting root_task_group's tasks sit 8269 * We achieve this by letting root_task_group's tasks sit
8097 * directly in rq->cfs (i.e root_task_group->se[] = NULL). 8270 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
8098 */ 8271 */
8272 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
8099 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 8273 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
8100#endif /* CONFIG_FAIR_GROUP_SCHED */ 8274#endif /* CONFIG_FAIR_GROUP_SCHED */
8101 8275
@@ -8125,7 +8299,6 @@ void __init sched_init(void)
8125 rq_attach_root(rq, &def_root_domain); 8299 rq_attach_root(rq, &def_root_domain);
8126#ifdef CONFIG_NO_HZ 8300#ifdef CONFIG_NO_HZ
8127 rq->nohz_balance_kick = 0; 8301 rq->nohz_balance_kick = 0;
8128 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
8129#endif 8302#endif
8130#endif 8303#endif
8131 init_rq_hrtick(rq); 8304 init_rq_hrtick(rq);
@@ -8336,6 +8509,8 @@ static void free_fair_sched_group(struct task_group *tg)
8336{ 8509{
8337 int i; 8510 int i;
8338 8511
8512 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8513
8339 for_each_possible_cpu(i) { 8514 for_each_possible_cpu(i) {
8340 if (tg->cfs_rq) 8515 if (tg->cfs_rq)
8341 kfree(tg->cfs_rq[i]); 8516 kfree(tg->cfs_rq[i]);
@@ -8363,6 +8538,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8363 8538
8364 tg->shares = NICE_0_LOAD; 8539 tg->shares = NICE_0_LOAD;
8365 8540
8541 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8542
8366 for_each_possible_cpu(i) { 8543 for_each_possible_cpu(i) {
8367 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8544 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8368 GFP_KERNEL, cpu_to_node(i)); 8545 GFP_KERNEL, cpu_to_node(i));
@@ -8638,12 +8815,7 @@ unsigned long sched_group_shares(struct task_group *tg)
8638} 8815}
8639#endif 8816#endif
8640 8817
8641#ifdef CONFIG_RT_GROUP_SCHED 8818#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
8642/*
8643 * Ensure that the real time constraints are schedulable.
8644 */
8645static DEFINE_MUTEX(rt_constraints_mutex);
8646
8647static unsigned long to_ratio(u64 period, u64 runtime) 8819static unsigned long to_ratio(u64 period, u64 runtime)
8648{ 8820{
8649 if (runtime == RUNTIME_INF) 8821 if (runtime == RUNTIME_INF)
@@ -8651,6 +8823,13 @@ static unsigned long to_ratio(u64 period, u64 runtime)
8651 8823
8652 return div64_u64(runtime << 20, period); 8824 return div64_u64(runtime << 20, period);
8653} 8825}
8826#endif
8827
8828#ifdef CONFIG_RT_GROUP_SCHED
8829/*
8830 * Ensure that the real time constraints are schedulable.
8831 */
8832static DEFINE_MUTEX(rt_constraints_mutex);
8654 8833
8655/* Must be called with tasklist_lock held */ 8834/* Must be called with tasklist_lock held */
8656static inline int tg_has_rt_tasks(struct task_group *tg) 8835static inline int tg_has_rt_tasks(struct task_group *tg)
@@ -8671,7 +8850,7 @@ struct rt_schedulable_data {
8671 u64 rt_runtime; 8850 u64 rt_runtime;
8672}; 8851};
8673 8852
8674static int tg_schedulable(struct task_group *tg, void *data) 8853static int tg_rt_schedulable(struct task_group *tg, void *data)
8675{ 8854{
8676 struct rt_schedulable_data *d = data; 8855 struct rt_schedulable_data *d = data;
8677 struct task_group *child; 8856 struct task_group *child;
@@ -8729,16 +8908,22 @@ static int tg_schedulable(struct task_group *tg, void *data)
8729 8908
8730static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8909static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8731{ 8910{
8911 int ret;
8912
8732 struct rt_schedulable_data data = { 8913 struct rt_schedulable_data data = {
8733 .tg = tg, 8914 .tg = tg,
8734 .rt_period = period, 8915 .rt_period = period,
8735 .rt_runtime = runtime, 8916 .rt_runtime = runtime,
8736 }; 8917 };
8737 8918
8738 return walk_tg_tree(tg_schedulable, tg_nop, &data); 8919 rcu_read_lock();
8920 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
8921 rcu_read_unlock();
8922
8923 return ret;
8739} 8924}
8740 8925
8741static int tg_set_bandwidth(struct task_group *tg, 8926static int tg_set_rt_bandwidth(struct task_group *tg,
8742 u64 rt_period, u64 rt_runtime) 8927 u64 rt_period, u64 rt_runtime)
8743{ 8928{
8744 int i, err = 0; 8929 int i, err = 0;
@@ -8777,7 +8962,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8777 if (rt_runtime_us < 0) 8962 if (rt_runtime_us < 0)
8778 rt_runtime = RUNTIME_INF; 8963 rt_runtime = RUNTIME_INF;
8779 8964
8780 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8965 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
8781} 8966}
8782 8967
8783long sched_group_rt_runtime(struct task_group *tg) 8968long sched_group_rt_runtime(struct task_group *tg)
@@ -8802,7 +8987,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8802 if (rt_period == 0) 8987 if (rt_period == 0)
8803 return -EINVAL; 8988 return -EINVAL;
8804 8989
8805 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8990 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
8806} 8991}
8807 8992
8808long sched_group_rt_period(struct task_group *tg) 8993long sched_group_rt_period(struct task_group *tg)
@@ -8992,6 +9177,238 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8992 9177
8993 return (u64) scale_load_down(tg->shares); 9178 return (u64) scale_load_down(tg->shares);
8994} 9179}
9180
9181#ifdef CONFIG_CFS_BANDWIDTH
9182static DEFINE_MUTEX(cfs_constraints_mutex);
9183
9184const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
9185const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
9186
9187static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
9188
9189static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9190{
9191 int i, ret = 0, runtime_enabled;
9192 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9193
9194 if (tg == &root_task_group)
9195 return -EINVAL;
9196
9197 /*
9198 * Ensure we have at some amount of bandwidth every period. This is
9199 * to prevent reaching a state of large arrears when throttled via
9200 * entity_tick() resulting in prolonged exit starvation.
9201 */
9202 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
9203 return -EINVAL;
9204
9205 /*
9206 * Likewise, bound things on the otherside by preventing insane quota
9207 * periods. This also allows us to normalize in computing quota
9208 * feasibility.
9209 */
9210 if (period > max_cfs_quota_period)
9211 return -EINVAL;
9212
9213 mutex_lock(&cfs_constraints_mutex);
9214 ret = __cfs_schedulable(tg, period, quota);
9215 if (ret)
9216 goto out_unlock;
9217
9218 runtime_enabled = quota != RUNTIME_INF;
9219 raw_spin_lock_irq(&cfs_b->lock);
9220 cfs_b->period = ns_to_ktime(period);
9221 cfs_b->quota = quota;
9222
9223 __refill_cfs_bandwidth_runtime(cfs_b);
9224 /* restart the period timer (if active) to handle new period expiry */
9225 if (runtime_enabled && cfs_b->timer_active) {
9226 /* force a reprogram */
9227 cfs_b->timer_active = 0;
9228 __start_cfs_bandwidth(cfs_b);
9229 }
9230 raw_spin_unlock_irq(&cfs_b->lock);
9231
9232 for_each_possible_cpu(i) {
9233 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
9234 struct rq *rq = rq_of(cfs_rq);
9235
9236 raw_spin_lock_irq(&rq->lock);
9237 cfs_rq->runtime_enabled = runtime_enabled;
9238 cfs_rq->runtime_remaining = 0;
9239
9240 if (cfs_rq_throttled(cfs_rq))
9241 unthrottle_cfs_rq(cfs_rq);
9242 raw_spin_unlock_irq(&rq->lock);
9243 }
9244out_unlock:
9245 mutex_unlock(&cfs_constraints_mutex);
9246
9247 return ret;
9248}
9249
9250int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9251{
9252 u64 quota, period;
9253
9254 period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
9255 if (cfs_quota_us < 0)
9256 quota = RUNTIME_INF;
9257 else
9258 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
9259
9260 return tg_set_cfs_bandwidth(tg, period, quota);
9261}
9262
9263long tg_get_cfs_quota(struct task_group *tg)
9264{
9265 u64 quota_us;
9266
9267 if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
9268 return -1;
9269
9270 quota_us = tg_cfs_bandwidth(tg)->quota;
9271 do_div(quota_us, NSEC_PER_USEC);
9272
9273 return quota_us;
9274}
9275
9276int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9277{
9278 u64 quota, period;
9279
9280 period = (u64)cfs_period_us * NSEC_PER_USEC;
9281 quota = tg_cfs_bandwidth(tg)->quota;
9282
9283 if (period <= 0)
9284 return -EINVAL;
9285
9286 return tg_set_cfs_bandwidth(tg, period, quota);
9287}
9288
9289long tg_get_cfs_period(struct task_group *tg)
9290{
9291 u64 cfs_period_us;
9292
9293 cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
9294 do_div(cfs_period_us, NSEC_PER_USEC);
9295
9296 return cfs_period_us;
9297}
9298
9299static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
9300{
9301 return tg_get_cfs_quota(cgroup_tg(cgrp));
9302}
9303
9304static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
9305 s64 cfs_quota_us)
9306{
9307 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
9308}
9309
9310static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
9311{
9312 return tg_get_cfs_period(cgroup_tg(cgrp));
9313}
9314
9315static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9316 u64 cfs_period_us)
9317{
9318 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
9319}
9320
9321struct cfs_schedulable_data {
9322 struct task_group *tg;
9323 u64 period, quota;
9324};
9325
9326/*
9327 * normalize group quota/period to be quota/max_period
9328 * note: units are usecs
9329 */
9330static u64 normalize_cfs_quota(struct task_group *tg,
9331 struct cfs_schedulable_data *d)
9332{
9333 u64 quota, period;
9334
9335 if (tg == d->tg) {
9336 period = d->period;
9337 quota = d->quota;
9338 } else {
9339 period = tg_get_cfs_period(tg);
9340 quota = tg_get_cfs_quota(tg);
9341 }
9342
9343 /* note: these should typically be equivalent */
9344 if (quota == RUNTIME_INF || quota == -1)
9345 return RUNTIME_INF;
9346
9347 return to_ratio(period, quota);
9348}
9349
9350static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
9351{
9352 struct cfs_schedulable_data *d = data;
9353 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9354 s64 quota = 0, parent_quota = -1;
9355
9356 if (!tg->parent) {
9357 quota = RUNTIME_INF;
9358 } else {
9359 struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent);
9360
9361 quota = normalize_cfs_quota(tg, d);
9362 parent_quota = parent_b->hierarchal_quota;
9363
9364 /*
9365 * ensure max(child_quota) <= parent_quota, inherit when no
9366 * limit is set
9367 */
9368 if (quota == RUNTIME_INF)
9369 quota = parent_quota;
9370 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
9371 return -EINVAL;
9372 }
9373 cfs_b->hierarchal_quota = quota;
9374
9375 return 0;
9376}
9377
9378static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
9379{
9380 int ret;
9381 struct cfs_schedulable_data data = {
9382 .tg = tg,
9383 .period = period,
9384 .quota = quota,
9385 };
9386
9387 if (quota != RUNTIME_INF) {
9388 do_div(data.period, NSEC_PER_USEC);
9389 do_div(data.quota, NSEC_PER_USEC);
9390 }
9391
9392 rcu_read_lock();
9393 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
9394 rcu_read_unlock();
9395
9396 return ret;
9397}
9398
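The walk above enforces the hierarchical constraint: each group's quota/period pair is normalized to a fixed-point ratio, a child may not exceed its parent, and a child with no limit inherits the parent's effective value. The standalone sketch below reproduces just that rule; the array-based tree and the toy_to_ratio() helper are illustrative assumptions, not the kernel's walk_tg_tree() machinery.

/* Toy sketch of the feasibility rule checked by tg_cfs_schedulable_down():
 * normalize quota/period to a fixed-point ratio, let unlimited children
 * inherit, and reject any child that exceeds its parent. */
#include <stdio.h>

#define UNLIMITED	(~0ULL)

struct group {
	int parent;			/* index of parent, -1 for root */
	unsigned long long quota_us;	/* UNLIMITED for no limit */
	unsigned long long period_us;
};

static unsigned long long toy_to_ratio(unsigned long long period,
				       unsigned long long quota)
{
	return (quota << 20) / period;	/* fixed point, in the spirit of to_ratio() */
}

/* returns 0 if feasible, -1 if some child exceeds its parent */
static int check(struct group *g, unsigned long long *eff, int n)
{
	int i;

	for (i = 0; i < n; i++) {	/* parents listed before children */
		unsigned long long q = g[i].quota_us == UNLIMITED ?
			UNLIMITED : toy_to_ratio(g[i].period_us, g[i].quota_us);

		if (g[i].parent < 0) {
			eff[i] = UNLIMITED;
			continue;
		}
		if (q == UNLIMITED)
			q = eff[g[i].parent];	/* inherit when no limit set */
		else if (eff[g[i].parent] != UNLIMITED && q > eff[g[i].parent])
			return -1;		/* would be -EINVAL above */
		eff[i] = q;
	}
	return 0;
}

int main(void)
{
	struct group g[] = {
		{ -1, UNLIMITED, 100000 },	/* root */
		{  0, 50000,     100000 },	/* 0.5 CPU */
		{  1, 80000,     100000 },	/* 0.8 CPU: exceeds its parent */
	};
	unsigned long long eff[3];

	printf("feasible: %s\n", check(g, eff, 3) ? "no" : "yes");
	return 0;
}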
9399static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
9400 struct cgroup_map_cb *cb)
9401{
9402 struct task_group *tg = cgroup_tg(cgrp);
9403 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9404
9405 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
9406 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
9407 cb->fill(cb, "throttled_time", cfs_b->throttled_time);
9408
9409 return 0;
9410}
9411#endif /* CONFIG_CFS_BANDWIDTH */
8995#endif /* CONFIG_FAIR_GROUP_SCHED */ 9412#endif /* CONFIG_FAIR_GROUP_SCHED */
8996 9413
8997#ifdef CONFIG_RT_GROUP_SCHED 9414#ifdef CONFIG_RT_GROUP_SCHED
@@ -9026,6 +9443,22 @@ static struct cftype cpu_files[] = {
9026 .write_u64 = cpu_shares_write_u64, 9443 .write_u64 = cpu_shares_write_u64,
9027 }, 9444 },
9028#endif 9445#endif
9446#ifdef CONFIG_CFS_BANDWIDTH
9447 {
9448 .name = "cfs_quota_us",
9449 .read_s64 = cpu_cfs_quota_read_s64,
9450 .write_s64 = cpu_cfs_quota_write_s64,
9451 },
9452 {
9453 .name = "cfs_period_us",
9454 .read_u64 = cpu_cfs_period_read_u64,
9455 .write_u64 = cpu_cfs_period_write_u64,
9456 },
9457 {
9458 .name = "stat",
9459 .read_map = cpu_stats_show,
9460 },
9461#endif
9029#ifdef CONFIG_RT_GROUP_SCHED 9462#ifdef CONFIG_RT_GROUP_SCHED
9030 { 9463 {
9031 .name = "rt_runtime_us", 9464 .name = "rt_runtime_us",
@@ -9335,4 +9768,3 @@ struct cgroup_subsys cpuacct_subsys = {
9335 .subsys_id = cpuacct_subsys_id, 9768 .subsys_id = cpuacct_subsys_id,
9336}; 9769};
9337#endif /* CONFIG_CGROUP_CPUACCT */ 9770#endif /* CONFIG_CGROUP_CPUACCT */
9338
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 2722dc1b4138..a86cf9d9eb11 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -47,9 +47,6 @@ static int convert_prio(int prio)
47 return cpupri; 47 return cpupri;
48} 48}
49 49
50#define for_each_cpupri_active(array, idx) \
51 for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES)
52
53/** 50/**
54 * cpupri_find - find the best (lowest-pri) CPU in the system 51 * cpupri_find - find the best (lowest-pri) CPU in the system
55 * @cp: The cpupri context 52 * @cp: The cpupri context
@@ -71,11 +68,38 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
71 int idx = 0; 68 int idx = 0;
72 int task_pri = convert_prio(p->prio); 69 int task_pri = convert_prio(p->prio);
73 70
74 for_each_cpupri_active(cp->pri_active, idx) { 71 if (task_pri >= MAX_RT_PRIO)
75 struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 72 return 0;
76 73
77 if (idx >= task_pri) 74 for (idx = 0; idx < task_pri; idx++) {
78 break; 75 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
76 int skip = 0;
77
78 if (!atomic_read(&(vec)->count))
79 skip = 1;
80 /*
81 * When looking at the vector, we need to read the counter,
82 * do a memory barrier, then read the mask.
83 *
 84	 * Note: This is still all racy, but we can deal with it.
85 * Ideally, we only want to look at masks that are set.
86 *
87 * If a mask is not set, then the only thing wrong is that we
88 * did a little more work than necessary.
89 *
90 * If we read a zero count but the mask is set, because of the
91 * memory barriers, that can only happen when the highest prio
92 * task for a run queue has left the run queue, in which case,
93 * it will be followed by a pull. If the task we are processing
94 * fails to find a proper place to go, that pull request will
95 * pull this task if the run queue is running at a lower
96 * priority.
97 */
98 smp_rmb();
99
100 /* Need to do the rmb for every iteration */
101 if (skip)
102 continue;
79 103
80 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 104 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
81 continue; 105 continue;
@@ -115,7 +139,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
115{ 139{
116 int *currpri = &cp->cpu_to_pri[cpu]; 140 int *currpri = &cp->cpu_to_pri[cpu];
117 int oldpri = *currpri; 141 int oldpri = *currpri;
118 unsigned long flags; 142 int do_mb = 0;
119 143
120 newpri = convert_prio(newpri); 144 newpri = convert_prio(newpri);
121 145
@@ -128,32 +152,46 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
128 * If the cpu was currently mapped to a different value, we 152 * If the cpu was currently mapped to a different value, we
129 * need to map it to the new value then remove the old value. 153 * need to map it to the new value then remove the old value.
130 * Note, we must add the new value first, otherwise we risk the 154 * Note, we must add the new value first, otherwise we risk the
131 * cpu being cleared from pri_active, and this cpu could be 155 * cpu being missed by the priority loop in cpupri_find.
132 * missed for a push or pull.
133 */ 156 */
134 if (likely(newpri != CPUPRI_INVALID)) { 157 if (likely(newpri != CPUPRI_INVALID)) {
135 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 158 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
136 159
137 raw_spin_lock_irqsave(&vec->lock, flags);
138
139 cpumask_set_cpu(cpu, vec->mask); 160 cpumask_set_cpu(cpu, vec->mask);
140 vec->count++; 161 /*
141 if (vec->count == 1) 162 * When adding a new vector, we update the mask first,
142 set_bit(newpri, cp->pri_active); 163 * do a write memory barrier, and then update the count, to
143 164 * make sure the vector is visible when count is set.
144 raw_spin_unlock_irqrestore(&vec->lock, flags); 165 */
166 smp_mb__before_atomic_inc();
167 atomic_inc(&(vec)->count);
168 do_mb = 1;
145 } 169 }
146 if (likely(oldpri != CPUPRI_INVALID)) { 170 if (likely(oldpri != CPUPRI_INVALID)) {
147 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; 171 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
148 172
149 raw_spin_lock_irqsave(&vec->lock, flags); 173 /*
150 174 * Because the order of modification of the vec->count
151 vec->count--; 175 * is important, we must make sure that the update
152 if (!vec->count) 176 * of the new prio is seen before we decrement the
153 clear_bit(oldpri, cp->pri_active); 177 * old prio. This makes sure that the loop sees
178 * one or the other when we raise the priority of
179 * the run queue. We don't care about when we lower the
180 * priority, as that will trigger an rt pull anyway.
181 *
182 * We only need to do a memory barrier if we updated
183 * the new priority vec.
184 */
185 if (do_mb)
186 smp_mb__after_atomic_inc();
187
188 /*
189	 * When removing from the vector, we decrement the counter first,
190 * do a memory barrier and then clear the mask.
191 */
192 atomic_dec(&(vec)->count);
193 smp_mb__after_atomic_inc();
154 cpumask_clear_cpu(cpu, vec->mask); 194 cpumask_clear_cpu(cpu, vec->mask);
155
156 raw_spin_unlock_irqrestore(&vec->lock, flags);
157 } 195 }
158 196
159 *currpri = newpri; 197 *currpri = newpri;
@@ -175,8 +213,7 @@ int cpupri_init(struct cpupri *cp)
175 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 213 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
176 struct cpupri_vec *vec = &cp->pri_to_cpu[i]; 214 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
177 215
178 raw_spin_lock_init(&vec->lock); 216 atomic_set(&vec->count, 0);
179 vec->count = 0;
180 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) 217 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
181 goto cleanup; 218 goto cleanup;
182 } 219 }
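Dropping the per-vector spinlock leaves the ordering to do the work: cpupri_set() publishes the mask bit before bumping the count (and decrements the count before clearing the bit), while cpupri_find() reads the count, issues a read barrier, and only then consults the mask. The userspace sketch below mimics that protocol with C11 atomics, using fences as stand-ins for smp_mb()/smp_rmb(); the single mask bit and the busy-wait reader are illustrative assumptions, not kernel code.

/* Minimal sketch of the count/mask publication order used above. */
#include <stdatomic.h>
#include <stdio.h>
#include <pthread.h>

static atomic_ulong mask;	/* which cpus sit at this priority */
static atomic_int count;	/* how many bits are set in mask */

static void *setter(void *arg)
{
	/* add path: set the mask bit first, then publish via count (smp_mb) */
	atomic_fetch_or_explicit(&mask, 1UL << 3, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	atomic_fetch_add_explicit(&count, 1, memory_order_relaxed);
	return NULL;
}

static void *finder(void *arg)
{
	/* find path: read count, smp_rmb(), then read the mask */
	while (!atomic_load_explicit(&count, memory_order_relaxed))
		;	/* vector looks empty: skip it, as cpupri_find does */
	atomic_thread_fence(memory_order_acquire);
	printf("mask seen by finder: %#lx\n",
	       atomic_load_explicit(&mask, memory_order_relaxed));
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&b, NULL, finder, NULL);
	pthread_create(&a, NULL, setter, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}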
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 9fc7d386fea4..f6d756173491 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -4,7 +4,6 @@
4#include <linux/sched.h> 4#include <linux/sched.h>
5 5
6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) 6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
7#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES)
8 7
9#define CPUPRI_INVALID -1 8#define CPUPRI_INVALID -1
10#define CPUPRI_IDLE 0 9#define CPUPRI_IDLE 0
@@ -12,14 +11,12 @@
12/* values 2-101 are RT priorities 0-99 */ 11/* values 2-101 are RT priorities 0-99 */
13 12
14struct cpupri_vec { 13struct cpupri_vec {
15 raw_spinlock_t lock; 14 atomic_t count;
16 int count; 15 cpumask_var_t mask;
17 cpumask_var_t mask;
18}; 16};
19 17
20struct cpupri { 18struct cpupri {
21 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; 19 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
22 long pri_active[CPUPRI_NR_PRI_WORDS];
23 int cpu_to_pri[NR_CPUS]; 20 int cpu_to_pri[NR_CPUS];
24}; 21};
25 22
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index bc8ee9993814..5c9e67923b7c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,20 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
89 */ 89 */
90unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; 90unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
91 91
92#ifdef CONFIG_CFS_BANDWIDTH
93/*
94 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
95 * each time a cfs_rq requests quota.
96 *
97 * Note: in the case that the slice exceeds the runtime remaining (either due
98 * to consumption or the quota being specified to be smaller than the slice)
99 * we will always only issue the remaining available time.
100 *
101 * default: 5 msec, units: microseconds
102 */
103unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
104#endif
105
92static const struct sched_class fair_sched_class; 106static const struct sched_class fair_sched_class;
93 107
94/************************************************************** 108/**************************************************************
@@ -292,6 +306,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
292 306
293#endif /* CONFIG_FAIR_GROUP_SCHED */ 307#endif /* CONFIG_FAIR_GROUP_SCHED */
294 308
309static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
310 unsigned long delta_exec);
295 311
296/************************************************************** 312/**************************************************************
297 * Scheduling class tree data structure manipulation methods: 313 * Scheduling class tree data structure manipulation methods:
@@ -583,6 +599,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
583 cpuacct_charge(curtask, delta_exec); 599 cpuacct_charge(curtask, delta_exec);
584 account_group_exec_runtime(curtask, delta_exec); 600 account_group_exec_runtime(curtask, delta_exec);
585 } 601 }
602
603 account_cfs_rq_runtime(cfs_rq, delta_exec);
586} 604}
587 605
588static inline void 606static inline void
@@ -688,6 +706,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
688} 706}
689 707
690#ifdef CONFIG_FAIR_GROUP_SCHED 708#ifdef CONFIG_FAIR_GROUP_SCHED
709/* we need this in update_cfs_load and load-balance functions below */
710static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
691# ifdef CONFIG_SMP 711# ifdef CONFIG_SMP
692static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, 712static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
693 int global_update) 713 int global_update)
@@ -710,7 +730,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
710 u64 now, delta; 730 u64 now, delta;
711 unsigned long load = cfs_rq->load.weight; 731 unsigned long load = cfs_rq->load.weight;
712 732
713 if (cfs_rq->tg == &root_task_group) 733 if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
714 return; 734 return;
715 735
716 now = rq_of(cfs_rq)->clock_task; 736 now = rq_of(cfs_rq)->clock_task;
@@ -819,7 +839,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
819 839
820 tg = cfs_rq->tg; 840 tg = cfs_rq->tg;
821 se = tg->se[cpu_of(rq_of(cfs_rq))]; 841 se = tg->se[cpu_of(rq_of(cfs_rq))];
822 if (!se) 842 if (!se || throttled_hierarchy(cfs_rq))
823 return; 843 return;
824#ifndef CONFIG_SMP 844#ifndef CONFIG_SMP
825 if (likely(se->load.weight == tg->shares)) 845 if (likely(se->load.weight == tg->shares))
@@ -950,6 +970,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
950 se->vruntime = vruntime; 970 se->vruntime = vruntime;
951} 971}
952 972
973static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
974
953static void 975static void
954enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 976enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
955{ 977{
@@ -979,8 +1001,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
979 __enqueue_entity(cfs_rq, se); 1001 __enqueue_entity(cfs_rq, se);
980 se->on_rq = 1; 1002 se->on_rq = 1;
981 1003
982 if (cfs_rq->nr_running == 1) 1004 if (cfs_rq->nr_running == 1) {
983 list_add_leaf_cfs_rq(cfs_rq); 1005 list_add_leaf_cfs_rq(cfs_rq);
1006 check_enqueue_throttle(cfs_rq);
1007 }
984} 1008}
985 1009
986static void __clear_buddies_last(struct sched_entity *se) 1010static void __clear_buddies_last(struct sched_entity *se)
@@ -1028,6 +1052,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1028 __clear_buddies_skip(se); 1052 __clear_buddies_skip(se);
1029} 1053}
1030 1054
1055static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1056
1031static void 1057static void
1032dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 1058dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1033{ 1059{
@@ -1066,6 +1092,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1066 if (!(flags & DEQUEUE_SLEEP)) 1092 if (!(flags & DEQUEUE_SLEEP))
1067 se->vruntime -= cfs_rq->min_vruntime; 1093 se->vruntime -= cfs_rq->min_vruntime;
1068 1094
1095 /* return excess runtime on last dequeue */
1096 return_cfs_rq_runtime(cfs_rq);
1097
1069 update_min_vruntime(cfs_rq); 1098 update_min_vruntime(cfs_rq);
1070 update_cfs_shares(cfs_rq); 1099 update_cfs_shares(cfs_rq);
1071} 1100}
@@ -1077,6 +1106,8 @@ static void
1077check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) 1106check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1078{ 1107{
1079 unsigned long ideal_runtime, delta_exec; 1108 unsigned long ideal_runtime, delta_exec;
1109 struct sched_entity *se;
1110 s64 delta;
1080 1111
1081 ideal_runtime = sched_slice(cfs_rq, curr); 1112 ideal_runtime = sched_slice(cfs_rq, curr);
1082 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; 1113 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
@@ -1095,22 +1126,17 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1095 * narrow margin doesn't have to wait for a full slice. 1126 * narrow margin doesn't have to wait for a full slice.
1096 * This also mitigates buddy induced latencies under load. 1127 * This also mitigates buddy induced latencies under load.
1097 */ 1128 */
1098 if (!sched_feat(WAKEUP_PREEMPT))
1099 return;
1100
1101 if (delta_exec < sysctl_sched_min_granularity) 1129 if (delta_exec < sysctl_sched_min_granularity)
1102 return; 1130 return;
1103 1131
1104 if (cfs_rq->nr_running > 1) { 1132 se = __pick_first_entity(cfs_rq);
1105 struct sched_entity *se = __pick_first_entity(cfs_rq); 1133 delta = curr->vruntime - se->vruntime;
1106 s64 delta = curr->vruntime - se->vruntime;
1107 1134
1108 if (delta < 0) 1135 if (delta < 0)
1109 return; 1136 return;
1110 1137
1111 if (delta > ideal_runtime) 1138 if (delta > ideal_runtime)
1112 resched_task(rq_of(cfs_rq)->curr); 1139 resched_task(rq_of(cfs_rq)->curr);
1113 }
1114} 1140}
1115 1141
1116static void 1142static void
@@ -1185,6 +1211,8 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1185 return se; 1211 return se;
1186} 1212}
1187 1213
1214static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1215
1188static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) 1216static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1189{ 1217{
1190 /* 1218 /*
@@ -1194,6 +1222,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1194 if (prev->on_rq) 1222 if (prev->on_rq)
1195 update_curr(cfs_rq); 1223 update_curr(cfs_rq);
1196 1224
1225 /* throttle cfs_rqs exceeding runtime */
1226 check_cfs_rq_runtime(cfs_rq);
1227
1197 check_spread(cfs_rq, prev); 1228 check_spread(cfs_rq, prev);
1198 if (prev->on_rq) { 1229 if (prev->on_rq) {
1199 update_stats_wait_start(cfs_rq, prev); 1230 update_stats_wait_start(cfs_rq, prev);
@@ -1233,10 +1264,583 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1233 return; 1264 return;
1234#endif 1265#endif
1235 1266
1236 if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) 1267 if (cfs_rq->nr_running > 1)
1237 check_preempt_tick(cfs_rq, curr); 1268 check_preempt_tick(cfs_rq, curr);
1238} 1269}
1239 1270
1271
1272/**************************************************
1273 * CFS bandwidth control machinery
1274 */
1275
1276#ifdef CONFIG_CFS_BANDWIDTH
1277/*
1278 * default period for cfs group bandwidth.
1279 * default: 0.1s, units: nanoseconds
1280 */
1281static inline u64 default_cfs_period(void)
1282{
1283 return 100000000ULL;
1284}
1285
1286static inline u64 sched_cfs_bandwidth_slice(void)
1287{
1288 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
1289}
1290
1291/*
1292 * Replenish runtime according to assigned quota and update expiration time.
1293 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
1294 * additional synchronization around rq->lock.
1295 *
1296 * requires cfs_b->lock
1297 */
1298static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1299{
1300 u64 now;
1301
1302 if (cfs_b->quota == RUNTIME_INF)
1303 return;
1304
1305 now = sched_clock_cpu(smp_processor_id());
1306 cfs_b->runtime = cfs_b->quota;
1307 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
1308}
1309
1310/* returns 0 on failure to allocate runtime */
1311static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1312{
1313 struct task_group *tg = cfs_rq->tg;
1314 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
1315 u64 amount = 0, min_amount, expires;
1316
1317 /* note: this is a positive sum as runtime_remaining <= 0 */
1318 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
1319
1320 raw_spin_lock(&cfs_b->lock);
1321 if (cfs_b->quota == RUNTIME_INF)
1322 amount = min_amount;
1323 else {
1324 /*
1325 * If the bandwidth pool has become inactive, then at least one
1326 * period must have elapsed since the last consumption.
1327 * Refresh the global state and ensure bandwidth timer becomes
1328		 * Refresh the global state and ensure the bandwidth timer becomes
1329 */
1330 if (!cfs_b->timer_active) {
1331 __refill_cfs_bandwidth_runtime(cfs_b);
1332 __start_cfs_bandwidth(cfs_b);
1333 }
1334
1335 if (cfs_b->runtime > 0) {
1336 amount = min(cfs_b->runtime, min_amount);
1337 cfs_b->runtime -= amount;
1338 cfs_b->idle = 0;
1339 }
1340 }
1341 expires = cfs_b->runtime_expires;
1342 raw_spin_unlock(&cfs_b->lock);
1343
1344 cfs_rq->runtime_remaining += amount;
1345 /*
1346 * we may have advanced our local expiration to account for allowed
1347 * spread between our sched_clock and the one on which runtime was
1348 * issued.
1349 */
1350 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
1351 cfs_rq->runtime_expires = expires;
1352
1353 return cfs_rq->runtime_remaining > 0;
1354}
1355
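assign_cfs_rq_runtime() tops the cpu-local pool up to one slice at a time, taking less only when the global pool holds less than that. A standalone sketch of just that arithmetic follows; the 5ms slice and the sample balances are placeholders, not live kernel state.

/* Standalone sketch of the top-up arithmetic in assign_cfs_rq_runtime():
 * bring the local pool up to one slice, but never take more than the
 * global pool still holds. */
#include <stdio.h>
#include <inttypes.h>

#define SLICE_NS	5000000LL	/* 5ms, cf. sysctl_sched_cfs_bandwidth_slice */

static int64_t global_runtime = 12000000;	/* 12ms left this period */

/* returns the new local runtime_remaining (<= 0 means "throttle me") */
static int64_t top_up(int64_t local_remaining)
{
	int64_t want = SLICE_NS - local_remaining;	/* positive: local <= 0 */
	int64_t amount = want < global_runtime ? want : global_runtime;

	if (amount < 0)
		amount = 0;
	global_runtime -= amount;
	return local_remaining + amount;
}

int main(void)
{
	int64_t local = -1500000;	/* ran 1.5ms into debt */

	local = top_up(local);
	printf("local=%" PRId64 "ns global=%" PRId64 "ns\n",
	       local, global_runtime);
	return 0;
}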
1356/*
1357 * Note: This depends on the synchronization provided by sched_clock and the
1358 * fact that rq->clock snapshots this value.
1359 */
1360static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1361{
1362 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1363 struct rq *rq = rq_of(cfs_rq);
1364
1365 /* if the deadline is ahead of our clock, nothing to do */
1366 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
1367 return;
1368
1369 if (cfs_rq->runtime_remaining < 0)
1370 return;
1371
1372 /*
1373 * If the local deadline has passed we have to consider the
1374 * possibility that our sched_clock is 'fast' and the global deadline
1375 * has not truly expired.
1376 *
1377	 * Fortunately we can determine whether this is the case by checking
1378 * whether the global deadline has advanced.
1379 */
1380
1381 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
1382 /* extend local deadline, drift is bounded above by 2 ticks */
1383 cfs_rq->runtime_expires += TICK_NSEC;
1384 } else {
1385 /* global deadline is ahead, expiration has passed */
1386 cfs_rq->runtime_remaining = 0;
1387 }
1388}
1389
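The deadline tests above rely on the signed-difference idiom, (s64)(a - b) < 0, so an "is a before b" comparison stays correct even if the clock values wrap. A small demonstration with arbitrary sample values:

/* Why the (s64)(a - b) idiom is used for the deadline checks above:
 * unsigned subtraction plus a signed cast keeps "a before b" correct
 * even across u64 wraparound. */
#include <stdio.h>
#include <stdint.h>

static int before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;	/* true if a is earlier than b */
}

int main(void)
{
	uint64_t near_wrap = UINT64_MAX - 100;	/* clock just before wrapping */
	uint64_t after_wrap = 50;		/* clock just after wrapping */

	printf("%d\n", before(100, 200));		/* 1: plainly earlier */
	printf("%d\n", before(near_wrap, after_wrap));	/* 1: still earlier */
	printf("%d\n", before(after_wrap, near_wrap));	/* 0 */
	return 0;
}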
1390static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1391 unsigned long delta_exec)
1392{
1393 /* dock delta_exec before expiring quota (as it could span periods) */
1394 cfs_rq->runtime_remaining -= delta_exec;
1395 expire_cfs_rq_runtime(cfs_rq);
1396
1397 if (likely(cfs_rq->runtime_remaining > 0))
1398 return;
1399
1400 /*
1401 * if we're unable to extend our runtime we resched so that the active
1402 * hierarchy can be throttled
1403 */
1404 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
1405 resched_task(rq_of(cfs_rq)->curr);
1406}
1407
1408static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1409 unsigned long delta_exec)
1410{
1411 if (!cfs_rq->runtime_enabled)
1412 return;
1413
1414 __account_cfs_rq_runtime(cfs_rq, delta_exec);
1415}
1416
1417static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1418{
1419 return cfs_rq->throttled;
1420}
1421
1422/* check whether cfs_rq, or any parent, is throttled */
1423static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1424{
1425 return cfs_rq->throttle_count;
1426}
1427
1428/*
1429 * Ensure that neither of the group entities corresponding to src_cpu or
1430 * dest_cpu are members of a throttled hierarchy when performing group
1431 * load-balance operations.
1432 */
1433static inline int throttled_lb_pair(struct task_group *tg,
1434 int src_cpu, int dest_cpu)
1435{
1436 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
1437
1438 src_cfs_rq = tg->cfs_rq[src_cpu];
1439 dest_cfs_rq = tg->cfs_rq[dest_cpu];
1440
1441 return throttled_hierarchy(src_cfs_rq) ||
1442 throttled_hierarchy(dest_cfs_rq);
1443}
1444
1445/* updated child weight may affect parent so we have to do this bottom up */
1446static int tg_unthrottle_up(struct task_group *tg, void *data)
1447{
1448 struct rq *rq = data;
1449 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1450
1451 cfs_rq->throttle_count--;
1452#ifdef CONFIG_SMP
1453 if (!cfs_rq->throttle_count) {
1454 u64 delta = rq->clock_task - cfs_rq->load_stamp;
1455
1456 /* leaving throttled state, advance shares averaging windows */
1457 cfs_rq->load_stamp += delta;
1458 cfs_rq->load_last += delta;
1459
1460 /* update entity weight now that we are on_rq again */
1461 update_cfs_shares(cfs_rq);
1462 }
1463#endif
1464
1465 return 0;
1466}
1467
1468static int tg_throttle_down(struct task_group *tg, void *data)
1469{
1470 struct rq *rq = data;
1471 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1472
1473 /* group is entering throttled state, record last load */
1474 if (!cfs_rq->throttle_count)
1475 update_cfs_load(cfs_rq, 0);
1476 cfs_rq->throttle_count++;
1477
1478 return 0;
1479}
1480
1481static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1482{
1483 struct rq *rq = rq_of(cfs_rq);
1484 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1485 struct sched_entity *se;
1486 long task_delta, dequeue = 1;
1487
1488 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1489
1490 /* account load preceding throttle */
1491 rcu_read_lock();
1492 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
1493 rcu_read_unlock();
1494
1495 task_delta = cfs_rq->h_nr_running;
1496 for_each_sched_entity(se) {
1497 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
1498 /* throttled entity or throttle-on-deactivate */
1499 if (!se->on_rq)
1500 break;
1501
1502 if (dequeue)
1503 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
1504 qcfs_rq->h_nr_running -= task_delta;
1505
1506 if (qcfs_rq->load.weight)
1507 dequeue = 0;
1508 }
1509
1510 if (!se)
1511 rq->nr_running -= task_delta;
1512
1513 cfs_rq->throttled = 1;
1514 cfs_rq->throttled_timestamp = rq->clock;
1515 raw_spin_lock(&cfs_b->lock);
1516 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
1517 raw_spin_unlock(&cfs_b->lock);
1518}
1519
1520static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1521{
1522 struct rq *rq = rq_of(cfs_rq);
1523 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1524 struct sched_entity *se;
1525 int enqueue = 1;
1526 long task_delta;
1527
1528 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1529
1530 cfs_rq->throttled = 0;
1531 raw_spin_lock(&cfs_b->lock);
1532 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
1533 list_del_rcu(&cfs_rq->throttled_list);
1534 raw_spin_unlock(&cfs_b->lock);
1535 cfs_rq->throttled_timestamp = 0;
1536
1537 update_rq_clock(rq);
1538 /* update hierarchical throttle state */
1539 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
1540
1541 if (!cfs_rq->load.weight)
1542 return;
1543
1544 task_delta = cfs_rq->h_nr_running;
1545 for_each_sched_entity(se) {
1546 if (se->on_rq)
1547 enqueue = 0;
1548
1549 cfs_rq = cfs_rq_of(se);
1550 if (enqueue)
1551 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
1552 cfs_rq->h_nr_running += task_delta;
1553
1554 if (cfs_rq_throttled(cfs_rq))
1555 break;
1556 }
1557
1558 if (!se)
1559 rq->nr_running += task_delta;
1560
1561 /* determine whether we need to wake up potentially idle cpu */
1562 if (rq->curr == rq->idle && rq->cfs.nr_running)
1563 resched_task(rq->curr);
1564}
1565
1566static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
1567 u64 remaining, u64 expires)
1568{
1569 struct cfs_rq *cfs_rq;
1570 u64 runtime = remaining;
1571
1572 rcu_read_lock();
1573 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
1574 throttled_list) {
1575 struct rq *rq = rq_of(cfs_rq);
1576
1577 raw_spin_lock(&rq->lock);
1578 if (!cfs_rq_throttled(cfs_rq))
1579 goto next;
1580
1581 runtime = -cfs_rq->runtime_remaining + 1;
1582 if (runtime > remaining)
1583 runtime = remaining;
1584 remaining -= runtime;
1585
1586 cfs_rq->runtime_remaining += runtime;
1587 cfs_rq->runtime_expires = expires;
1588
1589 /* we check whether we're throttled above */
1590 if (cfs_rq->runtime_remaining > 0)
1591 unthrottle_cfs_rq(cfs_rq);
1592
1593next:
1594 raw_spin_unlock(&rq->lock);
1595
1596 if (!remaining)
1597 break;
1598 }
1599 rcu_read_unlock();
1600
1601 return remaining;
1602}
1603
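distribute_cfs_runtime() hands each throttled queue just enough to bring its runtime_remaining to +1ns and stops once the budget is gone. A standalone sketch of that hand-out loop, with a plain array standing in for the throttled_cfs_rq list:

/* Standalone sketch of the hand-out loop in distribute_cfs_runtime():
 * each throttled queue gets just enough to reach +1ns of runtime, until
 * the budget is exhausted. */
#include <stdio.h>
#include <inttypes.h>

int main(void)
{
	int64_t deficit[] = { -3000000, -1000000, -4000000 };	/* runtime_remaining */
	uint64_t remaining = 5000000;				/* 5ms to distribute */
	int i;

	for (i = 0; i < 3 && remaining; i++) {
		uint64_t runtime = -deficit[i] + 1;	/* enough to unthrottle */

		if (runtime > remaining)
			runtime = remaining;
		remaining -= runtime;
		deficit[i] += (int64_t)runtime;

		printf("rq%d: remaining=%" PRId64 "ns %s\n", i, deficit[i],
		       deficit[i] > 0 ? "(unthrottled)" : "(still throttled)");
	}
	printf("left over: %" PRIu64 "ns\n", remaining);
	return 0;
}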
1604/*
1605 * Responsible for refilling a task_group's bandwidth and unthrottling its
1606 * cfs_rqs as appropriate. If there has been no activity within the last
1607 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
1608 * used to track this state.
1609 */
1610static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
1611{
1612 u64 runtime, runtime_expires;
1613 int idle = 1, throttled;
1614
1615 raw_spin_lock(&cfs_b->lock);
1616 /* no need to continue the timer with no bandwidth constraint */
1617 if (cfs_b->quota == RUNTIME_INF)
1618 goto out_unlock;
1619
1620 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1621 /* idle depends on !throttled (for the case of a large deficit) */
1622 idle = cfs_b->idle && !throttled;
1623 cfs_b->nr_periods += overrun;
1624
1625 /* if we're going inactive then everything else can be deferred */
1626 if (idle)
1627 goto out_unlock;
1628
1629 __refill_cfs_bandwidth_runtime(cfs_b);
1630
1631 if (!throttled) {
1632 /* mark as potentially idle for the upcoming period */
1633 cfs_b->idle = 1;
1634 goto out_unlock;
1635 }
1636
1637 /* account preceding periods in which throttling occurred */
1638 cfs_b->nr_throttled += overrun;
1639
1640 /*
1641	 * There are throttled entities, so we must first use the new bandwidth
1642 * to unthrottle them before making it generally available. This
1643 * ensures that all existing debts will be paid before a new cfs_rq is
1644 * allowed to run.
1645 */
1646 runtime = cfs_b->runtime;
1647 runtime_expires = cfs_b->runtime_expires;
1648 cfs_b->runtime = 0;
1649
1650 /*
1651 * This check is repeated as we are holding onto the new bandwidth
1652 * while we unthrottle. This can potentially race with an unthrottled
1653 * group trying to acquire new bandwidth from the global pool.
1654 */
1655 while (throttled && runtime > 0) {
1656 raw_spin_unlock(&cfs_b->lock);
1657 /* we can't nest cfs_b->lock while distributing bandwidth */
1658 runtime = distribute_cfs_runtime(cfs_b, runtime,
1659 runtime_expires);
1660 raw_spin_lock(&cfs_b->lock);
1661
1662 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1663 }
1664
1665 /* return (any) remaining runtime */
1666 cfs_b->runtime = runtime;
1667 /*
1668 * While we are ensured activity in the period following an
1669 * unthrottle, this also covers the case in which the new bandwidth is
1670 * insufficient to cover the existing bandwidth deficit. (Forcing the
1671 * timer to remain active while there are any throttled entities.)
1672 */
1673 cfs_b->idle = 0;
1674out_unlock:
1675 if (idle)
1676 cfs_b->timer_active = 0;
1677 raw_spin_unlock(&cfs_b->lock);
1678
1679 return idle;
1680}
1681
1682/* a cfs_rq won't donate quota below this amount */
1683static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
1684/* minimum remaining period time to redistribute slack quota */
1685static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
1686/* how long we wait to gather additional slack before distributing */
1687static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
1688
1689/* are we near the end of the current quota period? */
1690static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
1691{
1692 struct hrtimer *refresh_timer = &cfs_b->period_timer;
1693 u64 remaining;
1694
1695 /* if the call-back is running a quota refresh is already occurring */
1696 if (hrtimer_callback_running(refresh_timer))
1697 return 1;
1698
1699 /* is a quota refresh about to occur? */
1700 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
1701 if (remaining < min_expire)
1702 return 1;
1703
1704 return 0;
1705}
1706
1707static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
1708{
1709 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
1710
1711 /* if there's a quota refresh soon don't bother with slack */
1712 if (runtime_refresh_within(cfs_b, min_left))
1713 return;
1714
1715 start_bandwidth_timer(&cfs_b->slack_timer,
1716 ns_to_ktime(cfs_bandwidth_slack_period));
1717}
1718
1719/* we know any runtime found here is valid as update_curr() precedes return */
1720static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1721{
1722 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1723 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
1724
1725 if (slack_runtime <= 0)
1726 return;
1727
1728 raw_spin_lock(&cfs_b->lock);
1729 if (cfs_b->quota != RUNTIME_INF &&
1730 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
1731 cfs_b->runtime += slack_runtime;
1732
1733 /* we are under rq->lock, defer unthrottling using a timer */
1734 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
1735 !list_empty(&cfs_b->throttled_cfs_rq))
1736 start_cfs_slack_bandwidth(cfs_b);
1737 }
1738 raw_spin_unlock(&cfs_b->lock);
1739
1740 /* even if it's not valid for return we don't want to try again */
1741 cfs_rq->runtime_remaining -= slack_runtime;
1742}
1743
1744static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1745{
1746 if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
1747 return;
1748
1749 __return_cfs_rq_runtime(cfs_rq);
1750}
1751
1752/*
1753 * This is done with a timer (instead of inline with bandwidth return) since
1754 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
1755 */
1756static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
1757{
1758 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
1759 u64 expires;
1760
1761 /* confirm we're still not at a refresh boundary */
1762 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
1763 return;
1764
1765 raw_spin_lock(&cfs_b->lock);
1766 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
1767 runtime = cfs_b->runtime;
1768 cfs_b->runtime = 0;
1769 }
1770 expires = cfs_b->runtime_expires;
1771 raw_spin_unlock(&cfs_b->lock);
1772
1773 if (!runtime)
1774 return;
1775
1776 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
1777
1778 raw_spin_lock(&cfs_b->lock);
1779 if (expires == cfs_b->runtime_expires)
1780 cfs_b->runtime = runtime;
1781 raw_spin_unlock(&cfs_b->lock);
1782}
1783
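Putting the slack pieces together: on its final dequeue a cfs_rq keeps min_cfs_rq_runtime (1ms) for itself and returns the rest to the global pool, and the 5ms slack timer is armed only when the pool holds more than a slice, somebody is throttled, and the regular period refresh is not about to run anyway. The sketch below walks one such decision with made-up numbers; none of it is live kernel state.

/* Sketch of the decision made by __return_cfs_rq_runtime() and
 * start_cfs_slack_bandwidth(): keep 1ms locally, return the rest, and
 * only arm the 5ms slack timer when it is worth it. */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_MSEC		1000000LL
#define MIN_CFS_RQ_RUNTIME	(1 * NSEC_PER_MSEC)
#define MIN_BW_EXPIRATION	(2 * NSEC_PER_MSEC)
#define SLACK_PERIOD		(5 * NSEC_PER_MSEC)
#define SLICE_NS		(5 * NSEC_PER_MSEC)

int main(void)
{
	int64_t local_remaining = 3 * NSEC_PER_MSEC;	/* unused local runtime */
	int64_t global_runtime  = 4 * NSEC_PER_MSEC;	/* global pool */
	int64_t until_refresh   = 40 * NSEC_PER_MSEC;	/* time to period timer */
	int throttled_waiters   = 1;

	int64_t slack = local_remaining - MIN_CFS_RQ_RUNTIME;

	if (slack > 0) {
		global_runtime += slack;	/* runtime handed back */
		printf("returned %lldns, global now %lldns\n",
		       (long long)slack, (long long)global_runtime);
	}

	if (global_runtime > SLICE_NS && throttled_waiters &&
	    until_refresh >= SLACK_PERIOD + MIN_BW_EXPIRATION)
		printf("arm slack timer (fires in %lldms)\n",
		       (long long)(SLACK_PERIOD / NSEC_PER_MSEC));
	else
		printf("no slack timer needed\n");
	return 0;
}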
1784/*
1785 * When a group wakes up we want to make sure that its quota is not already
1786 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
1787 * runtime as update_curr() throttling cannot trigger until it's on-rq.
1788 */
1789static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1790{
1791 /* an active group must be handled by the update_curr()->put() path */
1792 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
1793 return;
1794
1795 /* ensure the group is not already throttled */
1796 if (cfs_rq_throttled(cfs_rq))
1797 return;
1798
1799 /* update runtime allocation */
1800 account_cfs_rq_runtime(cfs_rq, 0);
1801 if (cfs_rq->runtime_remaining <= 0)
1802 throttle_cfs_rq(cfs_rq);
1803}
1804
1805/* conditionally throttle active cfs_rq's from put_prev_entity() */
1806static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1807{
1808 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
1809 return;
1810
1811 /*
1812 * it's possible for a throttled entity to be forced into a running
1813	 * state (e.g. set_curr_task); in this case we're finished.
1814 */
1815 if (cfs_rq_throttled(cfs_rq))
1816 return;
1817
1818 throttle_cfs_rq(cfs_rq);
1819}
1820#else
1821static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1822 unsigned long delta_exec) {}
1823static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1824static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
1825static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1826
1827static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1828{
1829 return 0;
1830}
1831
1832static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1833{
1834 return 0;
1835}
1836
1837static inline int throttled_lb_pair(struct task_group *tg,
1838 int src_cpu, int dest_cpu)
1839{
1840 return 0;
1841}
1842#endif
1843
1240/************************************************** 1844/**************************************************
1241 * CFS operations on tasks: 1845 * CFS operations on tasks:
1242 */ 1846 */
@@ -1313,16 +1917,33 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1313 break; 1917 break;
1314 cfs_rq = cfs_rq_of(se); 1918 cfs_rq = cfs_rq_of(se);
1315 enqueue_entity(cfs_rq, se, flags); 1919 enqueue_entity(cfs_rq, se, flags);
1920
1921 /*
1922 * end evaluation on encountering a throttled cfs_rq
1923 *
1924 * note: in the case of encountering a throttled cfs_rq we will
1925 * post the final h_nr_running increment below.
1926 */
1927 if (cfs_rq_throttled(cfs_rq))
1928 break;
1929 cfs_rq->h_nr_running++;
1930
1316 flags = ENQUEUE_WAKEUP; 1931 flags = ENQUEUE_WAKEUP;
1317 } 1932 }
1318 1933
1319 for_each_sched_entity(se) { 1934 for_each_sched_entity(se) {
1320 cfs_rq = cfs_rq_of(se); 1935 cfs_rq = cfs_rq_of(se);
1936 cfs_rq->h_nr_running++;
1937
1938 if (cfs_rq_throttled(cfs_rq))
1939 break;
1321 1940
1322 update_cfs_load(cfs_rq, 0); 1941 update_cfs_load(cfs_rq, 0);
1323 update_cfs_shares(cfs_rq); 1942 update_cfs_shares(cfs_rq);
1324 } 1943 }
1325 1944
1945 if (!se)
1946 inc_nr_running(rq);
1326 hrtick_update(rq); 1947 hrtick_update(rq);
1327} 1948}
1328 1949
@@ -1343,6 +1964,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1343 cfs_rq = cfs_rq_of(se); 1964 cfs_rq = cfs_rq_of(se);
1344 dequeue_entity(cfs_rq, se, flags); 1965 dequeue_entity(cfs_rq, se, flags);
1345 1966
1967 /*
1968 * end evaluation on encountering a throttled cfs_rq
1969 *
1970 * note: in the case of encountering a throttled cfs_rq we will
1971 * post the final h_nr_running decrement below.
1972 */
1973 if (cfs_rq_throttled(cfs_rq))
1974 break;
1975 cfs_rq->h_nr_running--;
1976
1346 /* Don't dequeue parent if it has other entities besides us */ 1977 /* Don't dequeue parent if it has other entities besides us */
1347 if (cfs_rq->load.weight) { 1978 if (cfs_rq->load.weight) {
1348 /* 1979 /*
@@ -1361,11 +1992,17 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1361 1992
1362 for_each_sched_entity(se) { 1993 for_each_sched_entity(se) {
1363 cfs_rq = cfs_rq_of(se); 1994 cfs_rq = cfs_rq_of(se);
1995 cfs_rq->h_nr_running--;
1996
1997 if (cfs_rq_throttled(cfs_rq))
1998 break;
1364 1999
1365 update_cfs_load(cfs_rq, 0); 2000 update_cfs_load(cfs_rq, 0);
1366 update_cfs_shares(cfs_rq); 2001 update_cfs_shares(cfs_rq);
1367 } 2002 }
1368 2003
2004 if (!se)
2005 dec_nr_running(rq);
1369 hrtick_update(rq); 2006 hrtick_update(rq);
1370} 2007}
1371 2008
@@ -1434,7 +2071,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1434 2071
1435 return wl; 2072 return wl;
1436} 2073}
1437
1438#else 2074#else
1439 2075
1440static inline unsigned long effective_load(struct task_group *tg, int cpu, 2076static inline unsigned long effective_load(struct task_group *tg, int cpu,
@@ -1547,7 +2183,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1547 2183
1548 /* Skip over this group if it has no CPUs allowed */ 2184 /* Skip over this group if it has no CPUs allowed */
1549 if (!cpumask_intersects(sched_group_cpus(group), 2185 if (!cpumask_intersects(sched_group_cpus(group),
1550 &p->cpus_allowed)) 2186 tsk_cpus_allowed(p)))
1551 continue; 2187 continue;
1552 2188
1553 local_group = cpumask_test_cpu(this_cpu, 2189 local_group = cpumask_test_cpu(this_cpu,
@@ -1593,7 +2229,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1593 int i; 2229 int i;
1594 2230
1595 /* Traverse only the allowed CPUs */ 2231 /* Traverse only the allowed CPUs */
1596 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { 2232 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
1597 load = weighted_cpuload(i); 2233 load = weighted_cpuload(i);
1598 2234
1599 if (load < min_load || (load == min_load && i == this_cpu)) { 2235 if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -1637,7 +2273,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1637 if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) 2273 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1638 break; 2274 break;
1639 2275
1640 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { 2276 for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) {
1641 if (idle_cpu(i)) { 2277 if (idle_cpu(i)) {
1642 target = i; 2278 target = i;
1643 break; 2279 break;
@@ -1680,7 +2316,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1680 int sync = wake_flags & WF_SYNC; 2316 int sync = wake_flags & WF_SYNC;
1681 2317
1682 if (sd_flag & SD_BALANCE_WAKE) { 2318 if (sd_flag & SD_BALANCE_WAKE) {
1683 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) 2319 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
1684 want_affine = 1; 2320 want_affine = 1;
1685 new_cpu = prev_cpu; 2321 new_cpu = prev_cpu;
1686 } 2322 }
@@ -1875,6 +2511,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1875 if (unlikely(se == pse)) 2511 if (unlikely(se == pse))
1876 return; 2512 return;
1877 2513
2514 /*
2515 * This is possible from callers such as pull_task(), in which we
2516	 * unconditionally check_preempt_curr() after an enqueue (which may have
2517	 * led to a throttle). This both saves work and prevents false
2518 * next-buddy nomination below.
2519 */
2520 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
2521 return;
2522
1878 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { 2523 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
1879 set_next_buddy(pse); 2524 set_next_buddy(pse);
1880 next_buddy_marked = 1; 2525 next_buddy_marked = 1;
@@ -1883,6 +2528,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1883 /* 2528 /*
1884 * We can come here with TIF_NEED_RESCHED already set from new task 2529 * We can come here with TIF_NEED_RESCHED already set from new task
1885 * wake up path. 2530 * wake up path.
2531 *
2532 * Note: this also catches the edge-case of curr being in a throttled
2533 * group (e.g. via set_curr_task), since update_curr() (in the
2534 * enqueue of curr) will have resulted in resched being set. This
2535 * prevents us from potentially nominating it as a false LAST_BUDDY
2536 * below.
1886 */ 2537 */
1887 if (test_tsk_need_resched(curr)) 2538 if (test_tsk_need_resched(curr))
1888 return; 2539 return;
@@ -1899,10 +2550,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1899 if (unlikely(p->policy != SCHED_NORMAL)) 2550 if (unlikely(p->policy != SCHED_NORMAL))
1900 return; 2551 return;
1901 2552
1902
1903 if (!sched_feat(WAKEUP_PREEMPT))
1904 return;
1905
1906 find_matching_se(&se, &pse); 2553 find_matching_se(&se, &pse);
1907 update_curr(cfs_rq_of(se)); 2554 update_curr(cfs_rq_of(se));
1908 BUG_ON(!pse); 2555 BUG_ON(!pse);
@@ -2005,7 +2652,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
2005{ 2652{
2006 struct sched_entity *se = &p->se; 2653 struct sched_entity *se = &p->se;
2007 2654
2008 if (!se->on_rq) 2655 /* throttled hierarchies are not runnable */
2656 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
2009 return false; 2657 return false;
2010 2658
2011 /* Tell the scheduler that we'd really like pse to run next. */ 2659 /* Tell the scheduler that we'd really like pse to run next. */
@@ -2049,7 +2697,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2049 * 2) cannot be migrated to this CPU due to cpus_allowed, or 2697 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2050 * 3) are cache-hot on their current CPU. 2698 * 3) are cache-hot on their current CPU.
2051 */ 2699 */
2052 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { 2700 if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) {
2053 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 2701 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
2054 return 0; 2702 return 0;
2055 } 2703 }
@@ -2102,6 +2750,9 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2102 2750
2103 for_each_leaf_cfs_rq(busiest, cfs_rq) { 2751 for_each_leaf_cfs_rq(busiest, cfs_rq) {
2104 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { 2752 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
2753 if (throttled_lb_pair(task_group(p),
2754 busiest->cpu, this_cpu))
2755 break;
2105 2756
2106 if (!can_migrate_task(p, busiest, this_cpu, 2757 if (!can_migrate_task(p, busiest, this_cpu,
2107 sd, idle, &pinned)) 2758 sd, idle, &pinned))
@@ -2217,8 +2868,13 @@ static void update_shares(int cpu)
2217 * Iterates the task_group tree in a bottom up fashion, see 2868 * Iterates the task_group tree in a bottom up fashion, see
2218 * list_add_leaf_cfs_rq() for details. 2869 * list_add_leaf_cfs_rq() for details.
2219 */ 2870 */
2220 for_each_leaf_cfs_rq(rq, cfs_rq) 2871 for_each_leaf_cfs_rq(rq, cfs_rq) {
2872 /* throttled entities do not contribute to load */
2873 if (throttled_hierarchy(cfs_rq))
2874 continue;
2875
2221 update_shares_cpu(cfs_rq->tg, cpu); 2876 update_shares_cpu(cfs_rq->tg, cpu);
2877 }
2222 rcu_read_unlock(); 2878 rcu_read_unlock();
2223} 2879}
2224 2880
@@ -2268,9 +2924,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2268 u64 rem_load, moved_load; 2924 u64 rem_load, moved_load;
2269 2925
2270 /* 2926 /*
2271 * empty group 2927 * empty group or part of a throttled hierarchy
2272 */ 2928 */
2273 if (!busiest_cfs_rq->task_weight) 2929 if (!busiest_cfs_rq->task_weight ||
2930 throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
2274 continue; 2931 continue;
2275 2932
2276 rem_load = (u64)rem_load_move * busiest_weight; 2933 rem_load = (u64)rem_load_move * busiest_weight;
@@ -3430,7 +4087,7 @@ redo:
3430 * moved to this_cpu 4087 * moved to this_cpu
3431 */ 4088 */
3432 if (!cpumask_test_cpu(this_cpu, 4089 if (!cpumask_test_cpu(this_cpu,
3433 &busiest->curr->cpus_allowed)) { 4090 tsk_cpus_allowed(busiest->curr))) {
3434 raw_spin_unlock_irqrestore(&busiest->lock, 4091 raw_spin_unlock_irqrestore(&busiest->lock,
3435 flags); 4092 flags);
3436 all_pinned = 1; 4093 all_pinned = 1;
@@ -3612,22 +4269,6 @@ out_unlock:
3612} 4269}
3613 4270
3614#ifdef CONFIG_NO_HZ 4271#ifdef CONFIG_NO_HZ
3615
3616static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
3617
3618static void trigger_sched_softirq(void *data)
3619{
3620 raise_softirq_irqoff(SCHED_SOFTIRQ);
3621}
3622
3623static inline void init_sched_softirq_csd(struct call_single_data *csd)
3624{
3625 csd->func = trigger_sched_softirq;
3626 csd->info = NULL;
3627 csd->flags = 0;
3628 csd->priv = 0;
3629}
3630
3631/* 4272/*
3632 * idle load balancing details 4273 * idle load balancing details
3633 * - One of the idle CPUs nominates itself as idle load_balancer, while 4274 * - One of the idle CPUs nominates itself as idle load_balancer, while
@@ -3667,7 +4308,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3667 struct sched_domain *sd; 4308 struct sched_domain *sd;
3668 4309
3669 for_each_domain(cpu, sd) 4310 for_each_domain(cpu, sd)
3670 if (sd && (sd->flags & flag)) 4311 if (sd->flags & flag)
3671 break; 4312 break;
3672 4313
3673 return sd; 4314 return sd;
@@ -3793,11 +4434,16 @@ static void nohz_balancer_kick(int cpu)
3793 } 4434 }
3794 4435
3795 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { 4436 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
3796 struct call_single_data *cp;
3797
3798 cpu_rq(ilb_cpu)->nohz_balance_kick = 1; 4437 cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
3799 cp = &per_cpu(remote_sched_softirq_cb, cpu); 4438
3800 __smp_call_function_single(ilb_cpu, cp, 0); 4439 smp_mb();
4440 /*
4441 * Use smp_send_reschedule() instead of resched_cpu().
4442 * This way we generate a sched IPI on the target cpu which
4443 * is idle. And the softirq performing nohz idle load balance
4444 * will be run before returning from the IPI.
4445 */
4446 smp_send_reschedule(ilb_cpu);
3801 } 4447 }
3802 return; 4448 return;
3803} 4449}
@@ -4030,7 +4676,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
4030 if (time_before(now, nohz.next_balance)) 4676 if (time_before(now, nohz.next_balance))
4031 return 0; 4677 return 0;
4032 4678
4033 if (rq->idle_at_tick) 4679 if (idle_cpu(cpu))
4034 return 0; 4680 return 0;
4035 4681
4036 first_pick_cpu = atomic_read(&nohz.first_pick_cpu); 4682 first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
@@ -4066,7 +4712,7 @@ static void run_rebalance_domains(struct softirq_action *h)
4066{ 4712{
4067 int this_cpu = smp_processor_id(); 4713 int this_cpu = smp_processor_id();
4068 struct rq *this_rq = cpu_rq(this_cpu); 4714 struct rq *this_rq = cpu_rq(this_cpu);
4069 enum cpu_idle_type idle = this_rq->idle_at_tick ? 4715 enum cpu_idle_type idle = this_rq->idle_balance ?
4070 CPU_IDLE : CPU_NOT_IDLE; 4716 CPU_IDLE : CPU_NOT_IDLE;
4071 4717
4072 rebalance_domains(this_cpu, idle); 4718 rebalance_domains(this_cpu, idle);
@@ -4251,8 +4897,13 @@ static void set_curr_task_fair(struct rq *rq)
4251{ 4897{
4252 struct sched_entity *se = &rq->curr->se; 4898 struct sched_entity *se = &rq->curr->se;
4253 4899
4254 for_each_sched_entity(se) 4900 for_each_sched_entity(se) {
4255 set_next_entity(cfs_rq_of(se), se); 4901 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4902
4903 set_next_entity(cfs_rq, se);
4904 /* ensure bandwidth has been allocated on our new cfs_rq */
4905 account_cfs_rq_runtime(cfs_rq, 0);
4906 }
4256} 4907}
4257 4908
4258#ifdef CONFIG_FAIR_GROUP_SCHED 4909#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 2e74677cb040..efa0a7b75dde 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -12,11 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
12SCHED_FEAT(START_DEBIT, 1) 12SCHED_FEAT(START_DEBIT, 1)
13 13
14/* 14/*
15 * Should wakeups try to preempt running tasks.
16 */
17SCHED_FEAT(WAKEUP_PREEMPT, 1)
18
19/*
20 * Based on load and program behaviour, see if it makes sense to place 15 * Based on load and program behaviour, see if it makes sense to place
21 * a newly woken task on the same cpu as the task that woke it -- 16 * a newly woken task on the same cpu as the task that woke it --
22 * improve cache locality. Typically used with SYNC wakeups as 17 * improve cache locality. Typically used with SYNC wakeups as
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index af1177858be3..056cbd2e2a27 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -124,21 +124,33 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
124 update_rt_migration(rt_rq); 124 update_rt_migration(rt_rq);
125} 125}
126 126
127static inline int has_pushable_tasks(struct rq *rq)
128{
129 return !plist_head_empty(&rq->rt.pushable_tasks);
130}
131
127static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 132static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
128{ 133{
129 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 134 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
130 plist_node_init(&p->pushable_tasks, p->prio); 135 plist_node_init(&p->pushable_tasks, p->prio);
131 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); 136 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
137
138 /* Update the highest prio pushable task */
139 if (p->prio < rq->rt.highest_prio.next)
140 rq->rt.highest_prio.next = p->prio;
132} 141}
133 142
134static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) 143static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
135{ 144{
136 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 145 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
137}
138 146
139static inline int has_pushable_tasks(struct rq *rq) 147 /* Update the new highest prio pushable task */
140{ 148 if (has_pushable_tasks(rq)) {
141 return !plist_head_empty(&rq->rt.pushable_tasks); 149 p = plist_first_entry(&rq->rt.pushable_tasks,
150 struct task_struct, pushable_tasks);
151 rq->rt.highest_prio.next = p->prio;
152 } else
153 rq->rt.highest_prio.next = MAX_RT_PRIO;
142} 154}
143 155
144#else 156#else
@@ -643,6 +655,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
643 655
644 if (rt_rq->rt_time > runtime) { 656 if (rt_rq->rt_time > runtime) {
645 rt_rq->rt_throttled = 1; 657 rt_rq->rt_throttled = 1;
658 printk_once(KERN_WARNING "sched: RT throttling activated\n");
646 if (rt_rq_throttled(rt_rq)) { 659 if (rt_rq_throttled(rt_rq)) {
647 sched_rt_rq_dequeue(rt_rq); 660 sched_rt_rq_dequeue(rt_rq);
648 return 1; 661 return 1;
@@ -698,47 +711,13 @@ static void update_curr_rt(struct rq *rq)
698 711
699#if defined CONFIG_SMP 712#if defined CONFIG_SMP
700 713
701static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
702
703static inline int next_prio(struct rq *rq)
704{
705 struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
706
707 if (next && rt_prio(next->prio))
708 return next->prio;
709 else
710 return MAX_RT_PRIO;
711}
712
713static void 714static void
714inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) 715inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
715{ 716{
716 struct rq *rq = rq_of_rt_rq(rt_rq); 717 struct rq *rq = rq_of_rt_rq(rt_rq);
717 718
718 if (prio < prev_prio) { 719 if (rq->online && prio < prev_prio)
719 720 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
720 /*
721 * If the new task is higher in priority than anything on the
722 * run-queue, we know that the previous high becomes our
723 * next-highest.
724 */
725 rt_rq->highest_prio.next = prev_prio;
726
727 if (rq->online)
728 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
729
730 } else if (prio == rt_rq->highest_prio.curr)
731 /*
732 * If the next task is equal in priority to the highest on
733 * the run-queue, then we implicitly know that the next highest
734 * task cannot be any lower than current
735 */
736 rt_rq->highest_prio.next = prio;
737 else if (prio < rt_rq->highest_prio.next)
738 /*
739 * Otherwise, we need to recompute next-highest
740 */
741 rt_rq->highest_prio.next = next_prio(rq);
742} 721}
743 722
744static void 723static void
@@ -746,9 +725,6 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
746{ 725{
747 struct rq *rq = rq_of_rt_rq(rt_rq); 726 struct rq *rq = rq_of_rt_rq(rt_rq);
748 727
749 if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
750 rt_rq->highest_prio.next = next_prio(rq);
751
752 if (rq->online && rt_rq->highest_prio.curr != prev_prio) 728 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
753 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); 729 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
754} 730}
@@ -961,6 +937,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
961 937
962 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 938 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
963 enqueue_pushable_task(rq, p); 939 enqueue_pushable_task(rq, p);
940
941 inc_nr_running(rq);
964} 942}
965 943
966static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) 944static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -971,6 +949,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
971 dequeue_rt_entity(rt_se); 949 dequeue_rt_entity(rt_se);
972 950
973 dequeue_pushable_task(rq, p); 951 dequeue_pushable_task(rq, p);
952
953 dec_nr_running(rq);
974} 954}
975 955
976/* 956/*
@@ -1017,10 +997,12 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1017 struct rq *rq; 997 struct rq *rq;
1018 int cpu; 998 int cpu;
1019 999
1020 if (sd_flag != SD_BALANCE_WAKE)
1021 return smp_processor_id();
1022
1023 cpu = task_cpu(p); 1000 cpu = task_cpu(p);
1001
1002 /* For anything but wake ups, just return the task_cpu */
1003 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1004 goto out;
1005
1024 rq = cpu_rq(cpu); 1006 rq = cpu_rq(cpu);
1025 1007
1026 rcu_read_lock(); 1008 rcu_read_lock();
@@ -1059,6 +1041,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1059 } 1041 }
1060 rcu_read_unlock(); 1042 rcu_read_unlock();
1061 1043
1044out:
1062 return cpu; 1045 return cpu;
1063} 1046}
1064 1047
@@ -1178,7 +1161,6 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
1178static void put_prev_task_rt(struct rq *rq, struct task_struct *p) 1161static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1179{ 1162{
1180 update_curr_rt(rq); 1163 update_curr_rt(rq);
1181 p->se.exec_start = 0;
1182 1164
1183 /* 1165 /*
1184 * The previous task needs to be made eligible for pushing 1166 * The previous task needs to be made eligible for pushing
@@ -1198,7 +1180,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
1198static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1180static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1199{ 1181{
1200 if (!task_running(rq, p) && 1182 if (!task_running(rq, p) &&
1201 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && 1183 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
1202 (p->rt.nr_cpus_allowed > 1)) 1184 (p->rt.nr_cpus_allowed > 1))
1203 return 1; 1185 return 1;
1204 return 0; 1186 return 0;
@@ -1343,7 +1325,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1343 */ 1325 */
1344 if (unlikely(task_rq(task) != rq || 1326 if (unlikely(task_rq(task) != rq ||
1345 !cpumask_test_cpu(lowest_rq->cpu, 1327 !cpumask_test_cpu(lowest_rq->cpu,
1346 &task->cpus_allowed) || 1328 tsk_cpus_allowed(task)) ||
1347 task_running(rq, task) || 1329 task_running(rq, task) ||
1348 !task->on_rq)) { 1330 !task->on_rq)) {
1349 1331
@@ -1394,6 +1376,7 @@ static int push_rt_task(struct rq *rq)
1394{ 1376{
1395 struct task_struct *next_task; 1377 struct task_struct *next_task;
1396 struct rq *lowest_rq; 1378 struct rq *lowest_rq;
1379 int ret = 0;
1397 1380
1398 if (!rq->rt.overloaded) 1381 if (!rq->rt.overloaded)
1399 return 0; 1382 return 0;
@@ -1426,7 +1409,7 @@ retry:
1426 if (!lowest_rq) { 1409 if (!lowest_rq) {
1427 struct task_struct *task; 1410 struct task_struct *task;
1428 /* 1411 /*
1429 * find lock_lowest_rq releases rq->lock 1412 * find_lock_lowest_rq releases rq->lock
1430 * so it is possible that next_task has migrated. 1413 * so it is possible that next_task has migrated.
1431 * 1414 *
1432 * We need to make sure that the task is still on the same 1415 * We need to make sure that the task is still on the same
@@ -1436,12 +1419,11 @@ retry:
1436 task = pick_next_pushable_task(rq); 1419 task = pick_next_pushable_task(rq);
1437 if (task_cpu(next_task) == rq->cpu && task == next_task) { 1420 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1438 /* 1421 /*
1439 * If we get here, the task hasn't moved at all, but 1422 * The task hasn't migrated, and is still the next
1440 * it has failed to push. We will not try again, 1423 * eligible task, but we failed to find a run-queue
1441 * since the other cpus will pull from us when they 1424 * to push it to. Do not retry in this case, since
1442 * are ready. 1425 * other cpus will pull from us when ready.
1443 */ 1426 */
1444 dequeue_pushable_task(rq, next_task);
1445 goto out; 1427 goto out;
1446 } 1428 }
1447 1429
@@ -1460,6 +1442,7 @@ retry:
1460 deactivate_task(rq, next_task, 0); 1442 deactivate_task(rq, next_task, 0);
1461 set_task_cpu(next_task, lowest_rq->cpu); 1443 set_task_cpu(next_task, lowest_rq->cpu);
1462 activate_task(lowest_rq, next_task, 0); 1444 activate_task(lowest_rq, next_task, 0);
1445 ret = 1;
1463 1446
1464 resched_task(lowest_rq->curr); 1447 resched_task(lowest_rq->curr);
1465 1448
@@ -1468,7 +1451,7 @@ retry:
1468out: 1451out:
1469 put_task_struct(next_task); 1452 put_task_struct(next_task);
1470 1453
1471 return 1; 1454 return ret;
1472} 1455}
1473 1456
1474static void push_rt_tasks(struct rq *rq) 1457static void push_rt_tasks(struct rq *rq)
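[ With push_rt_task() now returning whether a task was actually migrated,
  its caller can keep pushing until nothing movable is left. push_rt_tasks()
  itself is untouched by this hunk and is essentially: ]

	static void push_rt_tasks(struct rq *rq)
	{
		/* push_rt_task() returns 1 only when it moved a task away */
		while (push_rt_task(rq))
			;
	}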
@@ -1626,9 +1609,6 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1626 1609
1627 update_rt_migration(&rq->rt); 1610 update_rt_migration(&rq->rt);
1628 } 1611 }
1629
1630 cpumask_copy(&p->cpus_allowed, new_mask);
1631 p->rt.nr_cpus_allowed = weight;
1632} 1612}
1633 1613
1634/* Assumes rq->lock is held */ 1614/* Assumes rq->lock is held */
@@ -1863,4 +1843,3 @@ static void print_rt_stats(struct seq_file *m, int cpu)
1863 rcu_read_unlock(); 1843 rcu_read_unlock();
1864} 1844}
1865#endif /* CONFIG_SCHED_DEBUG */ 1845#endif /* CONFIG_SCHED_DEBUG */
1866
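[ The &p->cpus_allowed accesses replaced above now go through a wrapper
  added elsewhere in this series; its include/linux/sched.h definition is
  not part of this excerpt, but is assumed to amount to: ]

	#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)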
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 6f437632afab..8b44e7fa7fb3 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -34,11 +34,13 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
34static void 34static void
35enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) 35enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
36{ 36{
37 inc_nr_running(rq);
37} 38}
38 39
39static void 40static void
40dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) 41dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
41{ 42{
43 dec_nr_running(rq);
42} 44}
43 45
44static void yield_task_stop(struct rq *rq) 46static void yield_task_stop(struct rq *rq)
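[ The empty stop-class hooks gain the same inc_nr_running()/dec_nr_running()
  calls as the RT class above: runnable-task accounting now lives in each
  scheduling class's enqueue/dequeue methods rather than in the core
  enqueue/dequeue path. Illustrative pattern only, the "foo" class is made
  up: ]

	static void enqueue_task_foo(struct rq *rq, struct task_struct *p, int flags)
	{
		/* ...class-specific queueing of p... */
		inc_nr_running(rq);	/* each class accounts its own tasks */
	}

	static void dequeue_task_foo(struct rq *rq, struct task_struct *p, int flags)
	{
		/* ...class-specific removal of p... */
		dec_nr_running(rq);
	}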
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 11d65b531e50..2d2ecdcc8cdb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -379,6 +379,16 @@ static struct ctl_table kern_table[] = {
379 .extra2 = &one, 379 .extra2 = &one,
380 }, 380 },
381#endif 381#endif
382#ifdef CONFIG_CFS_BANDWIDTH
383 {
384 .procname = "sched_cfs_bandwidth_slice_us",
385 .data = &sysctl_sched_cfs_bandwidth_slice,
386 .maxlen = sizeof(unsigned int),
387 .mode = 0644,
388 .proc_handler = proc_dointvec_minmax,
389 .extra1 = &one,
390 },
391#endif
382#ifdef CONFIG_PROVE_LOCKING 392#ifdef CONFIG_PROVE_LOCKING
383 { 393 {
384 .procname = "prove_locking", 394 .procname = "prove_locking",
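[ The new entry exposes the CFS bandwidth slice as a runtime tunable.
  Assuming the usual mapping of kern_table procnames under /proc/sys/kernel/,
  a minimal userspace read of the knob looks like this (a sketch, not part
  of the patch): ]

	#include <stdio.h>

	int main(void)
	{
		unsigned int slice_us;
		FILE *f = fopen("/proc/sys/kernel/sched_cfs_bandwidth_slice_us", "r");

		if (!f || fscanf(f, "%u", &slice_us) != 1) {
			perror("sched_cfs_bandwidth_slice_us");
			return 1;
		}
		fclose(f);
		printf("CFS bandwidth slice: %u us\n", slice_us);
		return 0;
	}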
diff --git a/lib/Kconfig b/lib/Kconfig
index 6c695ff9caba..32f3e5ae2be5 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -276,7 +276,4 @@ config CORDIC
276 so its calculations are in fixed point. Modules can select this 276 so its calculations are in fixed point. Modules can select this
277 when they require this function. Module will be called cordic. 277 when they require this function. Module will be called cordic.
278 278
279config LLIST
280 bool
281
282endmenu 279endmenu
diff --git a/lib/Makefile b/lib/Makefile
index 3f5bc6d903e0..a4da283f5dc0 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -22,7 +22,7 @@ lib-y += kobject.o kref.o klist.o
22obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ 22obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
23 bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ 23 bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
24 string_helpers.o gcd.o lcm.o list_sort.o uuid.o flex_array.o \ 24 string_helpers.o gcd.o lcm.o list_sort.o uuid.o flex_array.o \
25 bsearch.o find_last_bit.o find_next_bit.o 25 bsearch.o find_last_bit.o find_next_bit.o llist.o
26obj-y += kstrtox.o 26obj-y += kstrtox.o
27obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o 27obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o
28 28
@@ -115,8 +115,6 @@ obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o
115 115
116obj-$(CONFIG_CORDIC) += cordic.o 116obj-$(CONFIG_CORDIC) += cordic.o
117 117
118obj-$(CONFIG_LLIST) += llist.o
119
120hostprogs-y := gen_crc32table 118hostprogs-y := gen_crc32table
121clean-files := crc32table.h 119clean-files := crc32table.h
122 120
diff --git a/lib/llist.c b/lib/llist.c
index da445724fa1f..700cff77a387 100644
--- a/lib/llist.c
+++ b/lib/llist.c
@@ -3,8 +3,8 @@
3 * 3 *
4 * The basic atomic operation of this list is cmpxchg on long. On 4 * The basic atomic operation of this list is cmpxchg on long. On
5 * architectures that don't have NMI-safe cmpxchg implementation, the 5 * architectures that don't have NMI-safe cmpxchg implementation, the
6 * list can NOT be used in NMI handler. So code uses the list in NMI 6 * list can NOT be used in NMI handlers. So code that uses the list in
7 * handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. 7 * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
8 * 8 *
9 * Copyright 2010,2011 Intel Corp. 9 * Copyright 2010,2011 Intel Corp.
10 * Author: Huang Ying <ying.huang@intel.com> 10 * Author: Huang Ying <ying.huang@intel.com>
@@ -30,48 +30,28 @@
30#include <asm/system.h> 30#include <asm/system.h>
31 31
32/** 32/**
33 * llist_add - add a new entry
34 * @new: new entry to be added
35 * @head: the head for your lock-less list
36 */
37void llist_add(struct llist_node *new, struct llist_head *head)
38{
39 struct llist_node *entry, *old_entry;
40
41#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
42 BUG_ON(in_nmi());
43#endif
44
45 entry = head->first;
46 do {
47 old_entry = entry;
48 new->next = entry;
49 cpu_relax();
50 } while ((entry = cmpxchg(&head->first, old_entry, new)) != old_entry);
51}
52EXPORT_SYMBOL_GPL(llist_add);
53
54/**
55 * llist_add_batch - add several linked entries in batch 33 * llist_add_batch - add several linked entries in batch
56 * @new_first: first entry in batch to be added 34 * @new_first: first entry in batch to be added
57 * @new_last: last entry in batch to be added 35 * @new_last: last entry in batch to be added
58 * @head: the head for your lock-less list 36 * @head: the head for your lock-less list
37 *
38 * Return whether list is empty before adding.
59 */ 39 */
60void llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, 40bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
61 struct llist_head *head) 41 struct llist_head *head)
62{ 42{
63 struct llist_node *entry, *old_entry; 43 struct llist_node *entry, *old_entry;
64 44
65#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
66 BUG_ON(in_nmi());
67#endif
68
69 entry = head->first; 45 entry = head->first;
70 do { 46 for (;;) {
71 old_entry = entry; 47 old_entry = entry;
72 new_last->next = entry; 48 new_last->next = entry;
73 cpu_relax(); 49 entry = cmpxchg(&head->first, old_entry, new_first);
74 } while ((entry = cmpxchg(&head->first, old_entry, new_first)) != old_entry); 50 if (entry == old_entry)
51 break;
52 }
53
54 return old_entry == NULL;
75} 55}
76EXPORT_SYMBOL_GPL(llist_add_batch); 56EXPORT_SYMBOL_GPL(llist_add_batch);
77 57
@@ -93,37 +73,17 @@ struct llist_node *llist_del_first(struct llist_head *head)
93{ 73{
94 struct llist_node *entry, *old_entry, *next; 74 struct llist_node *entry, *old_entry, *next;
95 75
96#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
97 BUG_ON(in_nmi());
98#endif
99
100 entry = head->first; 76 entry = head->first;
101 do { 77 for (;;) {
102 if (entry == NULL) 78 if (entry == NULL)
103 return NULL; 79 return NULL;
104 old_entry = entry; 80 old_entry = entry;
105 next = entry->next; 81 next = entry->next;
106 cpu_relax(); 82 entry = cmpxchg(&head->first, old_entry, next);
107 } while ((entry = cmpxchg(&head->first, old_entry, next)) != old_entry); 83 if (entry == old_entry)
84 break;
85 }
108 86
109 return entry; 87 return entry;
110} 88}
111EXPORT_SYMBOL_GPL(llist_del_first); 89EXPORT_SYMBOL_GPL(llist_del_first);
112
113/**
114 * llist_del_all - delete all entries from lock-less list
115 * @head: the head of lock-less list to delete all entries
116 *
117 * If list is empty, return NULL, otherwise, delete all entries and
118 * return the pointer to the first entry. The order of entries
119 * deleted is from the newest to the oldest added one.
120 */
121struct llist_node *llist_del_all(struct llist_head *head)
122{
123#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
124 BUG_ON(in_nmi());
125#endif
126
127 return xchg(&head->first, NULL);
128}
129EXPORT_SYMBOL_GPL(llist_del_all);
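[ llist_add_batch() now reports whether the list was empty before the
  insertion, and the cmpxchg retry loops drop the cpu_relax() calls. The
  boolean return lets a producer notify its consumer only on the
  empty->non-empty transition; a hedged usage sketch (the "pending" list and
  kick_consumer() are hypothetical): ]

	#include <linux/llist.h>

	static LLIST_HEAD(pending);		/* hypothetical work list */

	static void kick_consumer(void);	/* hypothetical wakeup/IPI, defined elsewhere */

	static void queue_item(struct llist_node *node)
	{
		/* only the add that makes the list non-empty needs to notify */
		if (llist_add_batch(node, node, &pending))
			kick_consumer();
	}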
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index 4689cb073da4..503f087382a4 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -22,7 +22,7 @@ notrace unsigned int debug_smp_processor_id(void)
22 * Kernel threads bound to a single CPU can safely use 22 * Kernel threads bound to a single CPU can safely use
23 * smp_processor_id(): 23 * smp_processor_id():
24 */ 24 */
25 if (cpumask_equal(&current->cpus_allowed, cpumask_of(this_cpu))) 25 if (cpumask_equal(tsk_cpus_allowed(current), cpumask_of(this_cpu)))
26 goto out; 26 goto out;
27 27
28 /* 28 /*