diff options
-rw-r--r-- | Documentation/scheduler/sched-bwc.txt | 122 | ||||
-rw-r--r-- | drivers/acpi/apei/Kconfig | 1 | ||||
-rw-r--r-- | include/linux/irq_work.h | 15 | ||||
-rw-r--r-- | include/linux/llist.h | 77 | ||||
-rw-r--r-- | include/linux/sched.h | 7 | ||||
-rw-r--r-- | include/trace/events/sched.h | 9 | ||||
-rw-r--r-- | init/Kconfig | 12 | ||||
-rw-r--r-- | kernel/irq_work.c | 91 | ||||
-rw-r--r-- | kernel/sched.c | 666 | ||||
-rw-r--r-- | kernel/sched_cpupri.c | 89 | ||||
-rw-r--r-- | kernel/sched_cpupri.h | 7 | ||||
-rw-r--r-- | kernel/sched_fair.c | 761 | ||||
-rw-r--r-- | kernel/sched_features.h | 5 | ||||
-rw-r--r-- | kernel/sched_rt.c | 99 | ||||
-rw-r--r-- | kernel/sched_stoptask.c | 2 | ||||
-rw-r--r-- | kernel/sysctl.c | 10 | ||||
-rw-r--r-- | lib/Kconfig | 3 | ||||
-rw-r--r-- | lib/Makefile | 4 | ||||
-rw-r--r-- | lib/llist.c | 74 | ||||
-rw-r--r-- | lib/smp_processor_id.c | 2 |
20 files changed, 1646 insertions, 410 deletions
diff --git a/Documentation/scheduler/sched-bwc.txt b/Documentation/scheduler/sched-bwc.txt new file mode 100644 index 000000000000..f6b1873f68ab --- /dev/null +++ b/Documentation/scheduler/sched-bwc.txt | |||
@@ -0,0 +1,122 @@ | |||
1 | CFS Bandwidth Control | ||
2 | ===================== | ||
3 | |||
4 | [ This document only discusses CPU bandwidth control for SCHED_NORMAL. | ||
5 | The SCHED_RT case is covered in Documentation/scheduler/sched-rt-group.txt ] | ||
6 | |||
7 | CFS bandwidth control is a CONFIG_FAIR_GROUP_SCHED extension which allows the | ||
8 | specification of the maximum CPU bandwidth available to a group or hierarchy. | ||
9 | |||
10 | The bandwidth allowed for a group is specified using a quota and period. Within | ||
11 | each given "period" (microseconds), a group is allowed to consume only up to | ||
12 | "quota" microseconds of CPU time. When the CPU bandwidth consumption of a | ||
13 | group exceeds this limit (for that period), the tasks belonging to its | ||
14 | hierarchy will be throttled and are not allowed to run again until the next | ||
15 | period. | ||
16 | |||
17 | A group's unused runtime is globally tracked, being refreshed with quota units | ||
18 | above at each period boundary. As threads consume this bandwidth it is | ||
19 | transferred to cpu-local "silos" on a demand basis. The amount transferred | ||
20 | within each of these updates is tunable and described as the "slice". | ||
21 | |||
22 | Management | ||
23 | ---------- | ||
24 | Quota and period are managed within the cpu subsystem via cgroupfs. | ||
25 | |||
26 | cpu.cfs_quota_us: the total available run-time within a period (in microseconds) | ||
27 | cpu.cfs_period_us: the length of a period (in microseconds) | ||
28 | cpu.stat: exports throttling statistics [explained further below] | ||
29 | |||
30 | The default values are: | ||
31 | cpu.cfs_period_us=100ms | ||
32 | cpu.cfs_quota_us=-1 | ||
33 | |||
34 | A value of -1 for cpu.cfs_quota_us indicates that the group does not have any | ||
35 | bandwidth restriction in place, such a group is described as an unconstrained | ||
36 | bandwidth group. This represents the traditional work-conserving behavior for | ||
37 | CFS. | ||
38 | |||
39 | Writing any (valid) positive value(s) will enact the specified bandwidth limit. | ||
40 | The minimum value allowed for the quota or period is 1ms. There is also an | ||
41 | upper bound on the period length of 1s. Additional restrictions exist when | ||
42 | bandwidth limits are used in a hierarchical fashion, these are explained in | ||
43 | more detail below. | ||
44 | |||
45 | Writing any negative value to cpu.cfs_quota_us will remove the bandwidth limit | ||
46 | and return the group to an unconstrained state once more. | ||
47 | |||
48 | Any updates to a group's bandwidth specification will result in it becoming | ||
49 | unthrottled if it is in a constrained state. | ||
50 | |||
51 | System wide settings | ||
52 | -------------------- | ||
53 | For efficiency run-time is transferred between the global pool and CPU local | ||
54 | "silos" in a batch fashion. This greatly reduces global accounting pressure | ||
55 | on large systems. The amount transferred each time such an update is required | ||
56 | is described as the "slice". | ||
57 | |||
58 | This is tunable via procfs: | ||
59 | /proc/sys/kernel/sched_cfs_bandwidth_slice_us (default=5ms) | ||
60 | |||
61 | Larger slice values will reduce transfer overheads, while smaller values allow | ||
62 | for more fine-grained consumption. | ||
63 | |||
64 | Statistics | ||
65 | ---------- | ||
66 | A group's bandwidth statistics are exported via 3 fields in cpu.stat. | ||
67 | |||
68 | cpu.stat: | ||
69 | - nr_periods: Number of enforcement intervals that have elapsed. | ||
70 | - nr_throttled: Number of times the group has been throttled/limited. | ||
71 | - throttled_time: The total time duration (in nanoseconds) for which entities | ||
72 | of the group have been throttled. | ||
73 | |||
74 | This interface is read-only. | ||
75 | |||
76 | Hierarchical considerations | ||
77 | --------------------------- | ||
78 | The interface enforces that an individual entity's bandwidth is always | ||
79 | attainable, that is: max(c_i) <= C. However, over-subscription in the | ||
80 | aggregate case is explicitly allowed to enable work-conserving semantics | ||
81 | within a hierarchy. | ||
82 | e.g. \Sum (c_i) may exceed C | ||
83 | [ Where C is the parent's bandwidth, and c_i its children ] | ||
84 | |||
85 | |||
86 | There are two ways in which a group may become throttled: | ||
87 | a. it fully consumes its own quota within a period | ||
88 | b. a parent's quota is fully consumed within its period | ||
89 | |||
90 | In case b) above, even though the child may have runtime remaining it will not | ||
91 | be allowed to run until the parent's runtime is refreshed. | ||
92 | |||
93 | Examples | ||
94 | -------- | ||
95 | 1. Limit a group to 1 CPU worth of runtime. | ||
96 | |||
97 | If period is 250ms and quota is also 250ms, the group will get | ||
98 | 1 CPU worth of runtime every 250ms. | ||
99 | |||
100 | # echo 250000 > cpu.cfs_quota_us /* quota = 250ms */ | ||
101 | # echo 250000 > cpu.cfs_period_us /* period = 250ms */ | ||
102 | |||
103 | 2. Limit a group to 2 CPUs worth of runtime on a multi-CPU machine. | ||
104 | |||
105 | With 500ms period and 1000ms quota, the group can get 2 CPUs worth of | ||
106 | runtime every 500ms. | ||
107 | |||
108 | # echo 1000000 > cpu.cfs_quota_us /* quota = 1000ms */ | ||
109 | # echo 500000 > cpu.cfs_period_us /* period = 500ms */ | ||
110 | |||
111 | The larger period here allows for increased burst capacity. | ||
112 | |||
113 | 3. Limit a group to 20% of 1 CPU. | ||
114 | |||
115 | With 50ms period, 10ms quota will be equivalent to 20% of 1 CPU. | ||
116 | |||
117 | # echo 10000 > cpu.cfs_quota_us /* quota = 10ms */ | ||
118 | # echo 50000 > cpu.cfs_period_us /* period = 50ms */ | ||
119 | |||
120 | By using a small period here we are ensuring a consistent latency | ||
121 | response at the expense of burst capacity. | ||
122 | |||
diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig index e3f47872ec22..f0c1ce95a0ec 100644 --- a/drivers/acpi/apei/Kconfig +++ b/drivers/acpi/apei/Kconfig | |||
@@ -14,7 +14,6 @@ config ACPI_APEI_GHES | |||
14 | depends on ACPI_APEI && X86 | 14 | depends on ACPI_APEI && X86 |
15 | select ACPI_HED | 15 | select ACPI_HED |
16 | select IRQ_WORK | 16 | select IRQ_WORK |
17 | select LLIST | ||
18 | select GENERIC_ALLOCATOR | 17 | select GENERIC_ALLOCATOR |
19 | help | 18 | help |
20 | Generic Hardware Error Source provides a way to report | 19 | Generic Hardware Error Source provides a way to report |
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index 4fa09d4d0b71..6a9e8f5399e2 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h | |||
@@ -1,20 +1,23 @@ | |||
1 | #ifndef _LINUX_IRQ_WORK_H | 1 | #ifndef _LINUX_IRQ_WORK_H |
2 | #define _LINUX_IRQ_WORK_H | 2 | #define _LINUX_IRQ_WORK_H |
3 | 3 | ||
4 | #include <linux/llist.h> | ||
5 | |||
4 | struct irq_work { | 6 | struct irq_work { |
5 | struct irq_work *next; | 7 | unsigned long flags; |
8 | struct llist_node llnode; | ||
6 | void (*func)(struct irq_work *); | 9 | void (*func)(struct irq_work *); |
7 | }; | 10 | }; |
8 | 11 | ||
9 | static inline | 12 | static inline |
10 | void init_irq_work(struct irq_work *entry, void (*func)(struct irq_work *)) | 13 | void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *)) |
11 | { | 14 | { |
12 | entry->next = NULL; | 15 | work->flags = 0; |
13 | entry->func = func; | 16 | work->func = func; |
14 | } | 17 | } |
15 | 18 | ||
16 | bool irq_work_queue(struct irq_work *entry); | 19 | bool irq_work_queue(struct irq_work *work); |
17 | void irq_work_run(void); | 20 | void irq_work_run(void); |
18 | void irq_work_sync(struct irq_work *entry); | 21 | void irq_work_sync(struct irq_work *work); |
19 | 22 | ||
20 | #endif /* _LINUX_IRQ_WORK_H */ | 23 | #endif /* _LINUX_IRQ_WORK_H */ |
diff --git a/include/linux/llist.h b/include/linux/llist.h index aa0c8b5b3cd0..7287734e08d1 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h | |||
@@ -35,10 +35,30 @@ | |||
35 | * | 35 | * |
36 | * The basic atomic operation of this list is cmpxchg on long. On | 36 | * The basic atomic operation of this list is cmpxchg on long. On |
37 | * architectures that don't have NMI-safe cmpxchg implementation, the | 37 | * architectures that don't have NMI-safe cmpxchg implementation, the |
38 | * list can NOT be used in NMI handler. So code uses the list in NMI | 38 | * list can NOT be used in NMI handlers. So code that uses the list in |
39 | * handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. | 39 | * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. |
40 | * | ||
41 | * Copyright 2010,2011 Intel Corp. | ||
42 | * Author: Huang Ying <ying.huang@intel.com> | ||
43 | * | ||
44 | * This program is free software; you can redistribute it and/or | ||
45 | * modify it under the terms of the GNU General Public License version | ||
46 | * 2 as published by the Free Software Foundation; | ||
47 | * | ||
48 | * This program is distributed in the hope that it will be useful, | ||
49 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
50 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
51 | * GNU General Public License for more details. | ||
52 | * | ||
53 | * You should have received a copy of the GNU General Public License | ||
54 | * along with this program; if not, write to the Free Software | ||
55 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
40 | */ | 56 | */ |
41 | 57 | ||
58 | #include <linux/kernel.h> | ||
59 | #include <asm/system.h> | ||
60 | #include <asm/processor.h> | ||
61 | |||
42 | struct llist_head { | 62 | struct llist_head { |
43 | struct llist_node *first; | 63 | struct llist_node *first; |
44 | }; | 64 | }; |
@@ -113,14 +133,55 @@ static inline void init_llist_head(struct llist_head *list) | |||
113 | * test whether the list is empty without deleting something from the | 133 | * test whether the list is empty without deleting something from the |
114 | * list. | 134 | * list. |
115 | */ | 135 | */ |
116 | static inline int llist_empty(const struct llist_head *head) | 136 | static inline bool llist_empty(const struct llist_head *head) |
117 | { | 137 | { |
118 | return ACCESS_ONCE(head->first) == NULL; | 138 | return ACCESS_ONCE(head->first) == NULL; |
119 | } | 139 | } |
120 | 140 | ||
121 | void llist_add(struct llist_node *new, struct llist_head *head); | 141 | static inline struct llist_node *llist_next(struct llist_node *node) |
122 | void llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, | 142 | { |
123 | struct llist_head *head); | 143 | return node->next; |
124 | struct llist_node *llist_del_first(struct llist_head *head); | 144 | } |
125 | struct llist_node *llist_del_all(struct llist_head *head); | 145 | |
146 | /** | ||
147 | * llist_add - add a new entry | ||
148 | * @new: new entry to be added | ||
149 | * @head: the head for your lock-less list | ||
150 | * | ||
151 | * Return whether list is empty before adding. | ||
152 | */ | ||
153 | static inline bool llist_add(struct llist_node *new, struct llist_head *head) | ||
154 | { | ||
155 | struct llist_node *entry, *old_entry; | ||
156 | |||
157 | entry = head->first; | ||
158 | for (;;) { | ||
159 | old_entry = entry; | ||
160 | new->next = entry; | ||
161 | entry = cmpxchg(&head->first, old_entry, new); | ||
162 | if (entry == old_entry) | ||
163 | break; | ||
164 | } | ||
165 | |||
166 | return old_entry == NULL; | ||
167 | } | ||
168 | |||
169 | /** | ||
170 | * llist_del_all - delete all entries from lock-less list | ||
171 | * @head: the head of lock-less list to delete all entries | ||
172 | * | ||
173 | * If list is empty, return NULL, otherwise, delete all entries and | ||
174 | * return the pointer to the first entry. The order of entries | ||
175 | * deleted is from the newest to the oldest added one. | ||
176 | */ | ||
177 | static inline struct llist_node *llist_del_all(struct llist_head *head) | ||
178 | { | ||
179 | return xchg(&head->first, NULL); | ||
180 | } | ||
181 | |||
182 | extern bool llist_add_batch(struct llist_node *new_first, | ||
183 | struct llist_node *new_last, | ||
184 | struct llist_head *head); | ||
185 | extern struct llist_node *llist_del_first(struct llist_head *head); | ||
186 | |||
126 | #endif /* LLIST_H */ | 187 | #endif /* LLIST_H */ |
diff --git a/include/linux/sched.h b/include/linux/sched.h index ede8a6585e38..e8acce717d2a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -90,6 +90,7 @@ struct sched_param { | |||
90 | #include <linux/task_io_accounting.h> | 90 | #include <linux/task_io_accounting.h> |
91 | #include <linux/latencytop.h> | 91 | #include <linux/latencytop.h> |
92 | #include <linux/cred.h> | 92 | #include <linux/cred.h> |
93 | #include <linux/llist.h> | ||
93 | 94 | ||
94 | #include <asm/processor.h> | 95 | #include <asm/processor.h> |
95 | 96 | ||
@@ -1224,7 +1225,7 @@ struct task_struct { | |||
1224 | unsigned int ptrace; | 1225 | unsigned int ptrace; |
1225 | 1226 | ||
1226 | #ifdef CONFIG_SMP | 1227 | #ifdef CONFIG_SMP |
1227 | struct task_struct *wake_entry; | 1228 | struct llist_node wake_entry; |
1228 | int on_cpu; | 1229 | int on_cpu; |
1229 | #endif | 1230 | #endif |
1230 | int on_rq; | 1231 | int on_rq; |
@@ -2035,6 +2036,10 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { } | |||
2035 | static inline void sched_autogroup_exit(struct signal_struct *sig) { } | 2036 | static inline void sched_autogroup_exit(struct signal_struct *sig) { } |
2036 | #endif | 2037 | #endif |
2037 | 2038 | ||
2039 | #ifdef CONFIG_CFS_BANDWIDTH | ||
2040 | extern unsigned int sysctl_sched_cfs_bandwidth_slice; | ||
2041 | #endif | ||
2042 | |||
2038 | #ifdef CONFIG_RT_MUTEXES | 2043 | #ifdef CONFIG_RT_MUTEXES |
2039 | extern int rt_mutex_getprio(struct task_struct *p); | 2044 | extern int rt_mutex_getprio(struct task_struct *p); |
2040 | extern void rt_mutex_setprio(struct task_struct *p, int prio); | 2045 | extern void rt_mutex_setprio(struct task_struct *p, int prio); |
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index f6334782a593..959ff18b63b6 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h | |||
@@ -100,7 +100,7 @@ static inline long __trace_sched_switch_state(struct task_struct *p) | |||
100 | * For all intents and purposes a preempted task is a running task. | 100 | * For all intents and purposes a preempted task is a running task. |
101 | */ | 101 | */ |
102 | if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE) | 102 | if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE) |
103 | state = TASK_RUNNING; | 103 | state = TASK_RUNNING | TASK_STATE_MAX; |
104 | #endif | 104 | #endif |
105 | 105 | ||
106 | return state; | 106 | return state; |
@@ -137,13 +137,14 @@ TRACE_EVENT(sched_switch, | |||
137 | __entry->next_prio = next->prio; | 137 | __entry->next_prio = next->prio; |
138 | ), | 138 | ), |
139 | 139 | ||
140 | TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s ==> next_comm=%s next_pid=%d next_prio=%d", | 140 | TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", |
141 | __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, | 141 | __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, |
142 | __entry->prev_state ? | 142 | __entry->prev_state & (TASK_STATE_MAX-1) ? |
143 | __print_flags(__entry->prev_state, "|", | 143 | __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|", |
144 | { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, | 144 | { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, |
145 | { 16, "Z" }, { 32, "X" }, { 64, "x" }, | 145 | { 16, "Z" }, { 32, "X" }, { 64, "x" }, |
146 | { 128, "W" }) : "R", | 146 | { 128, "W" }) : "R", |
147 | __entry->prev_state & TASK_STATE_MAX ? "+" : "", | ||
147 | __entry->next_comm, __entry->next_pid, __entry->next_prio) | 148 | __entry->next_comm, __entry->next_pid, __entry->next_prio) |
148 | ); | 149 | ); |
149 | 150 | ||
diff --git a/init/Kconfig b/init/Kconfig index dc7e27bf89a8..31ba0fd0f36b 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -715,6 +715,18 @@ config FAIR_GROUP_SCHED | |||
715 | depends on CGROUP_SCHED | 715 | depends on CGROUP_SCHED |
716 | default CGROUP_SCHED | 716 | default CGROUP_SCHED |
717 | 717 | ||
718 | config CFS_BANDWIDTH | ||
719 | bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED" | ||
720 | depends on EXPERIMENTAL | ||
721 | depends on FAIR_GROUP_SCHED | ||
722 | default n | ||
723 | help | ||
724 | This option allows users to define CPU bandwidth rates (limits) for | ||
725 | tasks running within the fair group scheduler. Groups with no limit | ||
726 | set are considered to be unconstrained and will run with no | ||
727 | restriction. | ||
728 | See Documentation/scheduler/sched-bwc.txt for more information. | ||
729 | |||
718 | config RT_GROUP_SCHED | 730 | config RT_GROUP_SCHED |
719 | bool "Group scheduling for SCHED_RR/FIFO" | 731 | bool "Group scheduling for SCHED_RR/FIFO" |
720 | depends on EXPERIMENTAL | 732 | depends on EXPERIMENTAL |
diff --git a/kernel/irq_work.c b/kernel/irq_work.c index c58fa7da8aef..0e2cde4f380b 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c | |||
@@ -17,54 +17,34 @@ | |||
17 | * claimed NULL, 3 -> {pending} : claimed to be enqueued | 17 | * claimed NULL, 3 -> {pending} : claimed to be enqueued |
18 | * pending next, 3 -> {busy} : queued, pending callback | 18 | * pending next, 3 -> {busy} : queued, pending callback |
19 | * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed | 19 | * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed |
20 | * | ||
21 | * We use the lower two bits of the next pointer to keep PENDING and BUSY | ||
22 | * flags. | ||
23 | */ | 20 | */ |
24 | 21 | ||
25 | #define IRQ_WORK_PENDING 1UL | 22 | #define IRQ_WORK_PENDING 1UL |
26 | #define IRQ_WORK_BUSY 2UL | 23 | #define IRQ_WORK_BUSY 2UL |
27 | #define IRQ_WORK_FLAGS 3UL | 24 | #define IRQ_WORK_FLAGS 3UL |
28 | 25 | ||
29 | static inline bool irq_work_is_set(struct irq_work *entry, int flags) | 26 | static DEFINE_PER_CPU(struct llist_head, irq_work_list); |
30 | { | ||
31 | return (unsigned long)entry->next & flags; | ||
32 | } | ||
33 | |||
34 | static inline struct irq_work *irq_work_next(struct irq_work *entry) | ||
35 | { | ||
36 | unsigned long next = (unsigned long)entry->next; | ||
37 | next &= ~IRQ_WORK_FLAGS; | ||
38 | return (struct irq_work *)next; | ||
39 | } | ||
40 | |||
41 | static inline struct irq_work *next_flags(struct irq_work *entry, int flags) | ||
42 | { | ||
43 | unsigned long next = (unsigned long)entry; | ||
44 | next |= flags; | ||
45 | return (struct irq_work *)next; | ||
46 | } | ||
47 | |||
48 | static DEFINE_PER_CPU(struct irq_work *, irq_work_list); | ||
49 | 27 | ||
50 | /* | 28 | /* |
51 | * Claim the entry so that no one else will poke at it. | 29 | * Claim the entry so that no one else will poke at it. |
52 | */ | 30 | */ |
53 | static bool irq_work_claim(struct irq_work *entry) | 31 | static bool irq_work_claim(struct irq_work *work) |
54 | { | 32 | { |
55 | struct irq_work *next, *nflags; | 33 | unsigned long flags, nflags; |
56 | 34 | ||
57 | do { | 35 | for (;;) { |
58 | next = entry->next; | 36 | flags = work->flags; |
59 | if ((unsigned long)next & IRQ_WORK_PENDING) | 37 | if (flags & IRQ_WORK_PENDING) |
60 | return false; | 38 | return false; |
61 | nflags = next_flags(next, IRQ_WORK_FLAGS); | 39 | nflags = flags | IRQ_WORK_FLAGS; |
62 | } while (cmpxchg(&entry->next, next, nflags) != next); | 40 | if (cmpxchg(&work->flags, flags, nflags) == flags) |
41 | break; | ||
42 | cpu_relax(); | ||
43 | } | ||
63 | 44 | ||
64 | return true; | 45 | return true; |
65 | } | 46 | } |
66 | 47 | ||
67 | |||
68 | void __weak arch_irq_work_raise(void) | 48 | void __weak arch_irq_work_raise(void) |
69 | { | 49 | { |
70 | /* | 50 | /* |
@@ -75,20 +55,15 @@ void __weak arch_irq_work_raise(void) | |||
75 | /* | 55 | /* |
76 | * Queue the entry and raise the IPI if needed. | 56 | * Queue the entry and raise the IPI if needed. |
77 | */ | 57 | */ |
78 | static void __irq_work_queue(struct irq_work *entry) | 58 | static void __irq_work_queue(struct irq_work *work) |
79 | { | 59 | { |
80 | struct irq_work *next; | 60 | bool empty; |
81 | 61 | ||
82 | preempt_disable(); | 62 | preempt_disable(); |
83 | 63 | ||
84 | do { | 64 | empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); |
85 | next = __this_cpu_read(irq_work_list); | ||
86 | /* Can assign non-atomic because we keep the flags set. */ | ||
87 | entry->next = next_flags(next, IRQ_WORK_FLAGS); | ||
88 | } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next); | ||
89 | |||
90 | /* The list was empty, raise self-interrupt to start processing. */ | 65 | /* The list was empty, raise self-interrupt to start processing. */ |
91 | if (!irq_work_next(entry)) | 66 | if (empty) |
92 | arch_irq_work_raise(); | 67 | arch_irq_work_raise(); |
93 | 68 | ||
94 | preempt_enable(); | 69 | preempt_enable(); |
@@ -100,16 +75,16 @@ static void __irq_work_queue(struct irq_work *entry) | |||
100 | * | 75 | * |
101 | * Can be re-enqueued while the callback is still in progress. | 76 | * Can be re-enqueued while the callback is still in progress. |
102 | */ | 77 | */ |
103 | bool irq_work_queue(struct irq_work *entry) | 78 | bool irq_work_queue(struct irq_work *work) |
104 | { | 79 | { |
105 | if (!irq_work_claim(entry)) { | 80 | if (!irq_work_claim(work)) { |
106 | /* | 81 | /* |
107 | * Already enqueued, can't do! | 82 | * Already enqueued, can't do! |
108 | */ | 83 | */ |
109 | return false; | 84 | return false; |
110 | } | 85 | } |
111 | 86 | ||
112 | __irq_work_queue(entry); | 87 | __irq_work_queue(work); |
113 | return true; | 88 | return true; |
114 | } | 89 | } |
115 | EXPORT_SYMBOL_GPL(irq_work_queue); | 90 | EXPORT_SYMBOL_GPL(irq_work_queue); |
@@ -120,34 +95,34 @@ EXPORT_SYMBOL_GPL(irq_work_queue); | |||
120 | */ | 95 | */ |
121 | void irq_work_run(void) | 96 | void irq_work_run(void) |
122 | { | 97 | { |
123 | struct irq_work *list; | 98 | struct irq_work *work; |
99 | struct llist_head *this_list; | ||
100 | struct llist_node *llnode; | ||
124 | 101 | ||
125 | if (this_cpu_read(irq_work_list) == NULL) | 102 | this_list = &__get_cpu_var(irq_work_list); |
103 | if (llist_empty(this_list)) | ||
126 | return; | 104 | return; |
127 | 105 | ||
128 | BUG_ON(!in_irq()); | 106 | BUG_ON(!in_irq()); |
129 | BUG_ON(!irqs_disabled()); | 107 | BUG_ON(!irqs_disabled()); |
130 | 108 | ||
131 | list = this_cpu_xchg(irq_work_list, NULL); | 109 | llnode = llist_del_all(this_list); |
132 | 110 | while (llnode != NULL) { | |
133 | while (list != NULL) { | 111 | work = llist_entry(llnode, struct irq_work, llnode); |
134 | struct irq_work *entry = list; | ||
135 | 112 | ||
136 | list = irq_work_next(list); | 113 | llnode = llist_next(llnode); |
137 | 114 | ||
138 | /* | 115 | /* |
139 | * Clear the PENDING bit, after this point the @entry | 116 | * Clear the PENDING bit, after this point the @work |
140 | * can be re-used. | 117 | * can be re-used. |
141 | */ | 118 | */ |
142 | entry->next = next_flags(NULL, IRQ_WORK_BUSY); | 119 | work->flags = IRQ_WORK_BUSY; |
143 | entry->func(entry); | 120 | work->func(work); |
144 | /* | 121 | /* |
145 | * Clear the BUSY bit and return to the free state if | 122 | * Clear the BUSY bit and return to the free state if |
146 | * no-one else claimed it meanwhile. | 123 | * no-one else claimed it meanwhile. |
147 | */ | 124 | */ |
148 | (void)cmpxchg(&entry->next, | 125 | (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0); |
149 | next_flags(NULL, IRQ_WORK_BUSY), | ||
150 | NULL); | ||
151 | } | 126 | } |
152 | } | 127 | } |
153 | EXPORT_SYMBOL_GPL(irq_work_run); | 128 | EXPORT_SYMBOL_GPL(irq_work_run); |
@@ -156,11 +131,11 @@ EXPORT_SYMBOL_GPL(irq_work_run); | |||
156 | * Synchronize against the irq_work @entry, ensures the entry is not | 131 | * Synchronize against the irq_work @entry, ensures the entry is not |
157 | * currently in use. | 132 | * currently in use. |
158 | */ | 133 | */ |
159 | void irq_work_sync(struct irq_work *entry) | 134 | void irq_work_sync(struct irq_work *work) |
160 | { | 135 | { |
161 | WARN_ON_ONCE(irqs_disabled()); | 136 | WARN_ON_ONCE(irqs_disabled()); |
162 | 137 | ||
163 | while (irq_work_is_set(entry, IRQ_WORK_BUSY)) | 138 | while (work->flags & IRQ_WORK_BUSY) |
164 | cpu_relax(); | 139 | cpu_relax(); |
165 | } | 140 | } |
166 | EXPORT_SYMBOL_GPL(irq_work_sync); | 141 | EXPORT_SYMBOL_GPL(irq_work_sync); |
diff --git a/kernel/sched.c b/kernel/sched.c index 03ad0113801a..d87c6e5d4e8c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -196,10 +196,28 @@ static inline int rt_bandwidth_enabled(void) | |||
196 | return sysctl_sched_rt_runtime >= 0; | 196 | return sysctl_sched_rt_runtime >= 0; |
197 | } | 197 | } |
198 | 198 | ||
199 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | 199 | static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) |
200 | { | 200 | { |
201 | ktime_t now; | 201 | unsigned long delta; |
202 | ktime_t soft, hard, now; | ||
203 | |||
204 | for (;;) { | ||
205 | if (hrtimer_active(period_timer)) | ||
206 | break; | ||
207 | |||
208 | now = hrtimer_cb_get_time(period_timer); | ||
209 | hrtimer_forward(period_timer, now, period); | ||
202 | 210 | ||
211 | soft = hrtimer_get_softexpires(period_timer); | ||
212 | hard = hrtimer_get_expires(period_timer); | ||
213 | delta = ktime_to_ns(ktime_sub(hard, soft)); | ||
214 | __hrtimer_start_range_ns(period_timer, soft, delta, | ||
215 | HRTIMER_MODE_ABS_PINNED, 0); | ||
216 | } | ||
217 | } | ||
218 | |||
219 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
220 | { | ||
203 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) | 221 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) |
204 | return; | 222 | return; |
205 | 223 | ||
@@ -207,22 +225,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
207 | return; | 225 | return; |
208 | 226 | ||
209 | raw_spin_lock(&rt_b->rt_runtime_lock); | 227 | raw_spin_lock(&rt_b->rt_runtime_lock); |
210 | for (;;) { | 228 | start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); |
211 | unsigned long delta; | ||
212 | ktime_t soft, hard; | ||
213 | |||
214 | if (hrtimer_active(&rt_b->rt_period_timer)) | ||
215 | break; | ||
216 | |||
217 | now = hrtimer_cb_get_time(&rt_b->rt_period_timer); | ||
218 | hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); | ||
219 | |||
220 | soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); | ||
221 | hard = hrtimer_get_expires(&rt_b->rt_period_timer); | ||
222 | delta = ktime_to_ns(ktime_sub(hard, soft)); | ||
223 | __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, | ||
224 | HRTIMER_MODE_ABS_PINNED, 0); | ||
225 | } | ||
226 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 229 | raw_spin_unlock(&rt_b->rt_runtime_lock); |
227 | } | 230 | } |
228 | 231 | ||
@@ -247,6 +250,24 @@ struct cfs_rq; | |||
247 | 250 | ||
248 | static LIST_HEAD(task_groups); | 251 | static LIST_HEAD(task_groups); |
249 | 252 | ||
253 | struct cfs_bandwidth { | ||
254 | #ifdef CONFIG_CFS_BANDWIDTH | ||
255 | raw_spinlock_t lock; | ||
256 | ktime_t period; | ||
257 | u64 quota, runtime; | ||
258 | s64 hierarchal_quota; | ||
259 | u64 runtime_expires; | ||
260 | |||
261 | int idle, timer_active; | ||
262 | struct hrtimer period_timer, slack_timer; | ||
263 | struct list_head throttled_cfs_rq; | ||
264 | |||
265 | /* statistics */ | ||
266 | int nr_periods, nr_throttled; | ||
267 | u64 throttled_time; | ||
268 | #endif | ||
269 | }; | ||
270 | |||
250 | /* task group related information */ | 271 | /* task group related information */ |
251 | struct task_group { | 272 | struct task_group { |
252 | struct cgroup_subsys_state css; | 273 | struct cgroup_subsys_state css; |
@@ -278,6 +299,8 @@ struct task_group { | |||
278 | #ifdef CONFIG_SCHED_AUTOGROUP | 299 | #ifdef CONFIG_SCHED_AUTOGROUP |
279 | struct autogroup *autogroup; | 300 | struct autogroup *autogroup; |
280 | #endif | 301 | #endif |
302 | |||
303 | struct cfs_bandwidth cfs_bandwidth; | ||
281 | }; | 304 | }; |
282 | 305 | ||
283 | /* task_group_lock serializes the addition/removal of task groups */ | 306 | /* task_group_lock serializes the addition/removal of task groups */ |
@@ -311,7 +334,7 @@ struct task_group root_task_group; | |||
311 | /* CFS-related fields in a runqueue */ | 334 | /* CFS-related fields in a runqueue */ |
312 | struct cfs_rq { | 335 | struct cfs_rq { |
313 | struct load_weight load; | 336 | struct load_weight load; |
314 | unsigned long nr_running; | 337 | unsigned long nr_running, h_nr_running; |
315 | 338 | ||
316 | u64 exec_clock; | 339 | u64 exec_clock; |
317 | u64 min_vruntime; | 340 | u64 min_vruntime; |
@@ -377,9 +400,120 @@ struct cfs_rq { | |||
377 | 400 | ||
378 | unsigned long load_contribution; | 401 | unsigned long load_contribution; |
379 | #endif | 402 | #endif |
403 | #ifdef CONFIG_CFS_BANDWIDTH | ||
404 | int runtime_enabled; | ||
405 | u64 runtime_expires; | ||
406 | s64 runtime_remaining; | ||
407 | |||
408 | u64 throttled_timestamp; | ||
409 | int throttled, throttle_count; | ||
410 | struct list_head throttled_list; | ||
411 | #endif | ||
380 | #endif | 412 | #endif |
381 | }; | 413 | }; |
382 | 414 | ||
415 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
416 | #ifdef CONFIG_CFS_BANDWIDTH | ||
417 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
418 | { | ||
419 | return &tg->cfs_bandwidth; | ||
420 | } | ||
421 | |||
422 | static inline u64 default_cfs_period(void); | ||
423 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); | ||
424 | static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); | ||
425 | |||
426 | static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) | ||
427 | { | ||
428 | struct cfs_bandwidth *cfs_b = | ||
429 | container_of(timer, struct cfs_bandwidth, slack_timer); | ||
430 | do_sched_cfs_slack_timer(cfs_b); | ||
431 | |||
432 | return HRTIMER_NORESTART; | ||
433 | } | ||
434 | |||
435 | static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) | ||
436 | { | ||
437 | struct cfs_bandwidth *cfs_b = | ||
438 | container_of(timer, struct cfs_bandwidth, period_timer); | ||
439 | ktime_t now; | ||
440 | int overrun; | ||
441 | int idle = 0; | ||
442 | |||
443 | for (;;) { | ||
444 | now = hrtimer_cb_get_time(timer); | ||
445 | overrun = hrtimer_forward(timer, now, cfs_b->period); | ||
446 | |||
447 | if (!overrun) | ||
448 | break; | ||
449 | |||
450 | idle = do_sched_cfs_period_timer(cfs_b, overrun); | ||
451 | } | ||
452 | |||
453 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | ||
454 | } | ||
455 | |||
456 | static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
457 | { | ||
458 | raw_spin_lock_init(&cfs_b->lock); | ||
459 | cfs_b->runtime = 0; | ||
460 | cfs_b->quota = RUNTIME_INF; | ||
461 | cfs_b->period = ns_to_ktime(default_cfs_period()); | ||
462 | |||
463 | INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); | ||
464 | hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
465 | cfs_b->period_timer.function = sched_cfs_period_timer; | ||
466 | hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
467 | cfs_b->slack_timer.function = sched_cfs_slack_timer; | ||
468 | } | ||
469 | |||
470 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
471 | { | ||
472 | cfs_rq->runtime_enabled = 0; | ||
473 | INIT_LIST_HEAD(&cfs_rq->throttled_list); | ||
474 | } | ||
475 | |||
476 | /* requires cfs_b->lock, may release to reprogram timer */ | ||
477 | static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
478 | { | ||
479 | /* | ||
480 | * The timer may be active because we're trying to set a new bandwidth | ||
481 | * period or because we're racing with the tear-down path | ||
482 | * (timer_active==0 becomes visible before the hrtimer call-back | ||
483 | * terminates). In either case we ensure that it's re-programmed | ||
484 | */ | ||
485 | while (unlikely(hrtimer_active(&cfs_b->period_timer))) { | ||
486 | raw_spin_unlock(&cfs_b->lock); | ||
487 | /* ensure cfs_b->lock is available while we wait */ | ||
488 | hrtimer_cancel(&cfs_b->period_timer); | ||
489 | |||
490 | raw_spin_lock(&cfs_b->lock); | ||
491 | /* if someone else restarted the timer then we're done */ | ||
492 | if (cfs_b->timer_active) | ||
493 | return; | ||
494 | } | ||
495 | |||
496 | cfs_b->timer_active = 1; | ||
497 | start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); | ||
498 | } | ||
499 | |||
500 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
501 | { | ||
502 | hrtimer_cancel(&cfs_b->period_timer); | ||
503 | hrtimer_cancel(&cfs_b->slack_timer); | ||
504 | } | ||
505 | #else | ||
506 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | ||
507 | static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
508 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
509 | |||
510 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
511 | { | ||
512 | return NULL; | ||
513 | } | ||
514 | #endif /* CONFIG_CFS_BANDWIDTH */ | ||
515 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
516 | |||
383 | /* Real-Time classes' related field in a runqueue: */ | 517 | /* Real-Time classes' related field in a runqueue: */ |
384 | struct rt_rq { | 518 | struct rt_rq { |
385 | struct rt_prio_array active; | 519 | struct rt_prio_array active; |
@@ -510,7 +644,7 @@ struct rq { | |||
510 | 644 | ||
511 | unsigned long cpu_power; | 645 | unsigned long cpu_power; |
512 | 646 | ||
513 | unsigned char idle_at_tick; | 647 | unsigned char idle_balance; |
514 | /* For active balancing */ | 648 | /* For active balancing */ |
515 | int post_schedule; | 649 | int post_schedule; |
516 | int active_balance; | 650 | int active_balance; |
@@ -520,8 +654,6 @@ struct rq { | |||
520 | int cpu; | 654 | int cpu; |
521 | int online; | 655 | int online; |
522 | 656 | ||
523 | unsigned long avg_load_per_task; | ||
524 | |||
525 | u64 rt_avg; | 657 | u64 rt_avg; |
526 | u64 age_stamp; | 658 | u64 age_stamp; |
527 | u64 idle_stamp; | 659 | u64 idle_stamp; |
@@ -570,7 +702,7 @@ struct rq { | |||
570 | #endif | 702 | #endif |
571 | 703 | ||
572 | #ifdef CONFIG_SMP | 704 | #ifdef CONFIG_SMP |
573 | struct task_struct *wake_list; | 705 | struct llist_head wake_list; |
574 | #endif | 706 | #endif |
575 | }; | 707 | }; |
576 | 708 | ||
@@ -1272,6 +1404,18 @@ void wake_up_idle_cpu(int cpu) | |||
1272 | smp_send_reschedule(cpu); | 1404 | smp_send_reschedule(cpu); |
1273 | } | 1405 | } |
1274 | 1406 | ||
1407 | static inline bool got_nohz_idle_kick(void) | ||
1408 | { | ||
1409 | return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick; | ||
1410 | } | ||
1411 | |||
1412 | #else /* CONFIG_NO_HZ */ | ||
1413 | |||
1414 | static inline bool got_nohz_idle_kick(void) | ||
1415 | { | ||
1416 | return false; | ||
1417 | } | ||
1418 | |||
1275 | #endif /* CONFIG_NO_HZ */ | 1419 | #endif /* CONFIG_NO_HZ */ |
1276 | 1420 | ||
1277 | static u64 sched_avg_period(void) | 1421 | static u64 sched_avg_period(void) |
@@ -1471,24 +1615,28 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) | |||
1471 | update_load_sub(&rq->load, load); | 1615 | update_load_sub(&rq->load, load); |
1472 | } | 1616 | } |
1473 | 1617 | ||
1474 | #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) | 1618 | #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ |
1619 | (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) | ||
1475 | typedef int (*tg_visitor)(struct task_group *, void *); | 1620 | typedef int (*tg_visitor)(struct task_group *, void *); |
1476 | 1621 | ||
1477 | /* | 1622 | /* |
1478 | * Iterate the full tree, calling @down when first entering a node and @up when | 1623 | * Iterate task_group tree rooted at *from, calling @down when first entering a |
1479 | * leaving it for the final time. | 1624 | * node and @up when leaving it for the final time. |
1625 | * | ||
1626 | * Caller must hold rcu_read_lock() or sufficient equivalent. | ||
1480 | */ | 1627 | */ |
1481 | static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) | 1628 | static int walk_tg_tree_from(struct task_group *from, |
1629 | tg_visitor down, tg_visitor up, void *data) | ||
1482 | { | 1630 | { |
1483 | struct task_group *parent, *child; | 1631 | struct task_group *parent, *child; |
1484 | int ret; | 1632 | int ret; |
1485 | 1633 | ||
1486 | rcu_read_lock(); | 1634 | parent = from; |
1487 | parent = &root_task_group; | 1635 | |
1488 | down: | 1636 | down: |
1489 | ret = (*down)(parent, data); | 1637 | ret = (*down)(parent, data); |
1490 | if (ret) | 1638 | if (ret) |
1491 | goto out_unlock; | 1639 | goto out; |
1492 | list_for_each_entry_rcu(child, &parent->children, siblings) { | 1640 | list_for_each_entry_rcu(child, &parent->children, siblings) { |
1493 | parent = child; | 1641 | parent = child; |
1494 | goto down; | 1642 | goto down; |
@@ -1497,19 +1645,29 @@ up: | |||
1497 | continue; | 1645 | continue; |
1498 | } | 1646 | } |
1499 | ret = (*up)(parent, data); | 1647 | ret = (*up)(parent, data); |
1500 | if (ret) | 1648 | if (ret || parent == from) |
1501 | goto out_unlock; | 1649 | goto out; |
1502 | 1650 | ||
1503 | child = parent; | 1651 | child = parent; |
1504 | parent = parent->parent; | 1652 | parent = parent->parent; |
1505 | if (parent) | 1653 | if (parent) |
1506 | goto up; | 1654 | goto up; |
1507 | out_unlock: | 1655 | out: |
1508 | rcu_read_unlock(); | ||
1509 | |||
1510 | return ret; | 1656 | return ret; |
1511 | } | 1657 | } |
1512 | 1658 | ||
1659 | /* | ||
1660 | * Iterate the full tree, calling @down when first entering a node and @up when | ||
1661 | * leaving it for the final time. | ||
1662 | * | ||
1663 | * Caller must hold rcu_read_lock() or sufficient equivalent. | ||
1664 | */ | ||
1665 | |||
1666 | static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) | ||
1667 | { | ||
1668 | return walk_tg_tree_from(&root_task_group, down, up, data); | ||
1669 | } | ||
1670 | |||
1513 | static int tg_nop(struct task_group *tg, void *data) | 1671 | static int tg_nop(struct task_group *tg, void *data) |
1514 | { | 1672 | { |
1515 | return 0; | 1673 | return 0; |
@@ -1569,11 +1727,9 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
1569 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); | 1727 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); |
1570 | 1728 | ||
1571 | if (nr_running) | 1729 | if (nr_running) |
1572 | rq->avg_load_per_task = rq->load.weight / nr_running; | 1730 | return rq->load.weight / nr_running; |
1573 | else | ||
1574 | rq->avg_load_per_task = 0; | ||
1575 | 1731 | ||
1576 | return rq->avg_load_per_task; | 1732 | return 0; |
1577 | } | 1733 | } |
1578 | 1734 | ||
1579 | #ifdef CONFIG_PREEMPT | 1735 | #ifdef CONFIG_PREEMPT |
@@ -1806,7 +1962,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags) | |||
1806 | rq->nr_uninterruptible--; | 1962 | rq->nr_uninterruptible--; |
1807 | 1963 | ||
1808 | enqueue_task(rq, p, flags); | 1964 | enqueue_task(rq, p, flags); |
1809 | inc_nr_running(rq); | ||
1810 | } | 1965 | } |
1811 | 1966 | ||
1812 | /* | 1967 | /* |
@@ -1818,7 +1973,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
1818 | rq->nr_uninterruptible++; | 1973 | rq->nr_uninterruptible++; |
1819 | 1974 | ||
1820 | dequeue_task(rq, p, flags); | 1975 | dequeue_task(rq, p, flags); |
1821 | dec_nr_running(rq); | ||
1822 | } | 1976 | } |
1823 | 1977 | ||
1824 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 1978 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
@@ -2390,11 +2544,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
2390 | 2544 | ||
2391 | /* Look for allowed, online CPU in same node. */ | 2545 | /* Look for allowed, online CPU in same node. */ |
2392 | for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) | 2546 | for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) |
2393 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) | 2547 | if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) |
2394 | return dest_cpu; | 2548 | return dest_cpu; |
2395 | 2549 | ||
2396 | /* Any allowed, online CPU? */ | 2550 | /* Any allowed, online CPU? */ |
2397 | dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); | 2551 | dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); |
2398 | if (dest_cpu < nr_cpu_ids) | 2552 | if (dest_cpu < nr_cpu_ids) |
2399 | return dest_cpu; | 2553 | return dest_cpu; |
2400 | 2554 | ||
@@ -2431,7 +2585,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) | |||
2431 | * [ this allows ->select_task() to simply return task_cpu(p) and | 2585 | * [ this allows ->select_task() to simply return task_cpu(p) and |
2432 | * not worry about this generic constraint ] | 2586 | * not worry about this generic constraint ] |
2433 | */ | 2587 | */ |
2434 | if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || | 2588 | if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || |
2435 | !cpu_online(cpu))) | 2589 | !cpu_online(cpu))) |
2436 | cpu = select_fallback_rq(task_cpu(p), p); | 2590 | cpu = select_fallback_rq(task_cpu(p), p); |
2437 | 2591 | ||
@@ -2556,42 +2710,26 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) | |||
2556 | } | 2710 | } |
2557 | 2711 | ||
2558 | #ifdef CONFIG_SMP | 2712 | #ifdef CONFIG_SMP |
2559 | static void sched_ttwu_do_pending(struct task_struct *list) | 2713 | static void sched_ttwu_pending(void) |
2560 | { | 2714 | { |
2561 | struct rq *rq = this_rq(); | 2715 | struct rq *rq = this_rq(); |
2716 | struct llist_node *llist = llist_del_all(&rq->wake_list); | ||
2717 | struct task_struct *p; | ||
2562 | 2718 | ||
2563 | raw_spin_lock(&rq->lock); | 2719 | raw_spin_lock(&rq->lock); |
2564 | 2720 | ||
2565 | while (list) { | 2721 | while (llist) { |
2566 | struct task_struct *p = list; | 2722 | p = llist_entry(llist, struct task_struct, wake_entry); |
2567 | list = list->wake_entry; | 2723 | llist = llist_next(llist); |
2568 | ttwu_do_activate(rq, p, 0); | 2724 | ttwu_do_activate(rq, p, 0); |
2569 | } | 2725 | } |
2570 | 2726 | ||
2571 | raw_spin_unlock(&rq->lock); | 2727 | raw_spin_unlock(&rq->lock); |
2572 | } | 2728 | } |
2573 | 2729 | ||
2574 | #ifdef CONFIG_HOTPLUG_CPU | ||
2575 | |||
2576 | static void sched_ttwu_pending(void) | ||
2577 | { | ||
2578 | struct rq *rq = this_rq(); | ||
2579 | struct task_struct *list = xchg(&rq->wake_list, NULL); | ||
2580 | |||
2581 | if (!list) | ||
2582 | return; | ||
2583 | |||
2584 | sched_ttwu_do_pending(list); | ||
2585 | } | ||
2586 | |||
2587 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
2588 | |||
2589 | void scheduler_ipi(void) | 2730 | void scheduler_ipi(void) |
2590 | { | 2731 | { |
2591 | struct rq *rq = this_rq(); | 2732 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) |
2592 | struct task_struct *list = xchg(&rq->wake_list, NULL); | ||
2593 | |||
2594 | if (!list) | ||
2595 | return; | 2733 | return; |
2596 | 2734 | ||
2597 | /* | 2735 | /* |
@@ -2608,25 +2746,21 @@ void scheduler_ipi(void) | |||
2608 | * somewhat pessimize the simple resched case. | 2746 | * somewhat pessimize the simple resched case. |
2609 | */ | 2747 | */ |
2610 | irq_enter(); | 2748 | irq_enter(); |
2611 | sched_ttwu_do_pending(list); | 2749 | sched_ttwu_pending(); |
2750 | |||
2751 | /* | ||
2752 | * Check if someone kicked us for doing the nohz idle load balance. | ||
2753 | */ | ||
2754 | if (unlikely(got_nohz_idle_kick() && !need_resched())) { | ||
2755 | this_rq()->idle_balance = 1; | ||
2756 | raise_softirq_irqoff(SCHED_SOFTIRQ); | ||
2757 | } | ||
2612 | irq_exit(); | 2758 | irq_exit(); |
2613 | } | 2759 | } |
2614 | 2760 | ||
2615 | static void ttwu_queue_remote(struct task_struct *p, int cpu) | 2761 | static void ttwu_queue_remote(struct task_struct *p, int cpu) |
2616 | { | 2762 | { |
2617 | struct rq *rq = cpu_rq(cpu); | 2763 | if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) |
2618 | struct task_struct *next = rq->wake_list; | ||
2619 | |||
2620 | for (;;) { | ||
2621 | struct task_struct *old = next; | ||
2622 | |||
2623 | p->wake_entry = next; | ||
2624 | next = cmpxchg(&rq->wake_list, old, p); | ||
2625 | if (next == old) | ||
2626 | break; | ||
2627 | } | ||
2628 | |||
2629 | if (!next) | ||
2630 | smp_send_reschedule(cpu); | 2764 | smp_send_reschedule(cpu); |
2631 | } | 2765 | } |
2632 | 2766 | ||
@@ -2848,19 +2982,23 @@ void sched_fork(struct task_struct *p) | |||
2848 | p->state = TASK_RUNNING; | 2982 | p->state = TASK_RUNNING; |
2849 | 2983 | ||
2850 | /* | 2984 | /* |
2985 | * Make sure we do not leak PI boosting priority to the child. | ||
2986 | */ | ||
2987 | p->prio = current->normal_prio; | ||
2988 | |||
2989 | /* | ||
2851 | * Revert to default priority/policy on fork if requested. | 2990 | * Revert to default priority/policy on fork if requested. |
2852 | */ | 2991 | */ |
2853 | if (unlikely(p->sched_reset_on_fork)) { | 2992 | if (unlikely(p->sched_reset_on_fork)) { |
2854 | if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { | 2993 | if (task_has_rt_policy(p)) { |
2855 | p->policy = SCHED_NORMAL; | 2994 | p->policy = SCHED_NORMAL; |
2856 | p->normal_prio = p->static_prio; | ||
2857 | } | ||
2858 | |||
2859 | if (PRIO_TO_NICE(p->static_prio) < 0) { | ||
2860 | p->static_prio = NICE_TO_PRIO(0); | 2995 | p->static_prio = NICE_TO_PRIO(0); |
2861 | p->normal_prio = p->static_prio; | 2996 | p->rt_priority = 0; |
2862 | set_load_weight(p); | 2997 | } else if (PRIO_TO_NICE(p->static_prio) < 0) |
2863 | } | 2998 | p->static_prio = NICE_TO_PRIO(0); |
2999 | |||
3000 | p->prio = p->normal_prio = __normal_prio(p); | ||
3001 | set_load_weight(p); | ||
2864 | 3002 | ||
2865 | /* | 3003 | /* |
2866 | * We don't need the reset flag anymore after the fork. It has | 3004 | * We don't need the reset flag anymore after the fork. It has |
@@ -2869,11 +3007,6 @@ void sched_fork(struct task_struct *p) | |||
2869 | p->sched_reset_on_fork = 0; | 3007 | p->sched_reset_on_fork = 0; |
2870 | } | 3008 | } |
2871 | 3009 | ||
2872 | /* | ||
2873 | * Make sure we do not leak PI boosting priority to the child. | ||
2874 | */ | ||
2875 | p->prio = current->normal_prio; | ||
2876 | |||
2877 | if (!rt_prio(p->prio)) | 3010 | if (!rt_prio(p->prio)) |
2878 | p->sched_class = &fair_sched_class; | 3011 | p->sched_class = &fair_sched_class; |
2879 | 3012 | ||
@@ -4116,7 +4249,7 @@ void scheduler_tick(void) | |||
4116 | perf_event_task_tick(); | 4249 | perf_event_task_tick(); |
4117 | 4250 | ||
4118 | #ifdef CONFIG_SMP | 4251 | #ifdef CONFIG_SMP |
4119 | rq->idle_at_tick = idle_cpu(cpu); | 4252 | rq->idle_balance = idle_cpu(cpu); |
4120 | trigger_load_balance(rq, cpu); | 4253 | trigger_load_balance(rq, cpu); |
4121 | #endif | 4254 | #endif |
4122 | } | 4255 | } |
@@ -4240,7 +4373,7 @@ pick_next_task(struct rq *rq) | |||
4240 | * Optimization: we know that if all tasks are in | 4373 | * Optimization: we know that if all tasks are in |
4241 | * the fair class we can call that function directly: | 4374 | * the fair class we can call that function directly: |
4242 | */ | 4375 | */ |
4243 | if (likely(rq->nr_running == rq->cfs.nr_running)) { | 4376 | if (likely(rq->nr_running == rq->cfs.h_nr_running)) { |
4244 | p = fair_sched_class.pick_next_task(rq); | 4377 | p = fair_sched_class.pick_next_task(rq); |
4245 | if (likely(p)) | 4378 | if (likely(p)) |
4246 | return p; | 4379 | return p; |
@@ -5026,7 +5159,20 @@ EXPORT_SYMBOL(task_nice); | |||
5026 | */ | 5159 | */ |
5027 | int idle_cpu(int cpu) | 5160 | int idle_cpu(int cpu) |
5028 | { | 5161 | { |
5029 | return cpu_curr(cpu) == cpu_rq(cpu)->idle; | 5162 | struct rq *rq = cpu_rq(cpu); |
5163 | |||
5164 | if (rq->curr != rq->idle) | ||
5165 | return 0; | ||
5166 | |||
5167 | if (rq->nr_running) | ||
5168 | return 0; | ||
5169 | |||
5170 | #ifdef CONFIG_SMP | ||
5171 | if (!llist_empty(&rq->wake_list)) | ||
5172 | return 0; | ||
5173 | #endif | ||
5174 | |||
5175 | return 1; | ||
5030 | } | 5176 | } |
5031 | 5177 | ||
5032 | /** | 5178 | /** |
@@ -5876,7 +6022,7 @@ void show_state_filter(unsigned long state_filter) | |||
5876 | printk(KERN_INFO | 6022 | printk(KERN_INFO |
5877 | " task PC stack pid father\n"); | 6023 | " task PC stack pid father\n"); |
5878 | #endif | 6024 | #endif |
5879 | read_lock(&tasklist_lock); | 6025 | rcu_read_lock(); |
5880 | do_each_thread(g, p) { | 6026 | do_each_thread(g, p) { |
5881 | /* | 6027 | /* |
5882 | * reset the NMI-timeout, listing all files on a slow | 6028 | * reset the NMI-timeout, listing all files on a slow |
@@ -5892,7 +6038,7 @@ void show_state_filter(unsigned long state_filter) | |||
5892 | #ifdef CONFIG_SCHED_DEBUG | 6038 | #ifdef CONFIG_SCHED_DEBUG |
5893 | sysrq_sched_debug_show(); | 6039 | sysrq_sched_debug_show(); |
5894 | #endif | 6040 | #endif |
5895 | read_unlock(&tasklist_lock); | 6041 | rcu_read_unlock(); |
5896 | /* | 6042 | /* |
5897 | * Only show locks if all tasks are dumped: | 6043 | * Only show locks if all tasks are dumped: |
5898 | */ | 6044 | */ |
@@ -6007,10 +6153,9 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | |||
6007 | { | 6153 | { |
6008 | if (p->sched_class && p->sched_class->set_cpus_allowed) | 6154 | if (p->sched_class && p->sched_class->set_cpus_allowed) |
6009 | p->sched_class->set_cpus_allowed(p, new_mask); | 6155 | p->sched_class->set_cpus_allowed(p, new_mask); |
6010 | else { | 6156 | |
6011 | cpumask_copy(&p->cpus_allowed, new_mask); | 6157 | cpumask_copy(&p->cpus_allowed, new_mask); |
6012 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); | 6158 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); |
6013 | } | ||
6014 | } | 6159 | } |
6015 | 6160 | ||
6016 | /* | 6161 | /* |
@@ -6108,7 +6253,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
6108 | if (task_cpu(p) != src_cpu) | 6253 | if (task_cpu(p) != src_cpu) |
6109 | goto done; | 6254 | goto done; |
6110 | /* Affinity changed (again). */ | 6255 | /* Affinity changed (again). */ |
6111 | if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) | 6256 | if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) |
6112 | goto fail; | 6257 | goto fail; |
6113 | 6258 | ||
6114 | /* | 6259 | /* |
@@ -6189,6 +6334,30 @@ static void calc_global_load_remove(struct rq *rq) | |||
6189 | rq->calc_load_active = 0; | 6334 | rq->calc_load_active = 0; |
6190 | } | 6335 | } |
6191 | 6336 | ||
6337 | #ifdef CONFIG_CFS_BANDWIDTH | ||
6338 | static void unthrottle_offline_cfs_rqs(struct rq *rq) | ||
6339 | { | ||
6340 | struct cfs_rq *cfs_rq; | ||
6341 | |||
6342 | for_each_leaf_cfs_rq(rq, cfs_rq) { | ||
6343 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
6344 | |||
6345 | if (!cfs_rq->runtime_enabled) | ||
6346 | continue; | ||
6347 | |||
6348 | /* | ||
6349 | * clock_task is not advancing so we just need to make sure | ||
6350 | * there's some valid quota amount | ||
6351 | */ | ||
6352 | cfs_rq->runtime_remaining = cfs_b->quota; | ||
6353 | if (cfs_rq_throttled(cfs_rq)) | ||
6354 | unthrottle_cfs_rq(cfs_rq); | ||
6355 | } | ||
6356 | } | ||
6357 | #else | ||
6358 | static void unthrottle_offline_cfs_rqs(struct rq *rq) {} | ||
6359 | #endif | ||
6360 | |||
6192 | /* | 6361 | /* |
6193 | * Migrate all tasks from the rq, sleeping tasks will be migrated by | 6362 | * Migrate all tasks from the rq, sleeping tasks will be migrated by |
6194 | * try_to_wake_up()->select_task_rq(). | 6363 | * try_to_wake_up()->select_task_rq(). |
@@ -6214,6 +6383,9 @@ static void migrate_tasks(unsigned int dead_cpu) | |||
6214 | */ | 6383 | */ |
6215 | rq->stop = NULL; | 6384 | rq->stop = NULL; |
6216 | 6385 | ||
6386 | /* Ensure any throttled groups are reachable by pick_next_task */ | ||
6387 | unthrottle_offline_cfs_rqs(rq); | ||
6388 | |||
6217 | for ( ; ; ) { | 6389 | for ( ; ; ) { |
6218 | /* | 6390 | /* |
6219 | * There's this thread running, bail when that's the only | 6391 | * There's this thread running, bail when that's the only |
@@ -7957,6 +8129,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
7957 | /* allow initial update_cfs_load() to truncate */ | 8129 | /* allow initial update_cfs_load() to truncate */ |
7958 | cfs_rq->load_stamp = 1; | 8130 | cfs_rq->load_stamp = 1; |
7959 | #endif | 8131 | #endif |
8132 | init_cfs_rq_runtime(cfs_rq); | ||
7960 | 8133 | ||
7961 | tg->cfs_rq[cpu] = cfs_rq; | 8134 | tg->cfs_rq[cpu] = cfs_rq; |
7962 | tg->se[cpu] = se; | 8135 | tg->se[cpu] = se; |
@@ -8096,6 +8269,7 @@ void __init sched_init(void) | |||
8096 | * We achieve this by letting root_task_group's tasks sit | 8269 | * We achieve this by letting root_task_group's tasks sit |
8097 | * directly in rq->cfs (i.e root_task_group->se[] = NULL). | 8270 | * directly in rq->cfs (i.e root_task_group->se[] = NULL). |
8098 | */ | 8271 | */ |
8272 | init_cfs_bandwidth(&root_task_group.cfs_bandwidth); | ||
8099 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); | 8273 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); |
8100 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 8274 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8101 | 8275 | ||
@@ -8125,7 +8299,6 @@ void __init sched_init(void) | |||
8125 | rq_attach_root(rq, &def_root_domain); | 8299 | rq_attach_root(rq, &def_root_domain); |
8126 | #ifdef CONFIG_NO_HZ | 8300 | #ifdef CONFIG_NO_HZ |
8127 | rq->nohz_balance_kick = 0; | 8301 | rq->nohz_balance_kick = 0; |
8128 | init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); | ||
8129 | #endif | 8302 | #endif |
8130 | #endif | 8303 | #endif |
8131 | init_rq_hrtick(rq); | 8304 | init_rq_hrtick(rq); |
@@ -8336,6 +8509,8 @@ static void free_fair_sched_group(struct task_group *tg) | |||
8336 | { | 8509 | { |
8337 | int i; | 8510 | int i; |
8338 | 8511 | ||
8512 | destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
8513 | |||
8339 | for_each_possible_cpu(i) { | 8514 | for_each_possible_cpu(i) { |
8340 | if (tg->cfs_rq) | 8515 | if (tg->cfs_rq) |
8341 | kfree(tg->cfs_rq[i]); | 8516 | kfree(tg->cfs_rq[i]); |
@@ -8363,6 +8538,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8363 | 8538 | ||
8364 | tg->shares = NICE_0_LOAD; | 8539 | tg->shares = NICE_0_LOAD; |
8365 | 8540 | ||
8541 | init_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
8542 | |||
8366 | for_each_possible_cpu(i) { | 8543 | for_each_possible_cpu(i) { |
8367 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | 8544 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), |
8368 | GFP_KERNEL, cpu_to_node(i)); | 8545 | GFP_KERNEL, cpu_to_node(i)); |
@@ -8638,12 +8815,7 @@ unsigned long sched_group_shares(struct task_group *tg) | |||
8638 | } | 8815 | } |
8639 | #endif | 8816 | #endif |
8640 | 8817 | ||
8641 | #ifdef CONFIG_RT_GROUP_SCHED | 8818 | #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) |
8642 | /* | ||
8643 | * Ensure that the real time constraints are schedulable. | ||
8644 | */ | ||
8645 | static DEFINE_MUTEX(rt_constraints_mutex); | ||
8646 | |||
8647 | static unsigned long to_ratio(u64 period, u64 runtime) | 8819 | static unsigned long to_ratio(u64 period, u64 runtime) |
8648 | { | 8820 | { |
8649 | if (runtime == RUNTIME_INF) | 8821 | if (runtime == RUNTIME_INF) |
@@ -8651,6 +8823,13 @@ static unsigned long to_ratio(u64 period, u64 runtime) | |||
8651 | 8823 | ||
8652 | return div64_u64(runtime << 20, period); | 8824 | return div64_u64(runtime << 20, period); |
8653 | } | 8825 | } |
8826 | #endif | ||
8827 | |||
8828 | #ifdef CONFIG_RT_GROUP_SCHED | ||
8829 | /* | ||
8830 | * Ensure that the real time constraints are schedulable. | ||
8831 | */ | ||
8832 | static DEFINE_MUTEX(rt_constraints_mutex); | ||
8654 | 8833 | ||
8655 | /* Must be called with tasklist_lock held */ | 8834 | /* Must be called with tasklist_lock held */ |
8656 | static inline int tg_has_rt_tasks(struct task_group *tg) | 8835 | static inline int tg_has_rt_tasks(struct task_group *tg) |
@@ -8671,7 +8850,7 @@ struct rt_schedulable_data { | |||
8671 | u64 rt_runtime; | 8850 | u64 rt_runtime; |
8672 | }; | 8851 | }; |
8673 | 8852 | ||
8674 | static int tg_schedulable(struct task_group *tg, void *data) | 8853 | static int tg_rt_schedulable(struct task_group *tg, void *data) |
8675 | { | 8854 | { |
8676 | struct rt_schedulable_data *d = data; | 8855 | struct rt_schedulable_data *d = data; |
8677 | struct task_group *child; | 8856 | struct task_group *child; |
@@ -8729,16 +8908,22 @@ static int tg_schedulable(struct task_group *tg, void *data) | |||
8729 | 8908 | ||
8730 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | 8909 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) |
8731 | { | 8910 | { |
8911 | int ret; | ||
8912 | |||
8732 | struct rt_schedulable_data data = { | 8913 | struct rt_schedulable_data data = { |
8733 | .tg = tg, | 8914 | .tg = tg, |
8734 | .rt_period = period, | 8915 | .rt_period = period, |
8735 | .rt_runtime = runtime, | 8916 | .rt_runtime = runtime, |
8736 | }; | 8917 | }; |
8737 | 8918 | ||
8738 | return walk_tg_tree(tg_schedulable, tg_nop, &data); | 8919 | rcu_read_lock(); |
8920 | ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); | ||
8921 | rcu_read_unlock(); | ||
8922 | |||
8923 | return ret; | ||
8739 | } | 8924 | } |
8740 | 8925 | ||
8741 | static int tg_set_bandwidth(struct task_group *tg, | 8926 | static int tg_set_rt_bandwidth(struct task_group *tg, |
8742 | u64 rt_period, u64 rt_runtime) | 8927 | u64 rt_period, u64 rt_runtime) |
8743 | { | 8928 | { |
8744 | int i, err = 0; | 8929 | int i, err = 0; |
@@ -8777,7 +8962,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | |||
8777 | if (rt_runtime_us < 0) | 8962 | if (rt_runtime_us < 0) |
8778 | rt_runtime = RUNTIME_INF; | 8963 | rt_runtime = RUNTIME_INF; |
8779 | 8964 | ||
8780 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | 8965 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); |
8781 | } | 8966 | } |
8782 | 8967 | ||
8783 | long sched_group_rt_runtime(struct task_group *tg) | 8968 | long sched_group_rt_runtime(struct task_group *tg) |
@@ -8802,7 +8987,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) | |||
8802 | if (rt_period == 0) | 8987 | if (rt_period == 0) |
8803 | return -EINVAL; | 8988 | return -EINVAL; |
8804 | 8989 | ||
8805 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | 8990 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); |
8806 | } | 8991 | } |
8807 | 8992 | ||
8808 | long sched_group_rt_period(struct task_group *tg) | 8993 | long sched_group_rt_period(struct task_group *tg) |
@@ -8992,6 +9177,238 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) | |||
8992 | 9177 | ||
8993 | return (u64) scale_load_down(tg->shares); | 9178 | return (u64) scale_load_down(tg->shares); |
8994 | } | 9179 | } |
9180 | |||
9181 | #ifdef CONFIG_CFS_BANDWIDTH | ||
9182 | static DEFINE_MUTEX(cfs_constraints_mutex); | ||
9183 | |||
9184 | const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ | ||
9185 | const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ | ||
9186 | |||
9187 | static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); | ||
9188 | |||
9189 | static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | ||
9190 | { | ||
9191 | int i, ret = 0, runtime_enabled; | ||
9192 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | ||
9193 | |||
9194 | if (tg == &root_task_group) | ||
9195 | return -EINVAL; | ||
9196 | |||
9197 | /* | ||
9198 | * Ensure we have at least some amount of bandwidth every period. This is | ||
9199 | * to prevent reaching a state of large arrears when throttled via | ||
9200 | * entity_tick() resulting in prolonged exit starvation. | ||
9201 | */ | ||
9202 | if (quota < min_cfs_quota_period || period < min_cfs_quota_period) | ||
9203 | return -EINVAL; | ||
9204 | |||
9205 | /* | ||
9206 | * Likewise, bound things on the other side by preventing insane quota | ||
9207 | * periods. This also allows us to normalize in computing quota | ||
9208 | * feasibility. | ||
9209 | */ | ||
9210 | if (period > max_cfs_quota_period) | ||
9211 | return -EINVAL; | ||
9212 | |||
9213 | mutex_lock(&cfs_constraints_mutex); | ||
9214 | ret = __cfs_schedulable(tg, period, quota); | ||
9215 | if (ret) | ||
9216 | goto out_unlock; | ||
9217 | |||
9218 | runtime_enabled = quota != RUNTIME_INF; | ||
9219 | raw_spin_lock_irq(&cfs_b->lock); | ||
9220 | cfs_b->period = ns_to_ktime(period); | ||
9221 | cfs_b->quota = quota; | ||
9222 | |||
9223 | __refill_cfs_bandwidth_runtime(cfs_b); | ||
9224 | /* restart the period timer (if active) to handle new period expiry */ | ||
9225 | if (runtime_enabled && cfs_b->timer_active) { | ||
9226 | /* force a reprogram */ | ||
9227 | cfs_b->timer_active = 0; | ||
9228 | __start_cfs_bandwidth(cfs_b); | ||
9229 | } | ||
9230 | raw_spin_unlock_irq(&cfs_b->lock); | ||
9231 | |||
9232 | for_each_possible_cpu(i) { | ||
9233 | struct cfs_rq *cfs_rq = tg->cfs_rq[i]; | ||
9234 | struct rq *rq = rq_of(cfs_rq); | ||
9235 | |||
9236 | raw_spin_lock_irq(&rq->lock); | ||
9237 | cfs_rq->runtime_enabled = runtime_enabled; | ||
9238 | cfs_rq->runtime_remaining = 0; | ||
9239 | |||
9240 | if (cfs_rq_throttled(cfs_rq)) | ||
9241 | unthrottle_cfs_rq(cfs_rq); | ||
9242 | raw_spin_unlock_irq(&rq->lock); | ||
9243 | } | ||
9244 | out_unlock: | ||
9245 | mutex_unlock(&cfs_constraints_mutex); | ||
9246 | |||
9247 | return ret; | ||
9248 | } | ||
9249 | |||
9250 | int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) | ||
9251 | { | ||
9252 | u64 quota, period; | ||
9253 | |||
9254 | period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); | ||
9255 | if (cfs_quota_us < 0) | ||
9256 | quota = RUNTIME_INF; | ||
9257 | else | ||
9258 | quota = (u64)cfs_quota_us * NSEC_PER_USEC; | ||
9259 | |||
9260 | return tg_set_cfs_bandwidth(tg, period, quota); | ||
9261 | } | ||
9262 | |||
9263 | long tg_get_cfs_quota(struct task_group *tg) | ||
9264 | { | ||
9265 | u64 quota_us; | ||
9266 | |||
9267 | if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) | ||
9268 | return -1; | ||
9269 | |||
9270 | quota_us = tg_cfs_bandwidth(tg)->quota; | ||
9271 | do_div(quota_us, NSEC_PER_USEC); | ||
9272 | |||
9273 | return quota_us; | ||
9274 | } | ||
9275 | |||
9276 | int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) | ||
9277 | { | ||
9278 | u64 quota, period; | ||
9279 | |||
9280 | period = (u64)cfs_period_us * NSEC_PER_USEC; | ||
9281 | quota = tg_cfs_bandwidth(tg)->quota; | ||
9282 | |||
9283 | if (period <= 0) | ||
9284 | return -EINVAL; | ||
9285 | |||
9286 | return tg_set_cfs_bandwidth(tg, period, quota); | ||
9287 | } | ||
9288 | |||
9289 | long tg_get_cfs_period(struct task_group *tg) | ||
9290 | { | ||
9291 | u64 cfs_period_us; | ||
9292 | |||
9293 | cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); | ||
9294 | do_div(cfs_period_us, NSEC_PER_USEC); | ||
9295 | |||
9296 | return cfs_period_us; | ||
9297 | } | ||
9298 | |||
9299 | static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) | ||
9300 | { | ||
9301 | return tg_get_cfs_quota(cgroup_tg(cgrp)); | ||
9302 | } | ||
9303 | |||
9304 | static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, | ||
9305 | s64 cfs_quota_us) | ||
9306 | { | ||
9307 | return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); | ||
9308 | } | ||
9309 | |||
9310 | static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) | ||
9311 | { | ||
9312 | return tg_get_cfs_period(cgroup_tg(cgrp)); | ||
9313 | } | ||
9314 | |||
9315 | static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, | ||
9316 | u64 cfs_period_us) | ||
9317 | { | ||
9318 | return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); | ||
9319 | } | ||
9320 | |||
9321 | struct cfs_schedulable_data { | ||
9322 | struct task_group *tg; | ||
9323 | u64 period, quota; | ||
9324 | }; | ||
9325 | |||
9326 | /* | ||
9327 | * normalize group quota/period to be quota/max_period | ||
9328 | * note: units are usecs | ||
9329 | */ | ||
9330 | static u64 normalize_cfs_quota(struct task_group *tg, | ||
9331 | struct cfs_schedulable_data *d) | ||
9332 | { | ||
9333 | u64 quota, period; | ||
9334 | |||
9335 | if (tg == d->tg) { | ||
9336 | period = d->period; | ||
9337 | quota = d->quota; | ||
9338 | } else { | ||
9339 | period = tg_get_cfs_period(tg); | ||
9340 | quota = tg_get_cfs_quota(tg); | ||
9341 | } | ||
9342 | |||
9343 | /* note: these should typically be equivalent */ | ||
9344 | if (quota == RUNTIME_INF || quota == -1) | ||
9345 | return RUNTIME_INF; | ||
9346 | |||
9347 | return to_ratio(period, quota); | ||
9348 | } | ||
9349 | |||
9350 | static int tg_cfs_schedulable_down(struct task_group *tg, void *data) | ||
9351 | { | ||
9352 | struct cfs_schedulable_data *d = data; | ||
9353 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | ||
9354 | s64 quota = 0, parent_quota = -1; | ||
9355 | |||
9356 | if (!tg->parent) { | ||
9357 | quota = RUNTIME_INF; | ||
9358 | } else { | ||
9359 | struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); | ||
9360 | |||
9361 | quota = normalize_cfs_quota(tg, d); | ||
9362 | parent_quota = parent_b->hierarchal_quota; | ||
9363 | |||
9364 | /* | ||
9365 | * ensure max(child_quota) <= parent_quota, inherit when no | ||
9366 | * limit is set | ||
9367 | */ | ||
9368 | if (quota == RUNTIME_INF) | ||
9369 | quota = parent_quota; | ||
9370 | else if (parent_quota != RUNTIME_INF && quota > parent_quota) | ||
9371 | return -EINVAL; | ||
9372 | } | ||
9373 | cfs_b->hierarchal_quota = quota; | ||
9374 | |||
9375 | return 0; | ||
9376 | } | ||
9377 | |||
9378 | static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) | ||
9379 | { | ||
9380 | int ret; | ||
9381 | struct cfs_schedulable_data data = { | ||
9382 | .tg = tg, | ||
9383 | .period = period, | ||
9384 | .quota = quota, | ||
9385 | }; | ||
9386 | |||
9387 | if (quota != RUNTIME_INF) { | ||
9388 | do_div(data.period, NSEC_PER_USEC); | ||
9389 | do_div(data.quota, NSEC_PER_USEC); | ||
9390 | } | ||
9391 | |||
9392 | rcu_read_lock(); | ||
9393 | ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); | ||
9394 | rcu_read_unlock(); | ||
9395 | |||
9396 | return ret; | ||
9397 | } | ||
9398 | |||
9399 | static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, | ||
9400 | struct cgroup_map_cb *cb) | ||
9401 | { | ||
9402 | struct task_group *tg = cgroup_tg(cgrp); | ||
9403 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | ||
9404 | |||
9405 | cb->fill(cb, "nr_periods", cfs_b->nr_periods); | ||
9406 | cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); | ||
9407 | cb->fill(cb, "throttled_time", cfs_b->throttled_time); | ||
9408 | |||
9409 | return 0; | ||
9410 | } | ||
9411 | #endif /* CONFIG_CFS_BANDWIDTH */ | ||
8995 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 9412 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8996 | 9413 | ||
8997 | #ifdef CONFIG_RT_GROUP_SCHED | 9414 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -9026,6 +9443,22 @@ static struct cftype cpu_files[] = { | |||
9026 | .write_u64 = cpu_shares_write_u64, | 9443 | .write_u64 = cpu_shares_write_u64, |
9027 | }, | 9444 | }, |
9028 | #endif | 9445 | #endif |
9446 | #ifdef CONFIG_CFS_BANDWIDTH | ||
9447 | { | ||
9448 | .name = "cfs_quota_us", | ||
9449 | .read_s64 = cpu_cfs_quota_read_s64, | ||
9450 | .write_s64 = cpu_cfs_quota_write_s64, | ||
9451 | }, | ||
9452 | { | ||
9453 | .name = "cfs_period_us", | ||
9454 | .read_u64 = cpu_cfs_period_read_u64, | ||
9455 | .write_u64 = cpu_cfs_period_write_u64, | ||
9456 | }, | ||
9457 | { | ||
9458 | .name = "stat", | ||
9459 | .read_map = cpu_stats_show, | ||
9460 | }, | ||
9461 | #endif | ||
9029 | #ifdef CONFIG_RT_GROUP_SCHED | 9462 | #ifdef CONFIG_RT_GROUP_SCHED |
9030 | { | 9463 | { |
9031 | .name = "rt_runtime_us", | 9464 | .name = "rt_runtime_us", |
@@ -9335,4 +9768,3 @@ struct cgroup_subsys cpuacct_subsys = { | |||
9335 | .subsys_id = cpuacct_subsys_id, | 9768 | .subsys_id = cpuacct_subsys_id, |
9336 | }; | 9769 | }; |
9337 | #endif /* CONFIG_CGROUP_CPUACCT */ | 9770 | #endif /* CONFIG_CGROUP_CPUACCT */ |
9338 | |||
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 2722dc1b4138..a86cf9d9eb11 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c | |||
@@ -47,9 +47,6 @@ static int convert_prio(int prio) | |||
47 | return cpupri; | 47 | return cpupri; |
48 | } | 48 | } |
49 | 49 | ||
50 | #define for_each_cpupri_active(array, idx) \ | ||
51 | for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES) | ||
52 | |||
53 | /** | 50 | /** |
54 | * cpupri_find - find the best (lowest-pri) CPU in the system | 51 | * cpupri_find - find the best (lowest-pri) CPU in the system |
55 | * @cp: The cpupri context | 52 | * @cp: The cpupri context |
@@ -71,11 +68,38 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, | |||
71 | int idx = 0; | 68 | int idx = 0; |
72 | int task_pri = convert_prio(p->prio); | 69 | int task_pri = convert_prio(p->prio); |
73 | 70 | ||
74 | for_each_cpupri_active(cp->pri_active, idx) { | 71 | if (task_pri >= MAX_RT_PRIO) |
75 | struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; | 72 | return 0; |
76 | 73 | ||
77 | if (idx >= task_pri) | 74 | for (idx = 0; idx < task_pri; idx++) { |
78 | break; | 75 | struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; |
76 | int skip = 0; | ||
77 | |||
78 | if (!atomic_read(&(vec)->count)) | ||
79 | skip = 1; | ||
80 | /* | ||
81 | * When looking at the vector, we need to read the counter, | ||
82 | * do a memory barrier, then read the mask. | ||
83 | * | ||
84 | * Note: This is still all racy, but we can deal with it. | ||
85 | * Ideally, we only want to look at masks that are set. | ||
86 | * | ||
87 | * If a mask is not set, then the only thing wrong is that we | ||
88 | * did a little more work than necessary. | ||
89 | * | ||
90 | * If we read a zero count but the mask is set, because of the | ||
91 | * memory barriers, that can only happen when the highest prio | ||
92 | * task for a run queue has left the run queue, in which case, | ||
93 | * it will be followed by a pull. If the task we are processing | ||
94 | * fails to find a proper place to go, that pull request will | ||
95 | * pull this task if the run queue is running at a lower | ||
96 | * priority. | ||
97 | */ | ||
98 | smp_rmb(); | ||
99 | |||
100 | /* Need to do the rmb for every iteration */ | ||
101 | if (skip) | ||
102 | continue; | ||
79 | 103 | ||
80 | if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) | 104 | if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) |
81 | continue; | 105 | continue; |
@@ -115,7 +139,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
115 | { | 139 | { |
116 | int *currpri = &cp->cpu_to_pri[cpu]; | 140 | int *currpri = &cp->cpu_to_pri[cpu]; |
117 | int oldpri = *currpri; | 141 | int oldpri = *currpri; |
118 | unsigned long flags; | 142 | int do_mb = 0; |
119 | 143 | ||
120 | newpri = convert_prio(newpri); | 144 | newpri = convert_prio(newpri); |
121 | 145 | ||
@@ -128,32 +152,46 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
128 | * If the cpu was currently mapped to a different value, we | 152 | * If the cpu was currently mapped to a different value, we |
129 | * need to map it to the new value then remove the old value. | 153 | * need to map it to the new value then remove the old value. |
130 | * Note, we must add the new value first, otherwise we risk the | 154 | * Note, we must add the new value first, otherwise we risk the |
131 | * cpu being cleared from pri_active, and this cpu could be | 155 | * cpu being missed by the priority loop in cpupri_find. |
132 | * missed for a push or pull. | ||
133 | */ | 156 | */ |
134 | if (likely(newpri != CPUPRI_INVALID)) { | 157 | if (likely(newpri != CPUPRI_INVALID)) { |
135 | struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; | 158 | struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; |
136 | 159 | ||
137 | raw_spin_lock_irqsave(&vec->lock, flags); | ||
138 | |||
139 | cpumask_set_cpu(cpu, vec->mask); | 160 | cpumask_set_cpu(cpu, vec->mask); |
140 | vec->count++; | 161 | /* |
141 | if (vec->count == 1) | 162 | * When adding a new vector, we update the mask first, |
142 | set_bit(newpri, cp->pri_active); | 163 | * do a write memory barrier, and then update the count, to |
143 | 164 | * make sure the vector is visible when count is set. | |
144 | raw_spin_unlock_irqrestore(&vec->lock, flags); | 165 | */ |
166 | smp_mb__before_atomic_inc(); | ||
167 | atomic_inc(&(vec)->count); | ||
168 | do_mb = 1; | ||
145 | } | 169 | } |
146 | if (likely(oldpri != CPUPRI_INVALID)) { | 170 | if (likely(oldpri != CPUPRI_INVALID)) { |
147 | struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; | 171 | struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; |
148 | 172 | ||
149 | raw_spin_lock_irqsave(&vec->lock, flags); | 173 | /* |
150 | 174 | * Because the order of modification of the vec->count | |
151 | vec->count--; | 175 | * is important, we must make sure that the update |
152 | if (!vec->count) | 176 | * of the new prio is seen before we decrement the |
153 | clear_bit(oldpri, cp->pri_active); | 177 | * old prio. This makes sure that the loop sees |
178 | * one or the other when we raise the priority of | ||
179 | * the run queue. We don't care about when we lower the | ||
180 | * priority, as that will trigger an rt pull anyway. | ||
181 | * | ||
182 | * We only need to do a memory barrier if we updated | ||
183 | * the new priority vec. | ||
184 | */ | ||
185 | if (do_mb) | ||
186 | smp_mb__after_atomic_inc(); | ||
187 | |||
188 | /* | ||
189 | * When removing from the vector, we decrement the counter first, | ||
190 | * do a memory barrier and then clear the mask. | ||
191 | */ | ||
192 | atomic_dec(&(vec)->count); | ||
193 | smp_mb__after_atomic_inc(); | ||
154 | cpumask_clear_cpu(cpu, vec->mask); | 194 | cpumask_clear_cpu(cpu, vec->mask); |
155 | |||
156 | raw_spin_unlock_irqrestore(&vec->lock, flags); | ||
157 | } | 195 | } |
158 | 196 | ||
159 | *currpri = newpri; | 197 | *currpri = newpri; |
@@ -175,8 +213,7 @@ int cpupri_init(struct cpupri *cp) | |||
175 | for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { | 213 | for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { |
176 | struct cpupri_vec *vec = &cp->pri_to_cpu[i]; | 214 | struct cpupri_vec *vec = &cp->pri_to_cpu[i]; |
177 | 215 | ||
178 | raw_spin_lock_init(&vec->lock); | 216 | atomic_set(&vec->count, 0); |
179 | vec->count = 0; | ||
180 | if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) | 217 | if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) |
181 | goto cleanup; | 218 | goto cleanup; |
182 | } | 219 | } |
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index 9fc7d386fea4..f6d756173491 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h | |||
@@ -4,7 +4,6 @@ | |||
4 | #include <linux/sched.h> | 4 | #include <linux/sched.h> |
5 | 5 | ||
6 | #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) | 6 | #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) |
7 | #define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES) | ||
8 | 7 | ||
9 | #define CPUPRI_INVALID -1 | 8 | #define CPUPRI_INVALID -1 |
10 | #define CPUPRI_IDLE 0 | 9 | #define CPUPRI_IDLE 0 |
@@ -12,14 +11,12 @@ | |||
12 | /* values 2-101 are RT priorities 0-99 */ | 11 | /* values 2-101 are RT priorities 0-99 */ |
13 | 12 | ||
14 | struct cpupri_vec { | 13 | struct cpupri_vec { |
15 | raw_spinlock_t lock; | 14 | atomic_t count; |
16 | int count; | 15 | cpumask_var_t mask; |
17 | cpumask_var_t mask; | ||
18 | }; | 16 | }; |
19 | 17 | ||
20 | struct cpupri { | 18 | struct cpupri { |
21 | struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; | 19 | struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; |
22 | long pri_active[CPUPRI_NR_PRI_WORDS]; | ||
23 | int cpu_to_pri[NR_CPUS]; | 20 | int cpu_to_pri[NR_CPUS]; |
24 | }; | 21 | }; |
25 | 22 | ||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index bc8ee9993814..5c9e67923b7c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -89,6 +89,20 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | |||
89 | */ | 89 | */ |
90 | unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; | 90 | unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; |
91 | 91 | ||
92 | #ifdef CONFIG_CFS_BANDWIDTH | ||
93 | /* | ||
94 | * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool | ||
95 | * each time a cfs_rq requests quota. | ||
96 | * | ||
97 | * Note: in the case that the slice exceeds the runtime remaining (either due | ||
98 | * to consumption or the quota being specified to be smaller than the slice) | ||
99 | * we will always only issue the remaining available time. | ||
100 | * | ||
101 | * default: 5 msec, units: microseconds | ||
102 | */ | ||
103 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; | ||
104 | #endif | ||
105 | |||
92 | static const struct sched_class fair_sched_class; | 106 | static const struct sched_class fair_sched_class; |
93 | 107 | ||
94 | /************************************************************** | 108 | /************************************************************** |
@@ -292,6 +306,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) | |||
292 | 306 | ||
293 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 307 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
294 | 308 | ||
309 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | ||
310 | unsigned long delta_exec); | ||
295 | 311 | ||
296 | /************************************************************** | 312 | /************************************************************** |
297 | * Scheduling class tree data structure manipulation methods: | 313 | * Scheduling class tree data structure manipulation methods: |
@@ -583,6 +599,8 @@ static void update_curr(struct cfs_rq *cfs_rq) | |||
583 | cpuacct_charge(curtask, delta_exec); | 599 | cpuacct_charge(curtask, delta_exec); |
584 | account_group_exec_runtime(curtask, delta_exec); | 600 | account_group_exec_runtime(curtask, delta_exec); |
585 | } | 601 | } |
602 | |||
603 | account_cfs_rq_runtime(cfs_rq, delta_exec); | ||
586 | } | 604 | } |
587 | 605 | ||
588 | static inline void | 606 | static inline void |
@@ -688,6 +706,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
688 | } | 706 | } |
689 | 707 | ||
690 | #ifdef CONFIG_FAIR_GROUP_SCHED | 708 | #ifdef CONFIG_FAIR_GROUP_SCHED |
709 | /* we need this in update_cfs_load and load-balance functions below */ | ||
710 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); | ||
691 | # ifdef CONFIG_SMP | 711 | # ifdef CONFIG_SMP |
692 | static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, | 712 | static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, |
693 | int global_update) | 713 | int global_update) |
@@ -710,7 +730,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | |||
710 | u64 now, delta; | 730 | u64 now, delta; |
711 | unsigned long load = cfs_rq->load.weight; | 731 | unsigned long load = cfs_rq->load.weight; |
712 | 732 | ||
713 | if (cfs_rq->tg == &root_task_group) | 733 | if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq)) |
714 | return; | 734 | return; |
715 | 735 | ||
716 | now = rq_of(cfs_rq)->clock_task; | 736 | now = rq_of(cfs_rq)->clock_task; |
@@ -819,7 +839,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) | |||
819 | 839 | ||
820 | tg = cfs_rq->tg; | 840 | tg = cfs_rq->tg; |
821 | se = tg->se[cpu_of(rq_of(cfs_rq))]; | 841 | se = tg->se[cpu_of(rq_of(cfs_rq))]; |
822 | if (!se) | 842 | if (!se || throttled_hierarchy(cfs_rq)) |
823 | return; | 843 | return; |
824 | #ifndef CONFIG_SMP | 844 | #ifndef CONFIG_SMP |
825 | if (likely(se->load.weight == tg->shares)) | 845 | if (likely(se->load.weight == tg->shares)) |
@@ -950,6 +970,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
950 | se->vruntime = vruntime; | 970 | se->vruntime = vruntime; |
951 | } | 971 | } |
952 | 972 | ||
973 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq); | ||
974 | |||
953 | static void | 975 | static void |
954 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | 976 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
955 | { | 977 | { |
@@ -979,8 +1001,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
979 | __enqueue_entity(cfs_rq, se); | 1001 | __enqueue_entity(cfs_rq, se); |
980 | se->on_rq = 1; | 1002 | se->on_rq = 1; |
981 | 1003 | ||
982 | if (cfs_rq->nr_running == 1) | 1004 | if (cfs_rq->nr_running == 1) { |
983 | list_add_leaf_cfs_rq(cfs_rq); | 1005 | list_add_leaf_cfs_rq(cfs_rq); |
1006 | check_enqueue_throttle(cfs_rq); | ||
1007 | } | ||
984 | } | 1008 | } |
985 | 1009 | ||
986 | static void __clear_buddies_last(struct sched_entity *se) | 1010 | static void __clear_buddies_last(struct sched_entity *se) |
@@ -1028,6 +1052,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
1028 | __clear_buddies_skip(se); | 1052 | __clear_buddies_skip(se); |
1029 | } | 1053 | } |
1030 | 1054 | ||
1055 | static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); | ||
1056 | |||
1031 | static void | 1057 | static void |
1032 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | 1058 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
1033 | { | 1059 | { |
@@ -1066,6 +1092,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
1066 | if (!(flags & DEQUEUE_SLEEP)) | 1092 | if (!(flags & DEQUEUE_SLEEP)) |
1067 | se->vruntime -= cfs_rq->min_vruntime; | 1093 | se->vruntime -= cfs_rq->min_vruntime; |
1068 | 1094 | ||
1095 | /* return excess runtime on last dequeue */ | ||
1096 | return_cfs_rq_runtime(cfs_rq); | ||
1097 | |||
1069 | update_min_vruntime(cfs_rq); | 1098 | update_min_vruntime(cfs_rq); |
1070 | update_cfs_shares(cfs_rq); | 1099 | update_cfs_shares(cfs_rq); |
1071 | } | 1100 | } |
@@ -1077,6 +1106,8 @@ static void | |||
1077 | check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | 1106 | check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) |
1078 | { | 1107 | { |
1079 | unsigned long ideal_runtime, delta_exec; | 1108 | unsigned long ideal_runtime, delta_exec; |
1109 | struct sched_entity *se; | ||
1110 | s64 delta; | ||
1080 | 1111 | ||
1081 | ideal_runtime = sched_slice(cfs_rq, curr); | 1112 | ideal_runtime = sched_slice(cfs_rq, curr); |
1082 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; | 1113 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; |
@@ -1095,22 +1126,17 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | |||
1095 | * narrow margin doesn't have to wait for a full slice. | 1126 | * narrow margin doesn't have to wait for a full slice. |
1096 | * This also mitigates buddy induced latencies under load. | 1127 | * This also mitigates buddy induced latencies under load. |
1097 | */ | 1128 | */ |
1098 | if (!sched_feat(WAKEUP_PREEMPT)) | ||
1099 | return; | ||
1100 | |||
1101 | if (delta_exec < sysctl_sched_min_granularity) | 1129 | if (delta_exec < sysctl_sched_min_granularity) |
1102 | return; | 1130 | return; |
1103 | 1131 | ||
1104 | if (cfs_rq->nr_running > 1) { | 1132 | se = __pick_first_entity(cfs_rq); |
1105 | struct sched_entity *se = __pick_first_entity(cfs_rq); | 1133 | delta = curr->vruntime - se->vruntime; |
1106 | s64 delta = curr->vruntime - se->vruntime; | ||
1107 | 1134 | ||
1108 | if (delta < 0) | 1135 | if (delta < 0) |
1109 | return; | 1136 | return; |
1110 | 1137 | ||
1111 | if (delta > ideal_runtime) | 1138 | if (delta > ideal_runtime) |
1112 | resched_task(rq_of(cfs_rq)->curr); | 1139 | resched_task(rq_of(cfs_rq)->curr); |
1113 | } | ||
1114 | } | 1140 | } |
1115 | 1141 | ||
1116 | static void | 1142 | static void |
@@ -1185,6 +1211,8 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | |||
1185 | return se; | 1211 | return se; |
1186 | } | 1212 | } |
1187 | 1213 | ||
1214 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); | ||
1215 | |||
1188 | static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | 1216 | static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) |
1189 | { | 1217 | { |
1190 | /* | 1218 | /* |
@@ -1194,6 +1222,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
1194 | if (prev->on_rq) | 1222 | if (prev->on_rq) |
1195 | update_curr(cfs_rq); | 1223 | update_curr(cfs_rq); |
1196 | 1224 | ||
1225 | /* throttle cfs_rqs exceeding runtime */ | ||
1226 | check_cfs_rq_runtime(cfs_rq); | ||
1227 | |||
1197 | check_spread(cfs_rq, prev); | 1228 | check_spread(cfs_rq, prev); |
1198 | if (prev->on_rq) { | 1229 | if (prev->on_rq) { |
1199 | update_stats_wait_start(cfs_rq, prev); | 1230 | update_stats_wait_start(cfs_rq, prev); |
@@ -1233,10 +1264,583 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
1233 | return; | 1264 | return; |
1234 | #endif | 1265 | #endif |
1235 | 1266 | ||
1236 | if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) | 1267 | if (cfs_rq->nr_running > 1) |
1237 | check_preempt_tick(cfs_rq, curr); | 1268 | check_preempt_tick(cfs_rq, curr); |
1238 | } | 1269 | } |
1239 | 1270 | ||
1271 | |||
1272 | /************************************************** | ||
1273 | * CFS bandwidth control machinery | ||
1274 | */ | ||
1275 | |||
1276 | #ifdef CONFIG_CFS_BANDWIDTH | ||
1277 | /* | ||
1278 | * default period for cfs group bandwidth. | ||
1279 | * default: 0.1s, units: nanoseconds | ||
1280 | */ | ||
1281 | static inline u64 default_cfs_period(void) | ||
1282 | { | ||
1283 | return 100000000ULL; | ||
1284 | } | ||
1285 | |||
1286 | static inline u64 sched_cfs_bandwidth_slice(void) | ||
1287 | { | ||
1288 | return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC; | ||
1289 | } | ||
1290 | |||
1291 | /* | ||
1292 | * Replenish runtime according to assigned quota and update expiration time. | ||
1293 | * We use sched_clock_cpu directly instead of rq->clock to avoid adding | ||
1294 | * additional synchronization around rq->lock. | ||
1295 | * | ||
1296 | * requires cfs_b->lock | ||
1297 | */ | ||
1298 | static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) | ||
1299 | { | ||
1300 | u64 now; | ||
1301 | |||
1302 | if (cfs_b->quota == RUNTIME_INF) | ||
1303 | return; | ||
1304 | |||
1305 | now = sched_clock_cpu(smp_processor_id()); | ||
1306 | cfs_b->runtime = cfs_b->quota; | ||
1307 | cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); | ||
1308 | } | ||
1309 | |||
1310 | /* returns 0 on failure to allocate runtime */ | ||
1311 | static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
1312 | { | ||
1313 | struct task_group *tg = cfs_rq->tg; | ||
1314 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | ||
1315 | u64 amount = 0, min_amount, expires; | ||
1316 | |||
1317 | /* note: this is a positive sum as runtime_remaining <= 0 */ | ||
1318 | min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; | ||
1319 | |||
1320 | raw_spin_lock(&cfs_b->lock); | ||
1321 | if (cfs_b->quota == RUNTIME_INF) | ||
1322 | amount = min_amount; | ||
1323 | else { | ||
1324 | /* | ||
1325 | * If the bandwidth pool has become inactive, then at least one | ||
1326 | * period must have elapsed since the last consumption. | ||
1327 | * Refresh the global state and ensure bandwidth timer becomes | ||
1328 | * active. | ||
1329 | */ | ||
1330 | if (!cfs_b->timer_active) { | ||
1331 | __refill_cfs_bandwidth_runtime(cfs_b); | ||
1332 | __start_cfs_bandwidth(cfs_b); | ||
1333 | } | ||
1334 | |||
1335 | if (cfs_b->runtime > 0) { | ||
1336 | amount = min(cfs_b->runtime, min_amount); | ||
1337 | cfs_b->runtime -= amount; | ||
1338 | cfs_b->idle = 0; | ||
1339 | } | ||
1340 | } | ||
1341 | expires = cfs_b->runtime_expires; | ||
1342 | raw_spin_unlock(&cfs_b->lock); | ||
1343 | |||
1344 | cfs_rq->runtime_remaining += amount; | ||
1345 | /* | ||
1346 | * we may have advanced our local expiration to account for allowed | ||
1347 | * spread between our sched_clock and the one on which runtime was | ||
1348 | * issued. | ||
1349 | */ | ||
1350 | if ((s64)(expires - cfs_rq->runtime_expires) > 0) | ||
1351 | cfs_rq->runtime_expires = expires; | ||
1352 | |||
1353 | return cfs_rq->runtime_remaining > 0; | ||
1354 | } | ||
1355 | |||
1356 | /* | ||
1357 | * Note: This depends on the synchronization provided by sched_clock and the | ||
1358 | * fact that rq->clock snapshots this value. | ||
1359 | */ | ||
1360 | static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
1361 | { | ||
1362 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
1363 | struct rq *rq = rq_of(cfs_rq); | ||
1364 | |||
1365 | /* if the deadline is ahead of our clock, nothing to do */ | ||
1366 | if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0)) | ||
1367 | return; | ||
1368 | |||
1369 | if (cfs_rq->runtime_remaining < 0) | ||
1370 | return; | ||
1371 | |||
1372 | /* | ||
1373 | * If the local deadline has passed we have to consider the | ||
1374 | * possibility that our sched_clock is 'fast' and the global deadline | ||
1375 | * has not truly expired. | ||
1376 | * | ||
1377 | * Fortunately we can determine whether this is the case by checking | ||
1378 | * whether the global deadline has advanced. | ||
1379 | */ | ||
1380 | |||
1381 | if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) { | ||
1382 | /* extend local deadline, drift is bounded above by 2 ticks */ | ||
1383 | cfs_rq->runtime_expires += TICK_NSEC; | ||
1384 | } else { | ||
1385 | /* global deadline is ahead, expiration has passed */ | ||
1386 | cfs_rq->runtime_remaining = 0; | ||
1387 | } | ||
1388 | } | ||
1389 | |||
1390 | static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | ||
1391 | unsigned long delta_exec) | ||
1392 | { | ||
1393 | /* dock delta_exec before expiring quota (as it could span periods) */ | ||
1394 | cfs_rq->runtime_remaining -= delta_exec; | ||
1395 | expire_cfs_rq_runtime(cfs_rq); | ||
1396 | |||
1397 | if (likely(cfs_rq->runtime_remaining > 0)) | ||
1398 | return; | ||
1399 | |||
1400 | /* | ||
1401 | * if we're unable to extend our runtime we resched so that the active | ||
1402 | * hierarchy can be throttled | ||
1403 | */ | ||
1404 | if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) | ||
1405 | resched_task(rq_of(cfs_rq)->curr); | ||
1406 | } | ||
1407 | |||
1408 | static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | ||
1409 | unsigned long delta_exec) | ||
1410 | { | ||
1411 | if (!cfs_rq->runtime_enabled) | ||
1412 | return; | ||
1413 | |||
1414 | __account_cfs_rq_runtime(cfs_rq, delta_exec); | ||
1415 | } | ||
1416 | |||
1417 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) | ||
1418 | { | ||
1419 | return cfs_rq->throttled; | ||
1420 | } | ||
1421 | |||
1422 | /* check whether cfs_rq, or any parent, is throttled */ | ||
1423 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) | ||
1424 | { | ||
1425 | return cfs_rq->throttle_count; | ||
1426 | } | ||
1427 | |||
1428 | /* | ||
1429 | * Ensure that neither of the group entities corresponding to src_cpu or | ||
1430 | * dest_cpu is a member of a throttled hierarchy when performing group | ||
1431 | * load-balance operations. | ||
1432 | */ | ||
1433 | static inline int throttled_lb_pair(struct task_group *tg, | ||
1434 | int src_cpu, int dest_cpu) | ||
1435 | { | ||
1436 | struct cfs_rq *src_cfs_rq, *dest_cfs_rq; | ||
1437 | |||
1438 | src_cfs_rq = tg->cfs_rq[src_cpu]; | ||
1439 | dest_cfs_rq = tg->cfs_rq[dest_cpu]; | ||
1440 | |||
1441 | return throttled_hierarchy(src_cfs_rq) || | ||
1442 | throttled_hierarchy(dest_cfs_rq); | ||
1443 | } | ||
1444 | |||
1445 | /* updated child weight may affect parent so we have to do this bottom up */ | ||
1446 | static int tg_unthrottle_up(struct task_group *tg, void *data) | ||
1447 | { | ||
1448 | struct rq *rq = data; | ||
1449 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; | ||
1450 | |||
1451 | cfs_rq->throttle_count--; | ||
1452 | #ifdef CONFIG_SMP | ||
1453 | if (!cfs_rq->throttle_count) { | ||
1454 | u64 delta = rq->clock_task - cfs_rq->load_stamp; | ||
1455 | |||
1456 | /* leaving throttled state, advance shares averaging windows */ | ||
1457 | cfs_rq->load_stamp += delta; | ||
1458 | cfs_rq->load_last += delta; | ||
1459 | |||
1460 | /* update entity weight now that we are on_rq again */ | ||
1461 | update_cfs_shares(cfs_rq); | ||
1462 | } | ||
1463 | #endif | ||
1464 | |||
1465 | return 0; | ||
1466 | } | ||
1467 | |||
1468 | static int tg_throttle_down(struct task_group *tg, void *data) | ||
1469 | { | ||
1470 | struct rq *rq = data; | ||
1471 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; | ||
1472 | |||
1473 | /* group is entering throttled state, record last load */ | ||
1474 | if (!cfs_rq->throttle_count) | ||
1475 | update_cfs_load(cfs_rq, 0); | ||
1476 | cfs_rq->throttle_count++; | ||
1477 | |||
1478 | return 0; | ||
1479 | } | ||
1480 | |||
1481 | static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | ||
1482 | { | ||
1483 | struct rq *rq = rq_of(cfs_rq); | ||
1484 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
1485 | struct sched_entity *se; | ||
1486 | long task_delta, dequeue = 1; | ||
1487 | |||
1488 | se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; | ||
1489 | |||
1490 | /* account load preceding throttle */ | ||
1491 | rcu_read_lock(); | ||
1492 | walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); | ||
1493 | rcu_read_unlock(); | ||
1494 | |||
1495 | task_delta = cfs_rq->h_nr_running; | ||
1496 | for_each_sched_entity(se) { | ||
1497 | struct cfs_rq *qcfs_rq = cfs_rq_of(se); | ||
1498 | /* throttled entity or throttle-on-deactivate */ | ||
1499 | if (!se->on_rq) | ||
1500 | break; | ||
1501 | |||
1502 | if (dequeue) | ||
1503 | dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); | ||
1504 | qcfs_rq->h_nr_running -= task_delta; | ||
1505 | |||
1506 | if (qcfs_rq->load.weight) | ||
1507 | dequeue = 0; | ||
1508 | } | ||
1509 | |||
1510 | if (!se) | ||
1511 | rq->nr_running -= task_delta; | ||
1512 | |||
1513 | cfs_rq->throttled = 1; | ||
1514 | cfs_rq->throttled_timestamp = rq->clock; | ||
1515 | raw_spin_lock(&cfs_b->lock); | ||
1516 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); | ||
1517 | raw_spin_unlock(&cfs_b->lock); | ||
1518 | } | ||
1519 | |||
1520 | static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) | ||
1521 | { | ||
1522 | struct rq *rq = rq_of(cfs_rq); | ||
1523 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
1524 | struct sched_entity *se; | ||
1525 | int enqueue = 1; | ||
1526 | long task_delta; | ||
1527 | |||
1528 | se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; | ||
1529 | |||
1530 | cfs_rq->throttled = 0; | ||
1531 | raw_spin_lock(&cfs_b->lock); | ||
1532 | cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; | ||
1533 | list_del_rcu(&cfs_rq->throttled_list); | ||
1534 | raw_spin_unlock(&cfs_b->lock); | ||
1535 | cfs_rq->throttled_timestamp = 0; | ||
1536 | |||
1537 | update_rq_clock(rq); | ||
1538 | /* update hierarchical throttle state */ | ||
1539 | walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); | ||
1540 | |||
1541 | if (!cfs_rq->load.weight) | ||
1542 | return; | ||
1543 | |||
1544 | task_delta = cfs_rq->h_nr_running; | ||
1545 | for_each_sched_entity(se) { | ||
1546 | if (se->on_rq) | ||
1547 | enqueue = 0; | ||
1548 | |||
1549 | cfs_rq = cfs_rq_of(se); | ||
1550 | if (enqueue) | ||
1551 | enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); | ||
1552 | cfs_rq->h_nr_running += task_delta; | ||
1553 | |||
1554 | if (cfs_rq_throttled(cfs_rq)) | ||
1555 | break; | ||
1556 | } | ||
1557 | |||
1558 | if (!se) | ||
1559 | rq->nr_running += task_delta; | ||
1560 | |||
1561 | /* determine whether we need to wake up potentially idle cpu */ | ||
1562 | if (rq->curr == rq->idle && rq->cfs.nr_running) | ||
1563 | resched_task(rq->curr); | ||
1564 | } | ||
1565 | |||
1566 | static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, | ||
1567 | u64 remaining, u64 expires) | ||
1568 | { | ||
1569 | struct cfs_rq *cfs_rq; | ||
1570 | u64 runtime = remaining; | ||
1571 | |||
1572 | rcu_read_lock(); | ||
1573 | list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, | ||
1574 | throttled_list) { | ||
1575 | struct rq *rq = rq_of(cfs_rq); | ||
1576 | |||
1577 | raw_spin_lock(&rq->lock); | ||
1578 | if (!cfs_rq_throttled(cfs_rq)) | ||
1579 | goto next; | ||
1580 | |||
1581 | runtime = -cfs_rq->runtime_remaining + 1; | ||
1582 | if (runtime > remaining) | ||
1583 | runtime = remaining; | ||
1584 | remaining -= runtime; | ||
1585 | |||
1586 | cfs_rq->runtime_remaining += runtime; | ||
1587 | cfs_rq->runtime_expires = expires; | ||
1588 | |||
1589 | /* we check whether we're throttled above */ | ||
1590 | if (cfs_rq->runtime_remaining > 0) | ||
1591 | unthrottle_cfs_rq(cfs_rq); | ||
1592 | |||
1593 | next: | ||
1594 | raw_spin_unlock(&rq->lock); | ||
1595 | |||
1596 | if (!remaining) | ||
1597 | break; | ||
1598 | } | ||
1599 | rcu_read_unlock(); | ||
1600 | |||
1601 | return remaining; | ||
1602 | } | ||
1603 | |||
1604 | /* | ||
1605 | * Responsible for refilling a task_group's bandwidth and unthrottling its | ||
1606 | * cfs_rqs as appropriate. If there has been no activity within the last | ||
1607 | * period the timer is deactivated until scheduling resumes; cfs_b->idle is | ||
1608 | * used to track this state. | ||
1609 | */ | ||
1610 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) | ||
1611 | { | ||
1612 | u64 runtime, runtime_expires; | ||
1613 | int idle = 1, throttled; | ||
1614 | |||
1615 | raw_spin_lock(&cfs_b->lock); | ||
1616 | /* no need to continue the timer with no bandwidth constraint */ | ||
1617 | if (cfs_b->quota == RUNTIME_INF) | ||
1618 | goto out_unlock; | ||
1619 | |||
1620 | throttled = !list_empty(&cfs_b->throttled_cfs_rq); | ||
1621 | /* idle depends on !throttled (for the case of a large deficit) */ | ||
1622 | idle = cfs_b->idle && !throttled; | ||
1623 | cfs_b->nr_periods += overrun; | ||
1624 | |||
1625 | /* if we're going inactive then everything else can be deferred */ | ||
1626 | if (idle) | ||
1627 | goto out_unlock; | ||
1628 | |||
1629 | __refill_cfs_bandwidth_runtime(cfs_b); | ||
1630 | |||
1631 | if (!throttled) { | ||
1632 | /* mark as potentially idle for the upcoming period */ | ||
1633 | cfs_b->idle = 1; | ||
1634 | goto out_unlock; | ||
1635 | } | ||
1636 | |||
1637 | /* account preceding periods in which throttling occurred */ | ||
1638 | cfs_b->nr_throttled += overrun; | ||
1639 | |||
1640 | /* | ||
1641 | * There are throttled entities so we must first use the new bandwidth | ||
1642 | * to unthrottle them before making it generally available. This | ||
1643 | * ensures that all existing debts will be paid before a new cfs_rq is | ||
1644 | * allowed to run. | ||
1645 | */ | ||
1646 | runtime = cfs_b->runtime; | ||
1647 | runtime_expires = cfs_b->runtime_expires; | ||
1648 | cfs_b->runtime = 0; | ||
1649 | |||
1650 | /* | ||
1651 | * This check is repeated as we are holding onto the new bandwidth | ||
1652 | * while we unthrottle. This can potentially race with an unthrottled | ||
1653 | * group trying to acquire new bandwidth from the global pool. | ||
1654 | */ | ||
1655 | while (throttled && runtime > 0) { | ||
1656 | raw_spin_unlock(&cfs_b->lock); | ||
1657 | /* we can't nest cfs_b->lock while distributing bandwidth */ | ||
1658 | runtime = distribute_cfs_runtime(cfs_b, runtime, | ||
1659 | runtime_expires); | ||
1660 | raw_spin_lock(&cfs_b->lock); | ||
1661 | |||
1662 | throttled = !list_empty(&cfs_b->throttled_cfs_rq); | ||
1663 | } | ||
1664 | |||
1665 | /* return (any) remaining runtime */ | ||
1666 | cfs_b->runtime = runtime; | ||
1667 | /* | ||
1668 | * While we are ensured activity in the period following an | ||
1669 | * unthrottle, this also covers the case in which the new bandwidth is | ||
1670 | * insufficient to cover the existing bandwidth deficit. (Forcing the | ||
1671 | * timer to remain active while there are any throttled entities.) | ||
1672 | */ | ||
1673 | cfs_b->idle = 0; | ||
1674 | out_unlock: | ||
1675 | if (idle) | ||
1676 | cfs_b->timer_active = 0; | ||
1677 | raw_spin_unlock(&cfs_b->lock); | ||
1678 | |||
1679 | return idle; | ||
1680 | } | ||
1681 | |||
1682 | /* a cfs_rq won't donate quota below this amount */ | ||
1683 | static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC; | ||
1684 | /* minimum remaining period time to redistribute slack quota */ | ||
1685 | static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; | ||
1686 | /* how long we wait to gather additional slack before distributing */ | ||
1687 | static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; | ||
1688 | |||
1689 | /* are we near the end of the current quota period? */ | ||
1690 | static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) | ||
1691 | { | ||
1692 | struct hrtimer *refresh_timer = &cfs_b->period_timer; | ||
1693 | u64 remaining; | ||
1694 | |||
1695 | /* if the call-back is running, a quota refresh is already occurring */ | ||
1696 | if (hrtimer_callback_running(refresh_timer)) | ||
1697 | return 1; | ||
1698 | |||
1699 | /* is a quota refresh about to occur? */ | ||
1700 | remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer)); | ||
1701 | if (remaining < min_expire) | ||
1702 | return 1; | ||
1703 | |||
1704 | return 0; | ||
1705 | } | ||
1706 | |||
1707 | static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b) | ||
1708 | { | ||
1709 | u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration; | ||
1710 | |||
1711 | /* if there's a quota refresh soon don't bother with slack */ | ||
1712 | if (runtime_refresh_within(cfs_b, min_left)) | ||
1713 | return; | ||
1714 | |||
1715 | start_bandwidth_timer(&cfs_b->slack_timer, | ||
1716 | ns_to_ktime(cfs_bandwidth_slack_period)); | ||
1717 | } | ||
1718 | |||
1719 | /* we know any runtime found here is valid as update_curr() precedes return */ | ||
1720 | static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
1721 | { | ||
1722 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
1723 | s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime; | ||
1724 | |||
1725 | if (slack_runtime <= 0) | ||
1726 | return; | ||
1727 | |||
1728 | raw_spin_lock(&cfs_b->lock); | ||
1729 | if (cfs_b->quota != RUNTIME_INF && | ||
1730 | cfs_rq->runtime_expires == cfs_b->runtime_expires) { | ||
1731 | cfs_b->runtime += slack_runtime; | ||
1732 | |||
1733 | /* we are under rq->lock, defer unthrottling using a timer */ | ||
1734 | if (cfs_b->runtime > sched_cfs_bandwidth_slice() && | ||
1735 | !list_empty(&cfs_b->throttled_cfs_rq)) | ||
1736 | start_cfs_slack_bandwidth(cfs_b); | ||
1737 | } | ||
1738 | raw_spin_unlock(&cfs_b->lock); | ||
1739 | |||
1740 | /* even if it's not valid for return we don't want to try again */ | ||
1741 | cfs_rq->runtime_remaining -= slack_runtime; | ||
1742 | } | ||
1743 | |||
1744 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
1745 | { | ||
1746 | if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running) | ||
1747 | return; | ||
1748 | |||
1749 | __return_cfs_rq_runtime(cfs_rq); | ||
1750 | } | ||
1751 | |||
1752 | /* | ||
1753 | * This is done with a timer (instead of inline with bandwidth return) since | ||
1754 | * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs. | ||
1755 | */ | ||
1756 | static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) | ||
1757 | { | ||
1758 | u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); | ||
1759 | u64 expires; | ||
1760 | |||
1761 | /* confirm we're still not at a refresh boundary */ | ||
1762 | if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) | ||
1763 | return; | ||
1764 | |||
1765 | raw_spin_lock(&cfs_b->lock); | ||
1766 | if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { | ||
1767 | runtime = cfs_b->runtime; | ||
1768 | cfs_b->runtime = 0; | ||
1769 | } | ||
1770 | expires = cfs_b->runtime_expires; | ||
1771 | raw_spin_unlock(&cfs_b->lock); | ||
1772 | |||
1773 | if (!runtime) | ||
1774 | return; | ||
1775 | |||
1776 | runtime = distribute_cfs_runtime(cfs_b, runtime, expires); | ||
1777 | |||
1778 | raw_spin_lock(&cfs_b->lock); | ||
1779 | if (expires == cfs_b->runtime_expires) | ||
1780 | cfs_b->runtime = runtime; | ||
1781 | raw_spin_unlock(&cfs_b->lock); | ||
1782 | } | ||
1783 | |||
1784 | /* | ||
1785 | * When a group wakes up we want to make sure that its quota is not already | ||
1786 | * expired/exceeded, otherwise it may be allowed to steal additional ticks of | ||
1787 | * runtime as update_curr() throttling cannot trigger until it's on-rq. | ||
1788 | */ | ||
1789 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) | ||
1790 | { | ||
1791 | /* an active group must be handled by the update_curr()->put() path */ | ||
1792 | if (!cfs_rq->runtime_enabled || cfs_rq->curr) | ||
1793 | return; | ||
1794 | |||
1795 | /* ensure the group is not already throttled */ | ||
1796 | if (cfs_rq_throttled(cfs_rq)) | ||
1797 | return; | ||
1798 | |||
1799 | /* update runtime allocation */ | ||
1800 | account_cfs_rq_runtime(cfs_rq, 0); | ||
1801 | if (cfs_rq->runtime_remaining <= 0) | ||
1802 | throttle_cfs_rq(cfs_rq); | ||
1803 | } | ||
1804 | |||
1805 | /* conditionally throttle active cfs_rq's from put_prev_entity() */ | ||
1806 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
1807 | { | ||
1808 | if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) | ||
1809 | return; | ||
1810 | |||
1811 | /* | ||
1812 | * it's possible for a throttled entity to be forced into a running | ||
1813 | * state (e.g. set_curr_task), in this case we're finished. | ||
1814 | */ | ||
1815 | if (cfs_rq_throttled(cfs_rq)) | ||
1816 | return; | ||
1817 | |||
1818 | throttle_cfs_rq(cfs_rq); | ||
1819 | } | ||
1820 | #else | ||
1821 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | ||
1822 | unsigned long delta_exec) {} | ||
1823 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | ||
1824 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} | ||
1825 | static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | ||
1826 | |||
1827 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) | ||
1828 | { | ||
1829 | return 0; | ||
1830 | } | ||
1831 | |||
1832 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) | ||
1833 | { | ||
1834 | return 0; | ||
1835 | } | ||
1836 | |||
1837 | static inline int throttled_lb_pair(struct task_group *tg, | ||
1838 | int src_cpu, int dest_cpu) | ||
1839 | { | ||
1840 | return 0; | ||
1841 | } | ||
1842 | #endif | ||
1843 | |||
1240 | /************************************************** | 1844 | /************************************************** |
1241 | * CFS operations on tasks: | 1845 | * CFS operations on tasks: |
1242 | */ | 1846 | */ |
@@ -1313,16 +1917,33 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1313 | break; | 1917 | break; |
1314 | cfs_rq = cfs_rq_of(se); | 1918 | cfs_rq = cfs_rq_of(se); |
1315 | enqueue_entity(cfs_rq, se, flags); | 1919 | enqueue_entity(cfs_rq, se, flags); |
1920 | |||
1921 | /* | ||
1922 | * end evaluation on encountering a throttled cfs_rq | ||
1923 | * | ||
1924 | * note: in the case of encountering a throttled cfs_rq we will | ||
1925 | * post the final h_nr_running increment below. | ||
1926 | */ | ||
1927 | if (cfs_rq_throttled(cfs_rq)) | ||
1928 | break; | ||
1929 | cfs_rq->h_nr_running++; | ||
1930 | |||
1316 | flags = ENQUEUE_WAKEUP; | 1931 | flags = ENQUEUE_WAKEUP; |
1317 | } | 1932 | } |
1318 | 1933 | ||
1319 | for_each_sched_entity(se) { | 1934 | for_each_sched_entity(se) { |
1320 | cfs_rq = cfs_rq_of(se); | 1935 | cfs_rq = cfs_rq_of(se); |
1936 | cfs_rq->h_nr_running++; | ||
1937 | |||
1938 | if (cfs_rq_throttled(cfs_rq)) | ||
1939 | break; | ||
1321 | 1940 | ||
1322 | update_cfs_load(cfs_rq, 0); | 1941 | update_cfs_load(cfs_rq, 0); |
1323 | update_cfs_shares(cfs_rq); | 1942 | update_cfs_shares(cfs_rq); |
1324 | } | 1943 | } |
1325 | 1944 | ||
1945 | if (!se) | ||
1946 | inc_nr_running(rq); | ||
1326 | hrtick_update(rq); | 1947 | hrtick_update(rq); |
1327 | } | 1948 | } |
1328 | 1949 | ||
@@ -1343,6 +1964,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1343 | cfs_rq = cfs_rq_of(se); | 1964 | cfs_rq = cfs_rq_of(se); |
1344 | dequeue_entity(cfs_rq, se, flags); | 1965 | dequeue_entity(cfs_rq, se, flags); |
1345 | 1966 | ||
1967 | /* | ||
1968 | * end evaluation on encountering a throttled cfs_rq | ||
1969 | * | ||
1970 | * note: in the case of encountering a throttled cfs_rq we will | ||
1971 | * post the final h_nr_running decrement below. | ||
1972 | */ | ||
1973 | if (cfs_rq_throttled(cfs_rq)) | ||
1974 | break; | ||
1975 | cfs_rq->h_nr_running--; | ||
1976 | |||
1346 | /* Don't dequeue parent if it has other entities besides us */ | 1977 | /* Don't dequeue parent if it has other entities besides us */ |
1347 | if (cfs_rq->load.weight) { | 1978 | if (cfs_rq->load.weight) { |
1348 | /* | 1979 | /* |
@@ -1361,11 +1992,17 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1361 | 1992 | ||
1362 | for_each_sched_entity(se) { | 1993 | for_each_sched_entity(se) { |
1363 | cfs_rq = cfs_rq_of(se); | 1994 | cfs_rq = cfs_rq_of(se); |
1995 | cfs_rq->h_nr_running--; | ||
1996 | |||
1997 | if (cfs_rq_throttled(cfs_rq)) | ||
1998 | break; | ||
1364 | 1999 | ||
1365 | update_cfs_load(cfs_rq, 0); | 2000 | update_cfs_load(cfs_rq, 0); |
1366 | update_cfs_shares(cfs_rq); | 2001 | update_cfs_shares(cfs_rq); |
1367 | } | 2002 | } |
1368 | 2003 | ||
2004 | if (!se) | ||
2005 | dec_nr_running(rq); | ||
1369 | hrtick_update(rq); | 2006 | hrtick_update(rq); |
1370 | } | 2007 | } |
1371 | 2008 | ||
@@ -1434,7 +2071,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
1434 | 2071 | ||
1435 | return wl; | 2072 | return wl; |
1436 | } | 2073 | } |
1437 | |||
1438 | #else | 2074 | #else |
1439 | 2075 | ||
1440 | static inline unsigned long effective_load(struct task_group *tg, int cpu, | 2076 | static inline unsigned long effective_load(struct task_group *tg, int cpu, |
@@ -1547,7 +2183,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
1547 | 2183 | ||
1548 | /* Skip over this group if it has no CPUs allowed */ | 2184 | /* Skip over this group if it has no CPUs allowed */ |
1549 | if (!cpumask_intersects(sched_group_cpus(group), | 2185 | if (!cpumask_intersects(sched_group_cpus(group), |
1550 | &p->cpus_allowed)) | 2186 | tsk_cpus_allowed(p))) |
1551 | continue; | 2187 | continue; |
1552 | 2188 | ||
1553 | local_group = cpumask_test_cpu(this_cpu, | 2189 | local_group = cpumask_test_cpu(this_cpu, |
@@ -1593,7 +2229,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
1593 | int i; | 2229 | int i; |
1594 | 2230 | ||
1595 | /* Traverse only the allowed CPUs */ | 2231 | /* Traverse only the allowed CPUs */ |
1596 | for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { | 2232 | for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { |
1597 | load = weighted_cpuload(i); | 2233 | load = weighted_cpuload(i); |
1598 | 2234 | ||
1599 | if (load < min_load || (load == min_load && i == this_cpu)) { | 2235 | if (load < min_load || (load == min_load && i == this_cpu)) { |
@@ -1637,7 +2273,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
1637 | if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) | 2273 | if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) |
1638 | break; | 2274 | break; |
1639 | 2275 | ||
1640 | for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { | 2276 | for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) { |
1641 | if (idle_cpu(i)) { | 2277 | if (idle_cpu(i)) { |
1642 | target = i; | 2278 | target = i; |
1643 | break; | 2279 | break; |
@@ -1680,7 +2316,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
1680 | int sync = wake_flags & WF_SYNC; | 2316 | int sync = wake_flags & WF_SYNC; |
1681 | 2317 | ||
1682 | if (sd_flag & SD_BALANCE_WAKE) { | 2318 | if (sd_flag & SD_BALANCE_WAKE) { |
1683 | if (cpumask_test_cpu(cpu, &p->cpus_allowed)) | 2319 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) |
1684 | want_affine = 1; | 2320 | want_affine = 1; |
1685 | new_cpu = prev_cpu; | 2321 | new_cpu = prev_cpu; |
1686 | } | 2322 | } |
@@ -1875,6 +2511,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1875 | if (unlikely(se == pse)) | 2511 | if (unlikely(se == pse)) |
1876 | return; | 2512 | return; |
1877 | 2513 | ||
2514 | /* | ||
2515 | * This is possible from callers such as pull_task(), in which we | ||
2516 | * unconditionally check_preempt_curr() after an enqueue (which may have | ||
2517 | * led to a throttle). This both saves work and prevents false | ||
2518 | * next-buddy nomination below. | ||
2519 | */ | ||
2520 | if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) | ||
2521 | return; | ||
2522 | |||
1878 | if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { | 2523 | if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { |
1879 | set_next_buddy(pse); | 2524 | set_next_buddy(pse); |
1880 | next_buddy_marked = 1; | 2525 | next_buddy_marked = 1; |
@@ -1883,6 +2528,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1883 | /* | 2528 | /* |
1884 | * We can come here with TIF_NEED_RESCHED already set from new task | 2529 | * We can come here with TIF_NEED_RESCHED already set from new task |
1885 | * wake up path. | 2530 | * wake up path. |
2531 | * | ||
2532 | * Note: this also catches the edge-case of curr being in a throttled | ||
2533 | * group (e.g. via set_curr_task), since update_curr() (in the | ||
2534 | * enqueue of curr) will have resulted in resched being set. This | ||
2535 | * prevents us from potentially nominating it as a false LAST_BUDDY | ||
2536 | * below. | ||
1886 | */ | 2537 | */ |
1887 | if (test_tsk_need_resched(curr)) | 2538 | if (test_tsk_need_resched(curr)) |
1888 | return; | 2539 | return; |
@@ -1899,10 +2550,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1899 | if (unlikely(p->policy != SCHED_NORMAL)) | 2550 | if (unlikely(p->policy != SCHED_NORMAL)) |
1900 | return; | 2551 | return; |
1901 | 2552 | ||
1902 | |||
1903 | if (!sched_feat(WAKEUP_PREEMPT)) | ||
1904 | return; | ||
1905 | |||
1906 | find_matching_se(&se, &pse); | 2553 | find_matching_se(&se, &pse); |
1907 | update_curr(cfs_rq_of(se)); | 2554 | update_curr(cfs_rq_of(se)); |
1908 | BUG_ON(!pse); | 2555 | BUG_ON(!pse); |
@@ -2005,7 +2652,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
2005 | { | 2652 | { |
2006 | struct sched_entity *se = &p->se; | 2653 | struct sched_entity *se = &p->se; |
2007 | 2654 | ||
2008 | if (!se->on_rq) | 2655 | /* throttled hierarchies are not runnable */ |
2656 | if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) | ||
2009 | return false; | 2657 | return false; |
2010 | 2658 | ||
2011 | /* Tell the scheduler that we'd really like pse to run next. */ | 2659 | /* Tell the scheduler that we'd really like pse to run next. */ |
@@ -2049,7 +2697,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
2049 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | 2697 | * 2) cannot be migrated to this CPU due to cpus_allowed, or |
2050 | * 3) are cache-hot on their current CPU. | 2698 | * 3) are cache-hot on their current CPU. |
2051 | */ | 2699 | */ |
2052 | if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { | 2700 | if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) { |
2053 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); | 2701 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); |
2054 | return 0; | 2702 | return 0; |
2055 | } | 2703 | } |
@@ -2102,6 +2750,9 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2102 | 2750 | ||
2103 | for_each_leaf_cfs_rq(busiest, cfs_rq) { | 2751 | for_each_leaf_cfs_rq(busiest, cfs_rq) { |
2104 | list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { | 2752 | list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { |
2753 | if (throttled_lb_pair(task_group(p), | ||
2754 | busiest->cpu, this_cpu)) | ||
2755 | break; | ||
2105 | 2756 | ||
2106 | if (!can_migrate_task(p, busiest, this_cpu, | 2757 | if (!can_migrate_task(p, busiest, this_cpu, |
2107 | sd, idle, &pinned)) | 2758 | sd, idle, &pinned)) |
@@ -2217,8 +2868,13 @@ static void update_shares(int cpu) | |||
2217 | * Iterates the task_group tree in a bottom up fashion, see | 2868 | * Iterates the task_group tree in a bottom up fashion, see |
2218 | * list_add_leaf_cfs_rq() for details. | 2869 | * list_add_leaf_cfs_rq() for details. |
2219 | */ | 2870 | */ |
2220 | for_each_leaf_cfs_rq(rq, cfs_rq) | 2871 | for_each_leaf_cfs_rq(rq, cfs_rq) { |
2872 | /* throttled entities do not contribute to load */ | ||
2873 | if (throttled_hierarchy(cfs_rq)) | ||
2874 | continue; | ||
2875 | |||
2221 | update_shares_cpu(cfs_rq->tg, cpu); | 2876 | update_shares_cpu(cfs_rq->tg, cpu); |
2877 | } | ||
2222 | rcu_read_unlock(); | 2878 | rcu_read_unlock(); |
2223 | } | 2879 | } |
2224 | 2880 | ||
@@ -2268,9 +2924,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2268 | u64 rem_load, moved_load; | 2924 | u64 rem_load, moved_load; |
2269 | 2925 | ||
2270 | /* | 2926 | /* |
2271 | * empty group | 2927 | * empty group or part of a throttled hierarchy |
2272 | */ | 2928 | */ |
2273 | if (!busiest_cfs_rq->task_weight) | 2929 | if (!busiest_cfs_rq->task_weight || |
2930 | throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu)) | ||
2274 | continue; | 2931 | continue; |
2275 | 2932 | ||
2276 | rem_load = (u64)rem_load_move * busiest_weight; | 2933 | rem_load = (u64)rem_load_move * busiest_weight; |
@@ -3430,7 +4087,7 @@ redo: | |||
3430 | * moved to this_cpu | 4087 | * moved to this_cpu |
3431 | */ | 4088 | */ |
3432 | if (!cpumask_test_cpu(this_cpu, | 4089 | if (!cpumask_test_cpu(this_cpu, |
3433 | &busiest->curr->cpus_allowed)) { | 4090 | tsk_cpus_allowed(busiest->curr))) { |
3434 | raw_spin_unlock_irqrestore(&busiest->lock, | 4091 | raw_spin_unlock_irqrestore(&busiest->lock, |
3435 | flags); | 4092 | flags); |
3436 | all_pinned = 1; | 4093 | all_pinned = 1; |
@@ -3612,22 +4269,6 @@ out_unlock: | |||
3612 | } | 4269 | } |
3613 | 4270 | ||
3614 | #ifdef CONFIG_NO_HZ | 4271 | #ifdef CONFIG_NO_HZ |
3615 | |||
3616 | static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb); | ||
3617 | |||
3618 | static void trigger_sched_softirq(void *data) | ||
3619 | { | ||
3620 | raise_softirq_irqoff(SCHED_SOFTIRQ); | ||
3621 | } | ||
3622 | |||
3623 | static inline void init_sched_softirq_csd(struct call_single_data *csd) | ||
3624 | { | ||
3625 | csd->func = trigger_sched_softirq; | ||
3626 | csd->info = NULL; | ||
3627 | csd->flags = 0; | ||
3628 | csd->priv = 0; | ||
3629 | } | ||
3630 | |||
3631 | /* | 4272 | /* |
3632 | * idle load balancing details | 4273 | * idle load balancing details |
3633 | * - One of the idle CPUs nominates itself as idle load_balancer, while | 4274 | * - One of the idle CPUs nominates itself as idle load_balancer, while |
@@ -3667,7 +4308,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | |||
3667 | struct sched_domain *sd; | 4308 | struct sched_domain *sd; |
3668 | 4309 | ||
3669 | for_each_domain(cpu, sd) | 4310 | for_each_domain(cpu, sd) |
3670 | if (sd && (sd->flags & flag)) | 4311 | if (sd->flags & flag) |
3671 | break; | 4312 | break; |
3672 | 4313 | ||
3673 | return sd; | 4314 | return sd; |
@@ -3793,11 +4434,16 @@ static void nohz_balancer_kick(int cpu) | |||
3793 | } | 4434 | } |
3794 | 4435 | ||
3795 | if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { | 4436 | if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { |
3796 | struct call_single_data *cp; | ||
3797 | |||
3798 | cpu_rq(ilb_cpu)->nohz_balance_kick = 1; | 4437 | cpu_rq(ilb_cpu)->nohz_balance_kick = 1; |
3799 | cp = &per_cpu(remote_sched_softirq_cb, cpu); | 4438 | |
3800 | __smp_call_function_single(ilb_cpu, cp, 0); | 4439 | smp_mb(); |
4440 | /* | ||
4441 | * Use smp_send_reschedule() instead of resched_cpu(). | ||
4442 | * This way we generate a sched IPI on the target cpu which | ||
4443 | * is idle. And the softirq performing nohz idle load balance | ||
4444 | * will be run before returning from the IPI. | ||
4445 | */ | ||
4446 | smp_send_reschedule(ilb_cpu); | ||
3801 | } | 4447 | } |
3802 | return; | 4448 | return; |
3803 | } | 4449 | } |
@@ -4030,7 +4676,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) | |||
4030 | if (time_before(now, nohz.next_balance)) | 4676 | if (time_before(now, nohz.next_balance)) |
4031 | return 0; | 4677 | return 0; |
4032 | 4678 | ||
4033 | if (rq->idle_at_tick) | 4679 | if (idle_cpu(cpu)) |
4034 | return 0; | 4680 | return 0; |
4035 | 4681 | ||
4036 | first_pick_cpu = atomic_read(&nohz.first_pick_cpu); | 4682 | first_pick_cpu = atomic_read(&nohz.first_pick_cpu); |
@@ -4066,7 +4712,7 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
4066 | { | 4712 | { |
4067 | int this_cpu = smp_processor_id(); | 4713 | int this_cpu = smp_processor_id(); |
4068 | struct rq *this_rq = cpu_rq(this_cpu); | 4714 | struct rq *this_rq = cpu_rq(this_cpu); |
4069 | enum cpu_idle_type idle = this_rq->idle_at_tick ? | 4715 | enum cpu_idle_type idle = this_rq->idle_balance ? |
4070 | CPU_IDLE : CPU_NOT_IDLE; | 4716 | CPU_IDLE : CPU_NOT_IDLE; |
4071 | 4717 | ||
4072 | rebalance_domains(this_cpu, idle); | 4718 | rebalance_domains(this_cpu, idle); |
@@ -4251,8 +4897,13 @@ static void set_curr_task_fair(struct rq *rq) | |||
4251 | { | 4897 | { |
4252 | struct sched_entity *se = &rq->curr->se; | 4898 | struct sched_entity *se = &rq->curr->se; |
4253 | 4899 | ||
4254 | for_each_sched_entity(se) | 4900 | for_each_sched_entity(se) { |
4255 | set_next_entity(cfs_rq_of(se), se); | 4901 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
4902 | |||
4903 | set_next_entity(cfs_rq, se); | ||
4904 | /* ensure bandwidth has been allocated on our new cfs_rq */ | ||
4905 | account_cfs_rq_runtime(cfs_rq, 0); | ||
4906 | } | ||
4256 | } | 4907 | } |
4257 | 4908 | ||
4258 | #ifdef CONFIG_FAIR_GROUP_SCHED | 4909 | #ifdef CONFIG_FAIR_GROUP_SCHED |
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 2e74677cb040..efa0a7b75dde 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h | |||
@@ -12,11 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) | |||
12 | SCHED_FEAT(START_DEBIT, 1) | 12 | SCHED_FEAT(START_DEBIT, 1) |
13 | 13 | ||
14 | /* | 14 | /* |
15 | * Should wakeups try to preempt running tasks. | ||
16 | */ | ||
17 | SCHED_FEAT(WAKEUP_PREEMPT, 1) | ||
18 | |||
19 | /* | ||
20 | * Based on load and program behaviour, see if it makes sense to place | 15 | * Based on load and program behaviour, see if it makes sense to place |
21 | * a newly woken task on the same cpu as the task that woke it -- | 16 | * a newly woken task on the same cpu as the task that woke it -- |
22 | * improve cache locality. Typically used with SYNC wakeups as | 17 | * improve cache locality. Typically used with SYNC wakeups as |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index af1177858be3..056cbd2e2a27 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -124,21 +124,33 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
124 | update_rt_migration(rt_rq); | 124 | update_rt_migration(rt_rq); |
125 | } | 125 | } |
126 | 126 | ||
127 | static inline int has_pushable_tasks(struct rq *rq) | ||
128 | { | ||
129 | return !plist_head_empty(&rq->rt.pushable_tasks); | ||
130 | } | ||
131 | |||
127 | static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) | 132 | static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) |
128 | { | 133 | { |
129 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); | 134 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); |
130 | plist_node_init(&p->pushable_tasks, p->prio); | 135 | plist_node_init(&p->pushable_tasks, p->prio); |
131 | plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); | 136 | plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); |
137 | |||
138 | /* Update the highest prio pushable task */ | ||
139 | if (p->prio < rq->rt.highest_prio.next) | ||
140 | rq->rt.highest_prio.next = p->prio; | ||
132 | } | 141 | } |
133 | 142 | ||
134 | static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) | 143 | static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) |
135 | { | 144 | { |
136 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); | 145 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); |
137 | } | ||
138 | 146 | ||
139 | static inline int has_pushable_tasks(struct rq *rq) | 147 | /* Update the new highest prio pushable task */ |
140 | { | 148 | if (has_pushable_tasks(rq)) { |
141 | return !plist_head_empty(&rq->rt.pushable_tasks); | 149 | p = plist_first_entry(&rq->rt.pushable_tasks, |
150 | struct task_struct, pushable_tasks); | ||
151 | rq->rt.highest_prio.next = p->prio; | ||
152 | } else | ||
153 | rq->rt.highest_prio.next = MAX_RT_PRIO; | ||
142 | } | 154 | } |
143 | 155 | ||
144 | #else | 156 | #else |
@@ -643,6 +655,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) | |||
643 | 655 | ||
644 | if (rt_rq->rt_time > runtime) { | 656 | if (rt_rq->rt_time > runtime) { |
645 | rt_rq->rt_throttled = 1; | 657 | rt_rq->rt_throttled = 1; |
658 | printk_once(KERN_WARNING "sched: RT throttling activated\n"); | ||
646 | if (rt_rq_throttled(rt_rq)) { | 659 | if (rt_rq_throttled(rt_rq)) { |
647 | sched_rt_rq_dequeue(rt_rq); | 660 | sched_rt_rq_dequeue(rt_rq); |
648 | return 1; | 661 | return 1; |
@@ -698,47 +711,13 @@ static void update_curr_rt(struct rq *rq) | |||
698 | 711 | ||
699 | #if defined CONFIG_SMP | 712 | #if defined CONFIG_SMP |
700 | 713 | ||
701 | static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu); | ||
702 | |||
703 | static inline int next_prio(struct rq *rq) | ||
704 | { | ||
705 | struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu); | ||
706 | |||
707 | if (next && rt_prio(next->prio)) | ||
708 | return next->prio; | ||
709 | else | ||
710 | return MAX_RT_PRIO; | ||
711 | } | ||
712 | |||
713 | static void | 714 | static void |
714 | inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) | 715 | inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) |
715 | { | 716 | { |
716 | struct rq *rq = rq_of_rt_rq(rt_rq); | 717 | struct rq *rq = rq_of_rt_rq(rt_rq); |
717 | 718 | ||
718 | if (prio < prev_prio) { | 719 | if (rq->online && prio < prev_prio) |
719 | 720 | cpupri_set(&rq->rd->cpupri, rq->cpu, prio); | |
720 | /* | ||
721 | * If the new task is higher in priority than anything on the | ||
722 | * run-queue, we know that the previous high becomes our | ||
723 | * next-highest. | ||
724 | */ | ||
725 | rt_rq->highest_prio.next = prev_prio; | ||
726 | |||
727 | if (rq->online) | ||
728 | cpupri_set(&rq->rd->cpupri, rq->cpu, prio); | ||
729 | |||
730 | } else if (prio == rt_rq->highest_prio.curr) | ||
731 | /* | ||
732 | * If the next task is equal in priority to the highest on | ||
733 | * the run-queue, then we implicitly know that the next highest | ||
734 | * task cannot be any lower than current | ||
735 | */ | ||
736 | rt_rq->highest_prio.next = prio; | ||
737 | else if (prio < rt_rq->highest_prio.next) | ||
738 | /* | ||
739 | * Otherwise, we need to recompute next-highest | ||
740 | */ | ||
741 | rt_rq->highest_prio.next = next_prio(rq); | ||
742 | } | 721 | } |
743 | 722 | ||
744 | static void | 723 | static void |
@@ -746,9 +725,6 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) | |||
746 | { | 725 | { |
747 | struct rq *rq = rq_of_rt_rq(rt_rq); | 726 | struct rq *rq = rq_of_rt_rq(rt_rq); |
748 | 727 | ||
749 | if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next)) | ||
750 | rt_rq->highest_prio.next = next_prio(rq); | ||
751 | |||
752 | if (rq->online && rt_rq->highest_prio.curr != prev_prio) | 728 | if (rq->online && rt_rq->highest_prio.curr != prev_prio) |
753 | cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); | 729 | cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); |
754 | } | 730 | } |
@@ -961,6 +937,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
961 | 937 | ||
962 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) | 938 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) |
963 | enqueue_pushable_task(rq, p); | 939 | enqueue_pushable_task(rq, p); |
940 | |||
941 | inc_nr_running(rq); | ||
964 | } | 942 | } |
965 | 943 | ||
966 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) | 944 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) |
@@ -971,6 +949,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
971 | dequeue_rt_entity(rt_se); | 949 | dequeue_rt_entity(rt_se); |
972 | 950 | ||
973 | dequeue_pushable_task(rq, p); | 951 | dequeue_pushable_task(rq, p); |
952 | |||
953 | dec_nr_running(rq); | ||
974 | } | 954 | } |
975 | 955 | ||
976 | /* | 956 | /* |
@@ -1017,10 +997,12 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
1017 | struct rq *rq; | 997 | struct rq *rq; |
1018 | int cpu; | 998 | int cpu; |
1019 | 999 | ||
1020 | if (sd_flag != SD_BALANCE_WAKE) | ||
1021 | return smp_processor_id(); | ||
1022 | |||
1023 | cpu = task_cpu(p); | 1000 | cpu = task_cpu(p); |
1001 | |||
1002 | /* For anything but wake ups, just return the task_cpu */ | ||
1003 | if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) | ||
1004 | goto out; | ||
1005 | |||
1024 | rq = cpu_rq(cpu); | 1006 | rq = cpu_rq(cpu); |
1025 | 1007 | ||
1026 | rcu_read_lock(); | 1008 | rcu_read_lock(); |
@@ -1059,6 +1041,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
1059 | } | 1041 | } |
1060 | rcu_read_unlock(); | 1042 | rcu_read_unlock(); |
1061 | 1043 | ||
1044 | out: | ||
1062 | return cpu; | 1045 | return cpu; |
1063 | } | 1046 | } |
1064 | 1047 | ||
@@ -1178,7 +1161,6 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) | |||
1178 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | 1161 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p) |
1179 | { | 1162 | { |
1180 | update_curr_rt(rq); | 1163 | update_curr_rt(rq); |
1181 | p->se.exec_start = 0; | ||
1182 | 1164 | ||
1183 | /* | 1165 | /* |
1184 | * The previous task needs to be made eligible for pushing | 1166 | * The previous task needs to be made eligible for pushing |
@@ -1198,7 +1180,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); | |||
1198 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | 1180 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) |
1199 | { | 1181 | { |
1200 | if (!task_running(rq, p) && | 1182 | if (!task_running(rq, p) && |
1201 | (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && | 1183 | (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && |
1202 | (p->rt.nr_cpus_allowed > 1)) | 1184 | (p->rt.nr_cpus_allowed > 1)) |
1203 | return 1; | 1185 | return 1; |
1204 | return 0; | 1186 | return 0; |
@@ -1343,7 +1325,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
1343 | */ | 1325 | */ |
1344 | if (unlikely(task_rq(task) != rq || | 1326 | if (unlikely(task_rq(task) != rq || |
1345 | !cpumask_test_cpu(lowest_rq->cpu, | 1327 | !cpumask_test_cpu(lowest_rq->cpu, |
1346 | &task->cpus_allowed) || | 1328 | tsk_cpus_allowed(task)) || |
1347 | task_running(rq, task) || | 1329 | task_running(rq, task) || |
1348 | !task->on_rq)) { | 1330 | !task->on_rq)) { |
1349 | 1331 | ||
@@ -1394,6 +1376,7 @@ static int push_rt_task(struct rq *rq) | |||
1394 | { | 1376 | { |
1395 | struct task_struct *next_task; | 1377 | struct task_struct *next_task; |
1396 | struct rq *lowest_rq; | 1378 | struct rq *lowest_rq; |
1379 | int ret = 0; | ||
1397 | 1380 | ||
1398 | if (!rq->rt.overloaded) | 1381 | if (!rq->rt.overloaded) |
1399 | return 0; | 1382 | return 0; |
@@ -1426,7 +1409,7 @@ retry: | |||
1426 | if (!lowest_rq) { | 1409 | if (!lowest_rq) { |
1427 | struct task_struct *task; | 1410 | struct task_struct *task; |
1428 | /* | 1411 | /* |
1429 | * find lock_lowest_rq releases rq->lock | 1412 | * find_lock_lowest_rq releases rq->lock |
1430 | * so it is possible that next_task has migrated. | 1413 | * so it is possible that next_task has migrated. |
1431 | * | 1414 | * |
1432 | * We need to make sure that the task is still on the same | 1415 | * We need to make sure that the task is still on the same |
@@ -1436,12 +1419,11 @@ retry: | |||
1436 | task = pick_next_pushable_task(rq); | 1419 | task = pick_next_pushable_task(rq); |
1437 | if (task_cpu(next_task) == rq->cpu && task == next_task) { | 1420 | if (task_cpu(next_task) == rq->cpu && task == next_task) { |
1438 | /* | 1421 | /* |
1439 | * If we get here, the task hasn't moved at all, but | 1422 | * The task hasn't migrated, and is still the next |
1440 | * it has failed to push. We will not try again, | 1423 | * eligible task, but we failed to find a run-queue |
1441 | * since the other cpus will pull from us when they | 1424 | * to push it to. Do not retry in this case, since |
1442 | * are ready. | 1425 | * other cpus will pull from us when ready. |
1443 | */ | 1426 | */ |
1444 | dequeue_pushable_task(rq, next_task); | ||
1445 | goto out; | 1427 | goto out; |
1446 | } | 1428 | } |
1447 | 1429 | ||
@@ -1460,6 +1442,7 @@ retry: | |||
1460 | deactivate_task(rq, next_task, 0); | 1442 | deactivate_task(rq, next_task, 0); |
1461 | set_task_cpu(next_task, lowest_rq->cpu); | 1443 | set_task_cpu(next_task, lowest_rq->cpu); |
1462 | activate_task(lowest_rq, next_task, 0); | 1444 | activate_task(lowest_rq, next_task, 0); |
1445 | ret = 1; | ||
1463 | 1446 | ||
1464 | resched_task(lowest_rq->curr); | 1447 | resched_task(lowest_rq->curr); |
1465 | 1448 | ||
@@ -1468,7 +1451,7 @@ retry: | |||
1468 | out: | 1451 | out: |
1469 | put_task_struct(next_task); | 1452 | put_task_struct(next_task); |
1470 | 1453 | ||
1471 | return 1; | 1454 | return ret; |
1472 | } | 1455 | } |
1473 | 1456 | ||
1474 | static void push_rt_tasks(struct rq *rq) | 1457 | static void push_rt_tasks(struct rq *rq) |
@@ -1626,9 +1609,6 @@ static void set_cpus_allowed_rt(struct task_struct *p, | |||
1626 | 1609 | ||
1627 | update_rt_migration(&rq->rt); | 1610 | update_rt_migration(&rq->rt); |
1628 | } | 1611 | } |
1629 | |||
1630 | cpumask_copy(&p->cpus_allowed, new_mask); | ||
1631 | p->rt.nr_cpus_allowed = weight; | ||
1632 | } | 1612 | } |
1633 | 1613 | ||
1634 | /* Assumes rq->lock is held */ | 1614 | /* Assumes rq->lock is held */ |
@@ -1863,4 +1843,3 @@ static void print_rt_stats(struct seq_file *m, int cpu) | |||
1863 | rcu_read_unlock(); | 1843 | rcu_read_unlock(); |
1864 | } | 1844 | } |
1865 | #endif /* CONFIG_SCHED_DEBUG */ | 1845 | #endif /* CONFIG_SCHED_DEBUG */ |
1866 | |||
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 6f437632afab..8b44e7fa7fb3 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c | |||
@@ -34,11 +34,13 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) | |||
34 | static void | 34 | static void |
35 | enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) | 35 | enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) |
36 | { | 36 | { |
37 | inc_nr_running(rq); | ||
37 | } | 38 | } |
38 | 39 | ||
39 | static void | 40 | static void |
40 | dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) | 41 | dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) |
41 | { | 42 | { |
43 | dec_nr_running(rq); | ||
42 | } | 44 | } |
43 | 45 | ||
44 | static void yield_task_stop(struct rq *rq) | 46 | static void yield_task_stop(struct rq *rq) |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 11d65b531e50..2d2ecdcc8cdb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -379,6 +379,16 @@ static struct ctl_table kern_table[] = { | |||
379 | .extra2 = &one, | 379 | .extra2 = &one, |
380 | }, | 380 | }, |
381 | #endif | 381 | #endif |
382 | #ifdef CONFIG_CFS_BANDWIDTH | ||
383 | { | ||
384 | .procname = "sched_cfs_bandwidth_slice_us", | ||
385 | .data = &sysctl_sched_cfs_bandwidth_slice, | ||
386 | .maxlen = sizeof(unsigned int), | ||
387 | .mode = 0644, | ||
388 | .proc_handler = proc_dointvec_minmax, | ||
389 | .extra1 = &one, | ||
390 | }, | ||
391 | #endif | ||
382 | #ifdef CONFIG_PROVE_LOCKING | 392 | #ifdef CONFIG_PROVE_LOCKING |
383 | { | 393 | { |
384 | .procname = "prove_locking", | 394 | .procname = "prove_locking", |
diff --git a/lib/Kconfig b/lib/Kconfig index 6c695ff9caba..32f3e5ae2be5 100644 --- a/lib/Kconfig +++ b/lib/Kconfig | |||
@@ -276,7 +276,4 @@ config CORDIC | |||
276 | so its calculations are in fixed point. Modules can select this | 276 | so its calculations are in fixed point. Modules can select this |
277 | when they require this function. Module will be called cordic. | 277 | when they require this function. Module will be called cordic. |
278 | 278 | ||
279 | config LLIST | ||
280 | bool | ||
281 | |||
282 | endmenu | 279 | endmenu |
diff --git a/lib/Makefile b/lib/Makefile index 3f5bc6d903e0..a4da283f5dc0 100644 --- a/lib/Makefile +++ b/lib/Makefile | |||
@@ -22,7 +22,7 @@ lib-y += kobject.o kref.o klist.o | |||
22 | obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ | 22 | obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ |
23 | bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ | 23 | bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ |
24 | string_helpers.o gcd.o lcm.o list_sort.o uuid.o flex_array.o \ | 24 | string_helpers.o gcd.o lcm.o list_sort.o uuid.o flex_array.o \ |
25 | bsearch.o find_last_bit.o find_next_bit.o | 25 | bsearch.o find_last_bit.o find_next_bit.o llist.o |
26 | obj-y += kstrtox.o | 26 | obj-y += kstrtox.o |
27 | obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o | 27 | obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o |
28 | 28 | ||
@@ -115,8 +115,6 @@ obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o | |||
115 | 115 | ||
116 | obj-$(CONFIG_CORDIC) += cordic.o | 116 | obj-$(CONFIG_CORDIC) += cordic.o |
117 | 117 | ||
118 | obj-$(CONFIG_LLIST) += llist.o | ||
119 | |||
120 | hostprogs-y := gen_crc32table | 118 | hostprogs-y := gen_crc32table |
121 | clean-files := crc32table.h | 119 | clean-files := crc32table.h |
122 | 120 | ||
diff --git a/lib/llist.c b/lib/llist.c index da445724fa1f..700cff77a387 100644 --- a/lib/llist.c +++ b/lib/llist.c | |||
@@ -3,8 +3,8 @@ | |||
3 | * | 3 | * |
4 | * The basic atomic operation of this list is cmpxchg on long. On | 4 | * The basic atomic operation of this list is cmpxchg on long. On |
5 | * architectures that don't have NMI-safe cmpxchg implementation, the | 5 | * architectures that don't have NMI-safe cmpxchg implementation, the |
6 | * list can NOT be used in NMI handler. So code uses the list in NMI | 6 | * list can NOT be used in NMI handlers. So code that uses the list in |
7 | * handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. | 7 | * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. |
8 | * | 8 | * |
9 | * Copyright 2010,2011 Intel Corp. | 9 | * Copyright 2010,2011 Intel Corp. |
10 | * Author: Huang Ying <ying.huang@intel.com> | 10 | * Author: Huang Ying <ying.huang@intel.com> |
@@ -30,48 +30,28 @@ | |||
30 | #include <asm/system.h> | 30 | #include <asm/system.h> |
31 | 31 | ||
32 | /** | 32 | /** |
33 | * llist_add - add a new entry | ||
34 | * @new: new entry to be added | ||
35 | * @head: the head for your lock-less list | ||
36 | */ | ||
37 | void llist_add(struct llist_node *new, struct llist_head *head) | ||
38 | { | ||
39 | struct llist_node *entry, *old_entry; | ||
40 | |||
41 | #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG | ||
42 | BUG_ON(in_nmi()); | ||
43 | #endif | ||
44 | |||
45 | entry = head->first; | ||
46 | do { | ||
47 | old_entry = entry; | ||
48 | new->next = entry; | ||
49 | cpu_relax(); | ||
50 | } while ((entry = cmpxchg(&head->first, old_entry, new)) != old_entry); | ||
51 | } | ||
52 | EXPORT_SYMBOL_GPL(llist_add); | ||
53 | |||
54 | /** | ||
55 | * llist_add_batch - add several linked entries in batch | 33 | * llist_add_batch - add several linked entries in batch |
56 | * @new_first: first entry in batch to be added | 34 | * @new_first: first entry in batch to be added |
57 | * @new_last: last entry in batch to be added | 35 | * @new_last: last entry in batch to be added |
58 | * @head: the head for your lock-less list | 36 | * @head: the head for your lock-less list |
37 | * | ||
38 | * Return whether list is empty before adding. | ||
59 | */ | 39 | */ |
60 | void llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, | 40 | bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, |
61 | struct llist_head *head) | 41 | struct llist_head *head) |
62 | { | 42 | { |
63 | struct llist_node *entry, *old_entry; | 43 | struct llist_node *entry, *old_entry; |
64 | 44 | ||
65 | #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG | ||
66 | BUG_ON(in_nmi()); | ||
67 | #endif | ||
68 | |||
69 | entry = head->first; | 45 | entry = head->first; |
70 | do { | 46 | for (;;) { |
71 | old_entry = entry; | 47 | old_entry = entry; |
72 | new_last->next = entry; | 48 | new_last->next = entry; |
73 | cpu_relax(); | 49 | entry = cmpxchg(&head->first, old_entry, new_first); |
74 | } while ((entry = cmpxchg(&head->first, old_entry, new_first)) != old_entry); | 50 | if (entry == old_entry) |
51 | break; | ||
52 | } | ||
53 | |||
54 | return old_entry == NULL; | ||
75 | } | 55 | } |
76 | EXPORT_SYMBOL_GPL(llist_add_batch); | 56 | EXPORT_SYMBOL_GPL(llist_add_batch); |
77 | 57 | ||
@@ -93,37 +73,17 @@ struct llist_node *llist_del_first(struct llist_head *head) | |||
93 | { | 73 | { |
94 | struct llist_node *entry, *old_entry, *next; | 74 | struct llist_node *entry, *old_entry, *next; |
95 | 75 | ||
96 | #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG | ||
97 | BUG_ON(in_nmi()); | ||
98 | #endif | ||
99 | |||
100 | entry = head->first; | 76 | entry = head->first; |
101 | do { | 77 | for (;;) { |
102 | if (entry == NULL) | 78 | if (entry == NULL) |
103 | return NULL; | 79 | return NULL; |
104 | old_entry = entry; | 80 | old_entry = entry; |
105 | next = entry->next; | 81 | next = entry->next; |
106 | cpu_relax(); | 82 | entry = cmpxchg(&head->first, old_entry, next); |
107 | } while ((entry = cmpxchg(&head->first, old_entry, next)) != old_entry); | 83 | if (entry == old_entry) |
84 | break; | ||
85 | } | ||
108 | 86 | ||
109 | return entry; | 87 | return entry; |
110 | } | 88 | } |
111 | EXPORT_SYMBOL_GPL(llist_del_first); | 89 | EXPORT_SYMBOL_GPL(llist_del_first); |
112 | |||
113 | /** | ||
114 | * llist_del_all - delete all entries from lock-less list | ||
115 | * @head: the head of lock-less list to delete all entries | ||
116 | * | ||
117 | * If list is empty, return NULL, otherwise, delete all entries and | ||
118 | * return the pointer to the first entry. The order of entries | ||
119 | * deleted is from the newest to the oldest added one. | ||
120 | */ | ||
121 | struct llist_node *llist_del_all(struct llist_head *head) | ||
122 | { | ||
123 | #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG | ||
124 | BUG_ON(in_nmi()); | ||
125 | #endif | ||
126 | |||
127 | return xchg(&head->first, NULL); | ||
128 | } | ||
129 | EXPORT_SYMBOL_GPL(llist_del_all); | ||
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 4689cb073da4..503f087382a4 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c | |||
@@ -22,7 +22,7 @@ notrace unsigned int debug_smp_processor_id(void) | |||
22 | * Kernel threads bound to a single CPU can safely use | 22 | * Kernel threads bound to a single CPU can safely use |
23 | * smp_processor_id(): | 23 | * smp_processor_id(): |
24 | */ | 24 | */ |
25 | if (cpumask_equal(&current->cpus_allowed, cpumask_of(this_cpu))) | 25 | if (cpumask_equal(tsk_cpus_allowed(current), cpumask_of(this_cpu))) |
26 | goto out; | 26 | goto out; |
27 | 27 | ||
28 | /* | 28 | /* |