diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-10-26 11:08:43 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-10-26 11:08:43 -0400 |
| commit | 8a4a8918ed6e4a361f4df19f199bbc2d0a89a46c (patch) | |
| tree | d76974986aaaa8549baf2d6a106fa6cb60d64b88 | |
| parent | 8686a0e200419322654a75155e2e6f80346a1297 (diff) | |
| parent | 540f41edc15473ca3b2876de72646546ae101374 (diff) | |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (46 commits)
llist: Add back llist_add_batch() and llist_del_first() prototypes
sched: Don't use tasklist_lock for debug prints
sched: Warn on rt throttling
sched: Unify the ->cpus_allowed mask copy
sched: Wrap scheduler p->cpus_allowed access
sched: Request for idle balance during nohz idle load balance
sched: Use resched IPI to kick off the nohz idle balance
sched: Fix idle_cpu()
llist: Remove cpu_relax() usage in cmpxchg loops
sched: Convert to struct llist
llist: Add llist_next()
irq_work: Use llist in the struct irq_work logic
llist: Return whether list is empty before adding in llist_add()
llist: Move cpu_relax() to after the cmpxchg()
llist: Remove the platform-dependent NMI checks
llist: Make some llist functions inline
sched, tracing: Show PREEMPT_ACTIVE state in trace_sched_switch
sched: Remove redundant test in check_preempt_tick()
sched: Add documentation for bandwidth control
sched: Return unused runtime on group dequeue
...
| -rw-r--r-- | Documentation/scheduler/sched-bwc.txt | 122 | ||||
| -rw-r--r-- | drivers/acpi/apei/Kconfig | 1 | ||||
| -rw-r--r-- | include/linux/irq_work.h | 15 | ||||
| -rw-r--r-- | include/linux/llist.h | 77 | ||||
| -rw-r--r-- | include/linux/sched.h | 7 | ||||
| -rw-r--r-- | include/trace/events/sched.h | 9 | ||||
| -rw-r--r-- | init/Kconfig | 12 | ||||
| -rw-r--r-- | kernel/irq_work.c | 91 | ||||
| -rw-r--r-- | kernel/sched.c | 666 | ||||
| -rw-r--r-- | kernel/sched_cpupri.c | 89 | ||||
| -rw-r--r-- | kernel/sched_cpupri.h | 7 | ||||
| -rw-r--r-- | kernel/sched_fair.c | 761 | ||||
| -rw-r--r-- | kernel/sched_features.h | 5 | ||||
| -rw-r--r-- | kernel/sched_rt.c | 99 | ||||
| -rw-r--r-- | kernel/sched_stoptask.c | 2 | ||||
| -rw-r--r-- | kernel/sysctl.c | 10 | ||||
| -rw-r--r-- | lib/Kconfig | 3 | ||||
| -rw-r--r-- | lib/Makefile | 4 | ||||
| -rw-r--r-- | lib/llist.c | 74 | ||||
| -rw-r--r-- | lib/smp_processor_id.c | 2 |
20 files changed, 1646 insertions, 410 deletions
diff --git a/Documentation/scheduler/sched-bwc.txt b/Documentation/scheduler/sched-bwc.txt new file mode 100644 index 000000000000..f6b1873f68ab --- /dev/null +++ b/Documentation/scheduler/sched-bwc.txt | |||
| @@ -0,0 +1,122 @@ | |||
| 1 | CFS Bandwidth Control | ||
| 2 | ===================== | ||
| 3 | |||
| 4 | [ This document only discusses CPU bandwidth control for SCHED_NORMAL. | ||
| 5 | The SCHED_RT case is covered in Documentation/scheduler/sched-rt-group.txt ] | ||
| 6 | |||
| 7 | CFS bandwidth control is a CONFIG_FAIR_GROUP_SCHED extension which allows the | ||
| 8 | specification of the maximum CPU bandwidth available to a group or hierarchy. | ||
| 9 | |||
| 10 | The bandwidth allowed for a group is specified using a quota and period. Within | ||
| 11 | each given "period" (microseconds), a group is allowed to consume only up to | ||
| 12 | "quota" microseconds of CPU time. When the CPU bandwidth consumption of a | ||
| 13 | group exceeds this limit (for that period), the tasks belonging to its | ||
| 14 | hierarchy will be throttled and are not allowed to run again until the next | ||
| 15 | period. | ||
| 16 | |||
| 17 | A group's unused runtime is globally tracked, being refreshed with the quota | ||
| 18 | specified above at each period boundary. As threads consume this bandwidth it is | ||
| 19 | transferred to cpu-local "silos" on a demand basis. The amount transferred | ||
| 20 | within each of these updates is tunable and described as the "slice". | ||
| 21 | |||
| 22 | Management | ||
| 23 | ---------- | ||
| 24 | Quota and period are managed within the cpu subsystem via cgroupfs. | ||
| 25 | |||
| 26 | cpu.cfs_quota_us: the total available run-time within a period (in microseconds) | ||
| 27 | cpu.cfs_period_us: the length of a period (in microseconds) | ||
| 28 | cpu.stat: exports throttling statistics [explained further below] | ||
| 29 | |||
| 30 | The default values are: | ||
| 31 | cpu.cfs_period_us=100ms | ||
| 32 | cpu.cfs_quota_us=-1 | ||
| 33 | |||
| 34 | A value of -1 for cpu.cfs_quota_us indicates that the group does not have any | ||
| 35 | bandwidth restriction in place; such a group is described as an unconstrained | ||
| 36 | bandwidth group. This represents the traditional work-conserving behavior for | ||
| 37 | CFS. | ||
| 38 | |||
| 39 | Writing any (valid) positive value(s) will enact the specified bandwidth limit. | ||
| 40 | The minimum value allowed for the quota or period is 1ms. There is also an | ||
| 41 | upper bound on the period length of 1s. Additional restrictions exist when | ||
| 42 | bandwidth limits are used in a hierarchical fashion; these are explained in | ||
| 43 | more detail below. | ||
| 44 | |||
| 45 | Writing any negative value to cpu.cfs_quota_us will remove the bandwidth limit | ||
| 46 | and return the group to an unconstrained state once more. | ||
| 47 | |||
| 48 | Any updates to a group's bandwidth specification will result in it becoming | ||
| 49 | unthrottled if it is in a constrained state. | ||
| 50 | |||
| 51 | System wide settings | ||
| 52 | -------------------- | ||
| 53 | For efficiency run-time is transferred between the global pool and CPU local | ||
| 54 | "silos" in a batch fashion. This greatly reduces global accounting pressure | ||
| 55 | on large systems. The amount transferred each time such an update is required | ||
| 56 | is described as the "slice". | ||
| 57 | |||
| 58 | This is tunable via procfs: | ||
| 59 | /proc/sys/kernel/sched_cfs_bandwidth_slice_us (default=5ms) | ||
| 60 | |||
| 61 | Larger slice values will reduce transfer overheads, while smaller values allow | ||
| 62 | for more fine-grained consumption. | ||
| 63 | |||
| 64 | Statistics | ||
| 65 | ---------- | ||
| 66 | A group's bandwidth statistics are exported via 3 fields in cpu.stat. | ||
| 67 | |||
| 68 | cpu.stat: | ||
| 69 | - nr_periods: Number of enforcement intervals that have elapsed. | ||
| 70 | - nr_throttled: Number of times the group has been throttled/limited. | ||
| 71 | - throttled_time: The total time duration (in nanoseconds) for which entities | ||
| 72 | of the group have been throttled. | ||
| 73 | |||
| 74 | This interface is read-only. | ||
| 75 | |||
| 76 | Hierarchical considerations | ||
| 77 | --------------------------- | ||
| 78 | The interface enforces that an individual entity's bandwidth is always | ||
| 79 | attainable, that is: max(c_i) <= C. However, over-subscription in the | ||
| 80 | aggregate case is explicitly allowed to enable work-conserving semantics | ||
| 81 | within a hierarchy. | ||
| 82 | e.g. \Sum (c_i) may exceed C | ||
| 83 | [ Where C is the parent's bandwidth, and c_i its children ] | ||
| 84 | |||
| 85 | |||
| 86 | There are two ways in which a group may become throttled: | ||
| 87 | a. it fully consumes its own quota within a period | ||
| 88 | b. a parent's quota is fully consumed within its period | ||
| 89 | |||
| 90 | In case b) above, even though the child may have runtime remaining it will not | ||
| 91 | be allowed to run until the parent's runtime is refreshed. | ||
| 92 | |||
| 93 | Examples | ||
| 94 | -------- | ||
| 95 | 1. Limit a group to 1 CPU worth of runtime. | ||
| 96 | |||
| 97 | If period is 250ms and quota is also 250ms, the group will get | ||
| 98 | 1 CPU worth of runtime every 250ms. | ||
| 99 | |||
| 100 | # echo 250000 > cpu.cfs_quota_us /* quota = 250ms */ | ||
| 101 | # echo 250000 > cpu.cfs_period_us /* period = 250ms */ | ||
| 102 | |||
| 103 | 2. Limit a group to 2 CPUs worth of runtime on a multi-CPU machine. | ||
| 104 | |||
| 105 | With 500ms period and 1000ms quota, the group can get 2 CPUs worth of | ||
| 106 | runtime every 500ms. | ||
| 107 | |||
| 108 | # echo 1000000 > cpu.cfs_quota_us /* quota = 1000ms */ | ||
| 109 | # echo 500000 > cpu.cfs_period_us /* period = 500ms */ | ||
| 110 | |||
| 111 | The larger period here allows for increased burst capacity. | ||
| 112 | |||
| 113 | 3. Limit a group to 20% of 1 CPU. | ||
| 114 | |||
| 115 | With 50ms period, 10ms quota will be equivalent to 20% of 1 CPU. | ||
| 116 | |||
| 117 | # echo 10000 > cpu.cfs_quota_us /* quota = 10ms */ | ||
| 118 | # echo 50000 > cpu.cfs_period_us /* period = 50ms */ | ||
| 119 | |||
| 120 | By using a small period here we are ensuring a consistent latency | ||
| 121 | response at the expense of burst capacity. | ||
| 122 | |||
diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig index e3f47872ec22..f0c1ce95a0ec 100644 --- a/drivers/acpi/apei/Kconfig +++ b/drivers/acpi/apei/Kconfig | |||
| @@ -14,7 +14,6 @@ config ACPI_APEI_GHES | |||
| 14 | depends on ACPI_APEI && X86 | 14 | depends on ACPI_APEI && X86 |
| 15 | select ACPI_HED | 15 | select ACPI_HED |
| 16 | select IRQ_WORK | 16 | select IRQ_WORK |
| 17 | select LLIST | ||
| 18 | select GENERIC_ALLOCATOR | 17 | select GENERIC_ALLOCATOR |
| 19 | help | 18 | help |
| 20 | Generic Hardware Error Source provides a way to report | 19 | Generic Hardware Error Source provides a way to report |
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index 4fa09d4d0b71..6a9e8f5399e2 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h | |||
| @@ -1,20 +1,23 @@ | |||
| 1 | #ifndef _LINUX_IRQ_WORK_H | 1 | #ifndef _LINUX_IRQ_WORK_H |
| 2 | #define _LINUX_IRQ_WORK_H | 2 | #define _LINUX_IRQ_WORK_H |
| 3 | 3 | ||
| 4 | #include <linux/llist.h> | ||
| 5 | |||
| 4 | struct irq_work { | 6 | struct irq_work { |
| 5 | struct irq_work *next; | 7 | unsigned long flags; |
| 8 | struct llist_node llnode; | ||
| 6 | void (*func)(struct irq_work *); | 9 | void (*func)(struct irq_work *); |
| 7 | }; | 10 | }; |
| 8 | 11 | ||
| 9 | static inline | 12 | static inline |
| 10 | void init_irq_work(struct irq_work *entry, void (*func)(struct irq_work *)) | 13 | void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *)) |
| 11 | { | 14 | { |
| 12 | entry->next = NULL; | 15 | work->flags = 0; |
| 13 | entry->func = func; | 16 | work->func = func; |
| 14 | } | 17 | } |
| 15 | 18 | ||
| 16 | bool irq_work_queue(struct irq_work *entry); | 19 | bool irq_work_queue(struct irq_work *work); |
| 17 | void irq_work_run(void); | 20 | void irq_work_run(void); |
| 18 | void irq_work_sync(struct irq_work *entry); | 21 | void irq_work_sync(struct irq_work *work); |
| 19 | 22 | ||
| 20 | #endif /* _LINUX_IRQ_WORK_H */ | 23 | #endif /* _LINUX_IRQ_WORK_H */ |
diff --git a/include/linux/llist.h b/include/linux/llist.h index aa0c8b5b3cd0..7287734e08d1 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h | |||
| @@ -35,10 +35,30 @@ | |||
| 35 | * | 35 | * |
| 36 | * The basic atomic operation of this list is cmpxchg on long. On | 36 | * The basic atomic operation of this list is cmpxchg on long. On |
| 37 | * architectures that don't have NMI-safe cmpxchg implementation, the | 37 | * architectures that don't have NMI-safe cmpxchg implementation, the |
| 38 | * list can NOT be used in NMI handler. So code uses the list in NMI | 38 | * list can NOT be used in NMI handlers. So code that uses the list in |
| 39 | * handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. | 39 | * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. |
| 40 | * | ||
| 41 | * Copyright 2010,2011 Intel Corp. | ||
| 42 | * Author: Huang Ying <ying.huang@intel.com> | ||
| 43 | * | ||
| 44 | * This program is free software; you can redistribute it and/or | ||
| 45 | * modify it under the terms of the GNU General Public License version | ||
| 46 | * 2 as published by the Free Software Foundation; | ||
| 47 | * | ||
| 48 | * This program is distributed in the hope that it will be useful, | ||
| 49 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 50 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 51 | * GNU General Public License for more details. | ||
| 52 | * | ||
| 53 | * You should have received a copy of the GNU General Public License | ||
| 54 | * along with this program; if not, write to the Free Software | ||
| 55 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
| 40 | */ | 56 | */ |
| 41 | 57 | ||
| 58 | #include <linux/kernel.h> | ||
| 59 | #include <asm/system.h> | ||
| 60 | #include <asm/processor.h> | ||
| 61 | |||
| 42 | struct llist_head { | 62 | struct llist_head { |
| 43 | struct llist_node *first; | 63 | struct llist_node *first; |
| 44 | }; | 64 | }; |
| @@ -113,14 +133,55 @@ static inline void init_llist_head(struct llist_head *list) | |||
| 113 | * test whether the list is empty without deleting something from the | 133 | * test whether the list is empty without deleting something from the |
| 114 | * list. | 134 | * list. |
| 115 | */ | 135 | */ |
| 116 | static inline int llist_empty(const struct llist_head *head) | 136 | static inline bool llist_empty(const struct llist_head *head) |
| 117 | { | 137 | { |
| 118 | return ACCESS_ONCE(head->first) == NULL; | 138 | return ACCESS_ONCE(head->first) == NULL; |
| 119 | } | 139 | } |
| 120 | 140 | ||
| 121 | void llist_add(struct llist_node *new, struct llist_head *head); | 141 | static inline struct llist_node *llist_next(struct llist_node *node) |
| 122 | void llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, | 142 | { |
| 123 | struct llist_head *head); | 143 | return node->next; |
| 124 | struct llist_node *llist_del_first(struct llist_head *head); | 144 | } |
| 125 | struct llist_node *llist_del_all(struct llist_head *head); | 145 | |
| 146 | /** | ||
| 147 | * llist_add - add a new entry | ||
| 148 | * @new: new entry to be added | ||
| 149 | * @head: the head for your lock-less list | ||
| 150 | * | ||
| 151 | * Return whether list is empty before adding. | ||
| 152 | */ | ||
| 153 | static inline bool llist_add(struct llist_node *new, struct llist_head *head) | ||
| 154 | { | ||
| 155 | struct llist_node *entry, *old_entry; | ||
| 156 | |||
| 157 | entry = head->first; | ||
| 158 | for (;;) { | ||
| 159 | old_entry = entry; | ||
| 160 | new->next = entry; | ||
| 161 | entry = cmpxchg(&head->first, old_entry, new); | ||
| 162 | if (entry == old_entry) | ||
| 163 | break; | ||
| 164 | } | ||
| 165 | |||
| 166 | return old_entry == NULL; | ||
| 167 | } | ||
| 168 | |||
| 169 | /** | ||
| 170 | * llist_del_all - delete all entries from lock-less list | ||
| 171 | * @head: the head of lock-less list to delete all entries | ||
| 172 | * | ||
| 173 | * If list is empty, return NULL, otherwise, delete all entries and | ||
| 174 | * return the pointer to the first entry. The order of entries | ||
| 175 | * deleted is from the newest to the oldest added one. | ||
| 176 | */ | ||
| 177 | static inline struct llist_node *llist_del_all(struct llist_head *head) | ||
| 178 | { | ||
| 179 | return xchg(&head->first, NULL); | ||
| 180 | } | ||
| 181 | |||
| 182 | extern bool llist_add_batch(struct llist_node *new_first, | ||
| 183 | struct llist_node *new_last, | ||
| 184 | struct llist_head *head); | ||
| 185 | extern struct llist_node *llist_del_first(struct llist_head *head); | ||
| 186 | |||
| 126 | #endif /* LLIST_H */ | 187 | #endif /* LLIST_H */ |
diff --git a/include/linux/sched.h b/include/linux/sched.h index ede8a6585e38..e8acce717d2a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
| @@ -90,6 +90,7 @@ struct sched_param { | |||
| 90 | #include <linux/task_io_accounting.h> | 90 | #include <linux/task_io_accounting.h> |
| 91 | #include <linux/latencytop.h> | 91 | #include <linux/latencytop.h> |
| 92 | #include <linux/cred.h> | 92 | #include <linux/cred.h> |
| 93 | #include <linux/llist.h> | ||
| 93 | 94 | ||
| 94 | #include <asm/processor.h> | 95 | #include <asm/processor.h> |
| 95 | 96 | ||
| @@ -1224,7 +1225,7 @@ struct task_struct { | |||
| 1224 | unsigned int ptrace; | 1225 | unsigned int ptrace; |
| 1225 | 1226 | ||
| 1226 | #ifdef CONFIG_SMP | 1227 | #ifdef CONFIG_SMP |
| 1227 | struct task_struct *wake_entry; | 1228 | struct llist_node wake_entry; |
| 1228 | int on_cpu; | 1229 | int on_cpu; |
| 1229 | #endif | 1230 | #endif |
| 1230 | int on_rq; | 1231 | int on_rq; |
| @@ -2035,6 +2036,10 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { } | |||
| 2035 | static inline void sched_autogroup_exit(struct signal_struct *sig) { } | 2036 | static inline void sched_autogroup_exit(struct signal_struct *sig) { } |
| 2036 | #endif | 2037 | #endif |
| 2037 | 2038 | ||
| 2039 | #ifdef CONFIG_CFS_BANDWIDTH | ||
| 2040 | extern unsigned int sysctl_sched_cfs_bandwidth_slice; | ||
| 2041 | #endif | ||
| 2042 | |||
| 2038 | #ifdef CONFIG_RT_MUTEXES | 2043 | #ifdef CONFIG_RT_MUTEXES |
| 2039 | extern int rt_mutex_getprio(struct task_struct *p); | 2044 | extern int rt_mutex_getprio(struct task_struct *p); |
| 2040 | extern void rt_mutex_setprio(struct task_struct *p, int prio); | 2045 | extern void rt_mutex_setprio(struct task_struct *p, int prio); |
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index f6334782a593..959ff18b63b6 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h | |||
| @@ -100,7 +100,7 @@ static inline long __trace_sched_switch_state(struct task_struct *p) | |||
| 100 | * For all intents and purposes a preempted task is a running task. | 100 | * For all intents and purposes a preempted task is a running task. |
| 101 | */ | 101 | */ |
| 102 | if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE) | 102 | if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE) |
| 103 | state = TASK_RUNNING; | 103 | state = TASK_RUNNING | TASK_STATE_MAX; |
| 104 | #endif | 104 | #endif |
| 105 | 105 | ||
| 106 | return state; | 106 | return state; |
| @@ -137,13 +137,14 @@ TRACE_EVENT(sched_switch, | |||
| 137 | __entry->next_prio = next->prio; | 137 | __entry->next_prio = next->prio; |
| 138 | ), | 138 | ), |
| 139 | 139 | ||
| 140 | TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s ==> next_comm=%s next_pid=%d next_prio=%d", | 140 | TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", |
| 141 | __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, | 141 | __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, |
| 142 | __entry->prev_state ? | 142 | __entry->prev_state & (TASK_STATE_MAX-1) ? |
| 143 | __print_flags(__entry->prev_state, "|", | 143 | __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|", |
| 144 | { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, | 144 | { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, |
| 145 | { 16, "Z" }, { 32, "X" }, { 64, "x" }, | 145 | { 16, "Z" }, { 32, "X" }, { 64, "x" }, |
| 146 | { 128, "W" }) : "R", | 146 | { 128, "W" }) : "R", |
| 147 | __entry->prev_state & TASK_STATE_MAX ? "+" : "", | ||
| 147 | __entry->next_comm, __entry->next_pid, __entry->next_prio) | 148 | __entry->next_comm, __entry->next_pid, __entry->next_prio) |
| 148 | ); | 149 | ); |
| 149 | 150 | ||
diff --git a/init/Kconfig b/init/Kconfig index dc7e27bf89a8..31ba0fd0f36b 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
| @@ -715,6 +715,18 @@ config FAIR_GROUP_SCHED | |||
| 715 | depends on CGROUP_SCHED | 715 | depends on CGROUP_SCHED |
| 716 | default CGROUP_SCHED | 716 | default CGROUP_SCHED |
| 717 | 717 | ||
| 718 | config CFS_BANDWIDTH | ||
| 719 | bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED" | ||
| 720 | depends on EXPERIMENTAL | ||
| 721 | depends on FAIR_GROUP_SCHED | ||
| 722 | default n | ||
| 723 | help | ||
| 724 | This option allows users to define CPU bandwidth rates (limits) for | ||
| 725 | tasks running within the fair group scheduler. Groups with no limit | ||
| 726 | set are considered to be unconstrained and will run with no | ||
| 727 | restriction. | ||
| 728 | See Documentation/scheduler/sched-bwc.txt for more information. | ||
| 729 | |||
| 718 | config RT_GROUP_SCHED | 730 | config RT_GROUP_SCHED |
| 719 | bool "Group scheduling for SCHED_RR/FIFO" | 731 | bool "Group scheduling for SCHED_RR/FIFO" |
| 720 | depends on EXPERIMENTAL | 732 | depends on EXPERIMENTAL |
diff --git a/kernel/irq_work.c b/kernel/irq_work.c index c58fa7da8aef..0e2cde4f380b 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c | |||
| @@ -17,54 +17,34 @@ | |||
| 17 | * claimed NULL, 3 -> {pending} : claimed to be enqueued | 17 | * claimed NULL, 3 -> {pending} : claimed to be enqueued |
| 18 | * pending next, 3 -> {busy} : queued, pending callback | 18 | * pending next, 3 -> {busy} : queued, pending callback |
| 19 | * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed | 19 | * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed |
| 20 | * | ||
| 21 | * We use the lower two bits of the next pointer to keep PENDING and BUSY | ||
| 22 | * flags. | ||
| 23 | */ | 20 | */ |
| 24 | 21 | ||
| 25 | #define IRQ_WORK_PENDING 1UL | 22 | #define IRQ_WORK_PENDING 1UL |
| 26 | #define IRQ_WORK_BUSY 2UL | 23 | #define IRQ_WORK_BUSY 2UL |
| 27 | #define IRQ_WORK_FLAGS 3UL | 24 | #define IRQ_WORK_FLAGS 3UL |
| 28 | 25 | ||
| 29 | static inline bool irq_work_is_set(struct irq_work *entry, int flags) | 26 | static DEFINE_PER_CPU(struct llist_head, irq_work_list); |
| 30 | { | ||
| 31 | return (unsigned long)entry->next & flags; | ||
| 32 | } | ||
| 33 | |||
| 34 | static inline struct irq_work *irq_work_next(struct irq_work *entry) | ||
| 35 | { | ||
| 36 | unsigned long next = (unsigned long)entry->next; | ||
| 37 | next &= ~IRQ_WORK_FLAGS; | ||
| 38 | return (struct irq_work *)next; | ||
| 39 | } | ||
| 40 | |||
| 41 | static inline struct irq_work *next_flags(struct irq_work *entry, int flags) | ||
| 42 | { | ||
| 43 | unsigned long next = (unsigned long)entry; | ||
| 44 | next |= flags; | ||
| 45 | return (struct irq_work *)next; | ||
| 46 | } | ||
| 47 | |||
| 48 | static DEFINE_PER_CPU(struct irq_work *, irq_work_list); | ||
| 49 | 27 | ||
| 50 | /* | 28 | /* |
| 51 | * Claim the entry so that no one else will poke at it. | 29 | * Claim the entry so that no one else will poke at it. |
| 52 | */ | 30 | */ |
| 53 | static bool irq_work_claim(struct irq_work *entry) | 31 | static bool irq_work_claim(struct irq_work *work) |
| 54 | { | 32 | { |
| 55 | struct irq_work *next, *nflags; | 33 | unsigned long flags, nflags; |
| 56 | 34 | ||
| 57 | do { | 35 | for (;;) { |
| 58 | next = entry->next; | 36 | flags = work->flags; |
| 59 | if ((unsigned long)next & IRQ_WORK_PENDING) | 37 | if (flags & IRQ_WORK_PENDING) |
| 60 | return false; | 38 | return false; |
| 61 | nflags = next_flags(next, IRQ_WORK_FLAGS); | 39 | nflags = flags | IRQ_WORK_FLAGS; |
| 62 | } while (cmpxchg(&entry->next, next, nflags) != next); | 40 | if (cmpxchg(&work->flags, flags, nflags) == flags) |
| 41 | break; | ||
| 42 | cpu_relax(); | ||
| 43 | } | ||
| 63 | 44 | ||
| 64 | return true; | 45 | return true; |
| 65 | } | 46 | } |
| 66 | 47 | ||
| 67 | |||
| 68 | void __weak arch_irq_work_raise(void) | 48 | void __weak arch_irq_work_raise(void) |
| 69 | { | 49 | { |
| 70 | /* | 50 | /* |
| @@ -75,20 +55,15 @@ void __weak arch_irq_work_raise(void) | |||
| 75 | /* | 55 | /* |
| 76 | * Queue the entry and raise the IPI if needed. | 56 | * Queue the entry and raise the IPI if needed. |
| 77 | */ | 57 | */ |
| 78 | static void __irq_work_queue(struct irq_work *entry) | 58 | static void __irq_work_queue(struct irq_work *work) |
| 79 | { | 59 | { |
| 80 | struct irq_work *next; | 60 | bool empty; |
| 81 | 61 | ||
| 82 | preempt_disable(); | 62 | preempt_disable(); |
| 83 | 63 | ||
| 84 | do { | 64 | empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); |
| 85 | next = __this_cpu_read(irq_work_list); | ||
| 86 | /* Can assign non-atomic because we keep the flags set. */ | ||
| 87 | entry->next = next_flags(next, IRQ_WORK_FLAGS); | ||
| 88 | } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next); | ||
| 89 | |||
| 90 | /* The list was empty, raise self-interrupt to start processing. */ | 65 | /* The list was empty, raise self-interrupt to start processing. */ |
| 91 | if (!irq_work_next(entry)) | 66 | if (empty) |
| 92 | arch_irq_work_raise(); | 67 | arch_irq_work_raise(); |
| 93 | 68 | ||
| 94 | preempt_enable(); | 69 | preempt_enable(); |
| @@ -100,16 +75,16 @@ static void __irq_work_queue(struct irq_work *entry) | |||
| 100 | * | 75 | * |
| 101 | * Can be re-enqueued while the callback is still in progress. | 76 | * Can be re-enqueued while the callback is still in progress. |
| 102 | */ | 77 | */ |
| 103 | bool irq_work_queue(struct irq_work *entry) | 78 | bool irq_work_queue(struct irq_work *work) |
| 104 | { | 79 | { |
| 105 | if (!irq_work_claim(entry)) { | 80 | if (!irq_work_claim(work)) { |
| 106 | /* | 81 | /* |
| 107 | * Already enqueued, can't do! | 82 | * Already enqueued, can't do! |
| 108 | */ | 83 | */ |
| 109 | return false; | 84 | return false; |
| 110 | } | 85 | } |
| 111 | 86 | ||
| 112 | __irq_work_queue(entry); | 87 | __irq_work_queue(work); |
| 113 | return true; | 88 | return true; |
| 114 | } | 89 | } |
| 115 | EXPORT_SYMBOL_GPL(irq_work_queue); | 90 | EXPORT_SYMBOL_GPL(irq_work_queue); |
| @@ -120,34 +95,34 @@ EXPORT_SYMBOL_GPL(irq_work_queue); | |||
| 120 | */ | 95 | */ |
| 121 | void irq_work_run(void) | 96 | void irq_work_run(void) |
| 122 | { | 97 | { |
| 123 | struct irq_work *list; | 98 | struct irq_work *work; |
| 99 | struct llist_head *this_list; | ||
| 100 | struct llist_node *llnode; | ||
| 124 | 101 | ||
| 125 | if (this_cpu_read(irq_work_list) == NULL) | 102 | this_list = &__get_cpu_var(irq_work_list); |
| 103 | if (llist_empty(this_list)) | ||
| 126 | return; | 104 | return; |
| 127 | 105 | ||
| 128 | BUG_ON(!in_irq()); | 106 | BUG_ON(!in_irq()); |
| 129 | BUG_ON(!irqs_disabled()); | 107 | BUG_ON(!irqs_disabled()); |
| 130 | 108 | ||
| 131 | list = this_cpu_xchg(irq_work_list, NULL); | 109 | llnode = llist_del_all(this_list); |
| 132 | 110 | while (llnode != NULL) { | |
| 133 | while (list != NULL) { | 111 | work = llist_entry(llnode, struct irq_work, llnode); |
| 134 | struct irq_work *entry = list; | ||
| 135 | 112 | ||
| 136 | list = irq_work_next(list); | 113 | llnode = llist_next(llnode); |
| 137 | 114 | ||
| 138 | /* | 115 | /* |
| 139 | * Clear the PENDING bit, after this point the @entry | 116 | * Clear the PENDING bit, after this point the @work |
| 140 | * can be re-used. | 117 | * can be re-used. |
| 141 | */ | 118 | */ |
| 142 | entry->next = next_flags(NULL, IRQ_WORK_BUSY); | 119 | work->flags = IRQ_WORK_BUSY; |
| 143 | entry->func(entry); | 120 | work->func(work); |
| 144 | /* | 121 | /* |
| 145 | * Clear the BUSY bit and return to the free state if | 122 | * Clear the BUSY bit and return to the free state if |
| 146 | * no-one else claimed it meanwhile. | 123 | * no-one else claimed it meanwhile. |
| 147 | */ | 124 | */ |
| 148 | (void)cmpxchg(&entry->next, | 125 | (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0); |
| 149 | next_flags(NULL, IRQ_WORK_BUSY), | ||
| 150 | NULL); | ||
| 151 | } | 126 | } |
| 152 | } | 127 | } |
| 153 | EXPORT_SYMBOL_GPL(irq_work_run); | 128 | EXPORT_SYMBOL_GPL(irq_work_run); |
| @@ -156,11 +131,11 @@ EXPORT_SYMBOL_GPL(irq_work_run); | |||
| 156 | * Synchronize against the irq_work @entry, ensures the entry is not | 131 | * Synchronize against the irq_work @entry, ensures the entry is not |
| 157 | * currently in use. | 132 | * currently in use. |
| 158 | */ | 133 | */ |
| 159 | void irq_work_sync(struct irq_work *entry) | 134 | void irq_work_sync(struct irq_work *work) |
| 160 | { | 135 | { |
| 161 | WARN_ON_ONCE(irqs_disabled()); | 136 | WARN_ON_ONCE(irqs_disabled()); |
| 162 | 137 | ||
| 163 | while (irq_work_is_set(entry, IRQ_WORK_BUSY)) | 138 | while (work->flags & IRQ_WORK_BUSY) |
| 164 | cpu_relax(); | 139 | cpu_relax(); |
| 165 | } | 140 | } |
| 166 | EXPORT_SYMBOL_GPL(irq_work_sync); | 141 | EXPORT_SYMBOL_GPL(irq_work_sync); |
diff --git a/kernel/sched.c b/kernel/sched.c index 03ad0113801a..d87c6e5d4e8c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -196,10 +196,28 @@ static inline int rt_bandwidth_enabled(void) | |||
| 196 | return sysctl_sched_rt_runtime >= 0; | 196 | return sysctl_sched_rt_runtime >= 0; |
| 197 | } | 197 | } |
| 198 | 198 | ||
| 199 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | 199 | static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) |
| 200 | { | 200 | { |
| 201 | ktime_t now; | 201 | unsigned long delta; |
| 202 | ktime_t soft, hard, now; | ||
| 203 | |||
| 204 | for (;;) { | ||
| 205 | if (hrtimer_active(period_timer)) | ||
| 206 | break; | ||
| 207 | |||
| 208 | now = hrtimer_cb_get_time(period_timer); | ||
| 209 | hrtimer_forward(period_timer, now, period); | ||
| 202 | 210 | ||
| 211 | soft = hrtimer_get_softexpires(period_timer); | ||
| 212 | hard = hrtimer_get_expires(period_timer); | ||
| 213 | delta = ktime_to_ns(ktime_sub(hard, soft)); | ||
| 214 | __hrtimer_start_range_ns(period_timer, soft, delta, | ||
| 215 | HRTIMER_MODE_ABS_PINNED, 0); | ||
| 216 | } | ||
| 217 | } | ||
| 218 | |||
| 219 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
| 220 | { | ||
| 203 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) | 221 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) |
| 204 | return; | 222 | return; |
| 205 | 223 | ||
| @@ -207,22 +225,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
| 207 | return; | 225 | return; |
| 208 | 226 | ||
| 209 | raw_spin_lock(&rt_b->rt_runtime_lock); | 227 | raw_spin_lock(&rt_b->rt_runtime_lock); |
| 210 | for (;;) { | 228 | start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); |
| 211 | unsigned long delta; | ||
| 212 | ktime_t soft, hard; | ||
| 213 | |||
| 214 | if (hrtimer_active(&rt_b->rt_period_timer)) | ||
| 215 | break; | ||
| 216 | |||
| 217 | now = hrtimer_cb_get_time(&rt_b->rt_period_timer); | ||
| 218 | hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); | ||
| 219 | |||
| 220 | soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); | ||
| 221 | hard = hrtimer_get_expires(&rt_b->rt_period_timer); | ||
| 222 | delta = ktime_to_ns(ktime_sub(hard, soft)); | ||
| 223 | __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, | ||
| 224 | HRTIMER_MODE_ABS_PINNED, 0); | ||
| 225 | } | ||
| 226 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 229 | raw_spin_unlock(&rt_b->rt_runtime_lock); |
| 227 | } | 230 | } |
| 228 | 231 | ||
| @@ -247,6 +250,24 @@ struct cfs_rq; | |||
| 247 | 250 | ||
| 248 | static LIST_HEAD(task_groups); | 251 | static LIST_HEAD(task_groups); |
| 249 | 252 | ||
| 253 | struct cfs_bandwidth { | ||
| 254 | #ifdef CONFIG_CFS_BANDWIDTH | ||
| 255 | raw_spinlock_t lock; | ||
| 256 | ktime_t period; | ||
| 257 | u64 quota, runtime; | ||
| 258 | s64 hierarchal_quota; | ||
| 259 | u64 runtime_expires; | ||
| 260 | |||
| 261 | int idle, timer_active; | ||
| 262 | struct hrtimer period_timer, slack_timer; | ||
| 263 | struct list_head throttled_cfs_rq; | ||
| 264 | |||
| 265 | /* statistics */ | ||
| 266 | int nr_periods, nr_throttled; | ||
| 267 | u64 throttled_time; | ||
| 268 | #endif | ||
| 269 | }; | ||
| 270 | |||
| 250 | /* task group related information */ | 271 | /* task group related information */ |
| 251 | struct task_group { | 272 | struct task_group { |
| 252 | struct cgroup_subsys_state css; | 273 | struct cgroup_subsys_state css; |
| @@ -278,6 +299,8 @@ struct task_group { | |||
| 278 | #ifdef CONFIG_SCHED_AUTOGROUP | 299 | #ifdef CONFIG_SCHED_AUTOGROUP |
| 279 | struct autogroup *autogroup; | 300 | struct autogroup *autogroup; |
| 280 | #endif | 301 | #endif |
| 302 | |||
| 303 | struct cfs_bandwidth cfs_bandwidth; | ||
| 281 | }; | 304 | }; |
| 282 | 305 | ||
| 283 | /* task_group_lock serializes the addition/removal of task groups */ | 306 | /* task_group_lock serializes the addition/removal of task groups */ |
| @@ -311,7 +334,7 @@ struct task_group root_task_group; | |||
| 311 | /* CFS-related fields in a runqueue */ | 334 | /* CFS-related fields in a runqueue */ |
| 312 | struct cfs_rq { | 335 | struct cfs_rq { |
| 313 | struct load_weight load; | 336 | struct load_weight load; |
| 314 | unsigned long nr_running; | 337 | unsigned long nr_running, h_nr_running; |
| 315 | 338 | ||
| 316 | u64 exec_clock; | 339 | u64 exec_clock; |
| 317 | u64 min_vruntime; | 340 | u64 min_vruntime; |
| @@ -377,9 +400,120 @@ struct cfs_rq { | |||
| 377 | 400 | ||
| 378 | unsigned long load_contribution; | 401 | unsigned long load_contribution; |
| 379 | #endif | 402 | #endif |
| 403 | #ifdef CONFIG_CFS_BANDWIDTH | ||
| 404 | int runtime_enabled; | ||
| 405 | u64 runtime_expires; | ||
| 406 | s64 runtime_remaining; | ||
| 407 | |||
| 408 | u64 throttled_timestamp; | ||
| 409 | int throttled, throttle_count; | ||
| 410 | struct list_head throttled_list; | ||
| 411 | #endif | ||
| 380 | #endif | 412 | #endif |
| 381 | }; | 413 | }; |
| 382 | 414 | ||
| 415 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 416 | #ifdef CONFIG_CFS_BANDWIDTH | ||
| 417 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
| 418 | { | ||
| 419 | return &tg->cfs_bandwidth; | ||
| 420 | } | ||
| 421 | |||
| 422 | static inline u64 default_cfs_period(void); | ||
| 423 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); | ||
| 424 | static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); | ||
| 425 | |||
| 426 | static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) | ||
| 427 | { | ||
| 428 | struct cfs_bandwidth *cfs_b = | ||
| 429 | container_of(timer, struct cfs_bandwidth, slack_timer); | ||
| 430 | do_sched_cfs_slack_timer(cfs_b); | ||
| 431 | |||
| 432 | return HRTIMER_NORESTART; | ||
| 433 | } | ||
| 434 | |||
| 435 | static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) | ||
| 436 | { | ||
| 437 | struct cfs_bandwidth *cfs_b = | ||
| 438 | container_of(timer, struct cfs_bandwidth, period_timer); | ||
| 439 | ktime_t now; | ||
| 440 | int overrun; | ||
| 441 | int idle = 0; | ||
| 442 | |||
| 443 | for (;;) { | ||
| 444 | now = hrtimer_cb_get_time(timer); | ||
| 445 | overrun = hrtimer_forward(timer, now, cfs_b->period); | ||
| 446 | |||
| 447 | if (!overrun) | ||
| 448 | break; | ||
| 449 | |||
| 450 | idle = do_sched_cfs_period_timer(cfs_b, overrun); | ||
| 451 | } | ||
| 452 | |||
| 453 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | ||
| 454 | } | ||
| 455 | |||
| 456 | static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
| 457 | { | ||
| 458 | raw_spin_lock_init(&cfs_b->lock); | ||
| 459 | cfs_b->runtime = 0; | ||
| 460 | cfs_b->quota = RUNTIME_INF; | ||
| 461 | cfs_b->period = ns_to_ktime(default_cfs_period()); | ||
| 462 | |||
| 463 | INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); | ||
| 464 | hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
| 465 | cfs_b->period_timer.function = sched_cfs_period_timer; | ||
| 466 | hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
| 467 | cfs_b->slack_timer.function = sched_cfs_slack_timer; | ||
| 468 | } | ||
| 469 | |||
| 470 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
| 471 | { | ||
| 472 | cfs_rq->runtime_enabled = 0; | ||
| 473 | INIT_LIST_HEAD(&cfs_rq->throttled_list); | ||
| 474 | } | ||
| 475 | |||
| 476 | /* requires cfs_b->lock, may release to reprogram timer */ | ||
| 477 | static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
| 478 | { | ||
| 479 | /* | ||
| 480 | * The timer may be active because we're trying to set a new bandwidth | ||
| 481 | * period or because we're racing with the tear-down path | ||
| 482 | * (timer_active==0 becomes visible before the hrtimer call-back | ||
| 483 | * terminates). In either case we ensure that it's re-programmed | ||
| 484 | */ | ||
| 485 | while (unlikely(hrtimer_active(&cfs_b->period_timer))) { | ||
| 486 | raw_spin_unlock(&cfs_b->lock); | ||
| 487 | /* ensure cfs_b->lock is available while we wait */ | ||
| 488 | hrtimer_cancel(&cfs_b->period_timer); | ||
| 489 | |||
| 490 | raw_spin_lock(&cfs_b->lock); | ||
| 491 | /* if someone else restarted the timer then we're done */ | ||
| 492 | if (cfs_b->timer_active) | ||
| 493 | return; | ||
| 494 | } | ||
| 495 | |||
| 496 | cfs_b->timer_active = 1; | ||
| 497 | start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); | ||
| 498 | } | ||
| 499 | |||
| 500 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
| 501 | { | ||
| 502 | hrtimer_cancel(&cfs_b->period_timer); | ||
| 503 | hrtimer_cancel(&cfs_b->slack_timer); | ||
| 504 | } | ||
| 505 | #else | ||
| 506 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | ||
| 507 | static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
| 508 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
| 509 | |||
| 510 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
| 511 | { | ||
| 512 | return NULL; | ||
| 513 | } | ||
| 514 | #endif /* CONFIG_CFS_BANDWIDTH */ | ||
| 515 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
| 516 | |||
| 383 | /* Real-Time classes' related field in a runqueue: */ | 517 | /* Real-Time classes' related field in a runqueue: */ |
| 384 | struct rt_rq { | 518 | struct rt_rq { |
| 385 | struct rt_prio_array active; | 519 | struct rt_prio_array active; |
| @@ -510,7 +644,7 @@ struct rq { | |||
| 510 | 644 | ||
| 511 | unsigned long cpu_power; | 645 | unsigned long cpu_power; |
| 512 | 646 | ||
| 513 | unsigned char idle_at_tick; | 647 | unsigned char idle_balance; |
| 514 | /* For active balancing */ | 648 | /* For active balancing */ |
| 515 | int post_schedule; | 649 | int post_schedule; |
| 516 | int active_balance; | 650 | int active_balance; |
| @@ -520,8 +654,6 @@ struct rq { | |||
| 520 | int cpu; | 654 | int cpu; |
| 521 | int online; | 655 | int online; |
| 522 | 656 | ||
| 523 | unsigned long avg_load_per_task; | ||
| 524 | |||
| 525 | u64 rt_avg; | 657 | u64 rt_avg; |
| 526 | u64 age_stamp; | 658 | u64 age_stamp; |
| 527 | u64 idle_stamp; | 659 | u64 idle_stamp; |
| @@ -570,7 +702,7 @@ struct rq { | |||
| 570 | #endif | 702 | #endif |
| 571 | 703 | ||
| 572 | #ifdef CONFIG_SMP | 704 | #ifdef CONFIG_SMP |
| 573 | struct task_struct *wake_list; | 705 | struct llist_head wake_list; |
| 574 | #endif | 706 | #endif |
| 575 | }; | 707 | }; |
| 576 | 708 | ||
| @@ -1272,6 +1404,18 @@ void wake_up_idle_cpu(int cpu) | |||
| 1272 | smp_send_reschedule(cpu); | 1404 | smp_send_reschedule(cpu); |
| 1273 | } | 1405 | } |
| 1274 | 1406 | ||
| 1407 | static inline bool got_nohz_idle_kick(void) | ||
| 1408 | { | ||
| 1409 | return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick; | ||
| 1410 | } | ||
| 1411 | |||
| 1412 | #else /* CONFIG_NO_HZ */ | ||
| 1413 | |||
| 1414 | static inline bool got_nohz_idle_kick(void) | ||
| 1415 | { | ||
| 1416 | return false; | ||
| 1417 | } | ||
| 1418 | |||
| 1275 | #endif /* CONFIG_NO_HZ */ | 1419 | #endif /* CONFIG_NO_HZ */ |
| 1276 | 1420 | ||
| 1277 | static u64 sched_avg_period(void) | 1421 | static u64 sched_avg_period(void) |
| @@ -1471,24 +1615,28 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) | |||
| 1471 | update_load_sub(&rq->load, load); | 1615 | update_load_sub(&rq->load, load); |
| 1472 | } | 1616 | } |
| 1473 | 1617 | ||
| 1474 | #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) | 1618 | #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ |
| 1619 | (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) | ||
| 1475 | typedef int (*tg_visitor)(struct task_group *, void *); | 1620 | typedef int (*tg_visitor)(struct task_group *, void *); |
| 1476 | 1621 | ||
| 1477 | /* | 1622 | /* |
| 1478 | * Iterate the full tree, calling @down when first entering a node and @up when | 1623 | * Iterate task_group tree rooted at *from, calling @down when first entering a |
| 1479 | * leaving it for the final time. | 1624 | * node and @up when leaving it for the final time. |
| 1625 | * | ||
| 1626 | * Caller must hold rcu_lock or sufficient equivalent. | ||
| 1480 | */ | 1627 | */ |
| 1481 | static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) | 1628 | static int walk_tg_tree_from(struct task_group *from, |
| 1629 | tg_visitor down, tg_visitor up, void *data) | ||
| 1482 | { | 1630 | { |
| 1483 | struct task_group *parent, *child; | 1631 | struct task_group *parent, *child; |
| 1484 | int ret; | 1632 | int ret; |
| 1485 | 1633 | ||
| 1486 | rcu_read_lock(); | 1634 | parent = from; |
| 1487 | parent = &root_task_group; | 1635 | |
| 1488 | down: | 1636 | down: |
| 1489 | ret = (*down)(parent, data); | 1637 | ret = (*down)(parent, data); |
| 1490 | if (ret) | 1638 | if (ret) |
| 1491 | goto out_unlock; | 1639 | goto out; |
| 1492 | list_for_each_entry_rcu(child, &parent->children, siblings) { | 1640 | list_for_each_entry_rcu(child, &parent->children, siblings) { |
| 1493 | parent = child; | 1641 | parent = child; |
| 1494 | goto down; | 1642 | goto down; |
| @@ -1497,19 +1645,29 @@ up: | |||
| 1497 | continue; | 1645 | continue; |
| 1498 | } | 1646 | } |
| 1499 | ret = (*up)(parent, data); | 1647 | ret = (*up)(parent, data); |
| 1500 | if (ret) | 1648 | if (ret || parent == from) |
| 1501 | goto out_unlock; | 1649 | goto out; |
| 1502 | 1650 | ||
| 1503 | child = parent; | 1651 | child = parent; |
| 1504 | parent = parent->parent; | 1652 | parent = parent->parent; |
| 1505 | if (parent) | 1653 | if (parent) |
| 1506 | goto up; | 1654 | goto up; |
| 1507 | out_unlock: | 1655 | out: |
| 1508 | rcu_read_unlock(); | ||
| 1509 | |||
| 1510 | return ret; | 1656 | return ret; |
| 1511 | } | 1657 | } |
| 1512 | 1658 | ||
| 1659 | /* | ||
| 1660 | * Iterate the full tree, calling @down when first entering a node and @up when | ||
| 1661 | * leaving it for the final time. | ||
| 1662 | * | ||
| 1663 | * Caller must hold rcu_lock or sufficient equivalent. | ||
| 1664 | */ | ||
| 1665 | |||
| 1666 | static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) | ||
| 1667 | { | ||
| 1668 | return walk_tg_tree_from(&root_task_group, down, up, data); | ||
| 1669 | } | ||
| 1670 | |||
| 1513 | static int tg_nop(struct task_group *tg, void *data) | 1671 | static int tg_nop(struct task_group *tg, void *data) |
| 1514 | { | 1672 | { |
| 1515 | return 0; | 1673 | return 0; |
| @@ -1569,11 +1727,9 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
| 1569 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); | 1727 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); |
| 1570 | 1728 | ||
| 1571 | if (nr_running) | 1729 | if (nr_running) |
| 1572 | rq->avg_load_per_task = rq->load.weight / nr_running; | 1730 | return rq->load.weight / nr_running; |
| 1573 | else | ||
| 1574 | rq->avg_load_per_task = 0; | ||
| 1575 | 1731 | ||
| 1576 | return rq->avg_load_per_task; | 1732 | return 0; |
| 1577 | } | 1733 | } |
| 1578 | 1734 | ||
| 1579 | #ifdef CONFIG_PREEMPT | 1735 | #ifdef CONFIG_PREEMPT |
| @@ -1806,7 +1962,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags) | |||
| 1806 | rq->nr_uninterruptible--; | 1962 | rq->nr_uninterruptible--; |
| 1807 | 1963 | ||
| 1808 | enqueue_task(rq, p, flags); | 1964 | enqueue_task(rq, p, flags); |
| 1809 | inc_nr_running(rq); | ||
| 1810 | } | 1965 | } |
| 1811 | 1966 | ||
| 1812 | /* | 1967 | /* |
| @@ -1818,7 +1973,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
| 1818 | rq->nr_uninterruptible++; | 1973 | rq->nr_uninterruptible++; |
| 1819 | 1974 | ||
| 1820 | dequeue_task(rq, p, flags); | 1975 | dequeue_task(rq, p, flags); |
| 1821 | dec_nr_running(rq); | ||
| 1822 | } | 1976 | } |
| 1823 | 1977 | ||
| 1824 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 1978 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
| @@ -2390,11 +2544,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
| 2390 | 2544 | ||
| 2391 | /* Look for allowed, online CPU in same node. */ | 2545 | /* Look for allowed, online CPU in same node. */ |
| 2392 | for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) | 2546 | for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) |
| 2393 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) | 2547 | if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) |
| 2394 | return dest_cpu; | 2548 | return dest_cpu; |
| 2395 | 2549 | ||
| 2396 | /* Any allowed, online CPU? */ | 2550 | /* Any allowed, online CPU? */ |
| 2397 | dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); | 2551 | dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); |
| 2398 | if (dest_cpu < nr_cpu_ids) | 2552 | if (dest_cpu < nr_cpu_ids) |
| 2399 | return dest_cpu; | 2553 | return dest_cpu; |
| 2400 | 2554 | ||
| @@ -2431,7 +2585,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) | |||
| 2431 | * [ this allows ->select_task() to simply return task_cpu(p) and | 2585 | * [ this allows ->select_task() to simply return task_cpu(p) and |
| 2432 | * not worry about this generic constraint ] | 2586 | * not worry about this generic constraint ] |
| 2433 | */ | 2587 | */ |
| 2434 | if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || | 2588 | if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || |
| 2435 | !cpu_online(cpu))) | 2589 | !cpu_online(cpu))) |
| 2436 | cpu = select_fallback_rq(task_cpu(p), p); | 2590 | cpu = select_fallback_rq(task_cpu(p), p); |
| 2437 | 2591 | ||
| @@ -2556,42 +2710,26 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) | |||
| 2556 | } | 2710 | } |
| 2557 | 2711 | ||
| 2558 | #ifdef CONFIG_SMP | 2712 | #ifdef CONFIG_SMP |
| 2559 | static void sched_ttwu_do_pending(struct task_struct *list) | 2713 | static void sched_ttwu_pending(void) |
| 2560 | { | 2714 | { |
| 2561 | struct rq *rq = this_rq(); | 2715 | struct rq *rq = this_rq(); |
| 2716 | struct llist_node *llist = llist_del_all(&rq->wake_list); | ||
| 2717 | struct task_struct *p; | ||
| 2562 | 2718 | ||
| 2563 | raw_spin_lock(&rq->lock); | 2719 | raw_spin_lock(&rq->lock); |
| 2564 | 2720 | ||
| 2565 | while (list) { | 2721 | while (llist) { |
| 2566 | struct task_struct *p = list; | 2722 | p = llist_entry(llist, struct task_struct, wake_entry); |
| 2567 | list = list->wake_entry; | 2723 | llist = llist_next(llist); |
| 2568 | ttwu_do_activate(rq, p, 0); | 2724 | ttwu_do_activate(rq, p, 0); |
| 2569 | } | 2725 | } |
| 2570 | 2726 | ||
| 2571 | raw_spin_unlock(&rq->lock); | 2727 | raw_spin_unlock(&rq->lock); |
| 2572 | } | 2728 | } |
| 2573 | 2729 | ||
| 2574 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 2575 | |||
| 2576 | static void sched_ttwu_pending(void) | ||
| 2577 | { | ||
| 2578 | struct rq *rq = this_rq(); | ||
| 2579 | struct task_struct *list = xchg(&rq->wake_list, NULL); | ||
| 2580 | |||
| 2581 | if (!list) | ||
| 2582 | return; | ||
| 2583 | |||
| 2584 | sched_ttwu_do_pending(list); | ||
| 2585 | } | ||
| 2586 | |||
| 2587 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
| 2588 | |||
| 2589 | void scheduler_ipi(void) | 2730 | void scheduler_ipi(void) |
| 2590 | { | 2731 | { |
| 2591 | struct rq *rq = this_rq(); | 2732 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) |
| 2592 | struct task_struct *list = xchg(&rq->wake_list, NULL); | ||
| 2593 | |||
| 2594 | if (!list) | ||
| 2595 | return; | 2733 | return; |
| 2596 | 2734 | ||
| 2597 | /* | 2735 | /* |
| @@ -2608,25 +2746,21 @@ void scheduler_ipi(void) | |||
| 2608 | * somewhat pessimize the simple resched case. | 2746 | * somewhat pessimize the simple resched case. |
| 2609 | */ | 2747 | */ |
| 2610 | irq_enter(); | 2748 | irq_enter(); |
| 2611 | sched_ttwu_do_pending(list); | 2749 | sched_ttwu_pending(); |
| 2750 | |||
| 2751 | /* | ||
| 2752 | * Check if someone kicked us for doing the nohz idle load balance. | ||
| 2753 | */ | ||
| 2754 | if (unlikely(got_nohz_idle_kick() && !need_resched())) { | ||
| 2755 | this_rq()->idle_balance = 1; | ||
| 2756 | raise_softirq_irqoff(SCHED_SOFTIRQ); | ||
| 2757 | } | ||
| 2612 | irq_exit(); | 2758 | irq_exit(); |
| 2613 | } | 2759 | } |
| 2614 | 2760 | ||
| 2615 | static void ttwu_queue_remote(struct task_struct *p, int cpu) | 2761 | static void ttwu_queue_remote(struct task_struct *p, int cpu) |
| 2616 | { | 2762 | { |
| 2617 | struct rq *rq = cpu_rq(cpu); | 2763 | if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) |
| 2618 | struct task_struct *next = rq->wake_list; | ||
| 2619 | |||
| 2620 | for (;;) { | ||
| 2621 | struct task_struct *old = next; | ||
| 2622 | |||
| 2623 | p->wake_entry = next; | ||
| 2624 | next = cmpxchg(&rq->wake_list, old, p); | ||
| 2625 | if (next == old) | ||
| 2626 | break; | ||
| 2627 | } | ||
| 2628 | |||
| 2629 | if (!next) | ||
| 2630 | smp_send_reschedule(cpu); | 2764 | smp_send_reschedule(cpu); |
| 2631 | } | 2765 | } |
| 2632 | 2766 | ||
| @@ -2848,19 +2982,23 @@ void sched_fork(struct task_struct *p) | |||
| 2848 | p->state = TASK_RUNNING; | 2982 | p->state = TASK_RUNNING; |
| 2849 | 2983 | ||
| 2850 | /* | 2984 | /* |
| 2985 | * Make sure we do not leak PI boosting priority to the child. | ||
| 2986 | */ | ||
| 2987 | p->prio = current->normal_prio; | ||
| 2988 | |||
| 2989 | /* | ||
| 2851 | * Revert to default priority/policy on fork if requested. | 2990 | * Revert to default priority/policy on fork if requested. |
| 2852 | */ | 2991 | */ |
| 2853 | if (unlikely(p->sched_reset_on_fork)) { | 2992 | if (unlikely(p->sched_reset_on_fork)) { |
| 2854 | if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { | 2993 | if (task_has_rt_policy(p)) { |
| 2855 | p->policy = SCHED_NORMAL; | 2994 | p->policy = SCHED_NORMAL; |
| 2856 | p->normal_prio = p->static_prio; | ||
| 2857 | } | ||
| 2858 | |||
| 2859 | if (PRIO_TO_NICE(p->static_prio) < 0) { | ||
| 2860 | p->static_prio = NICE_TO_PRIO(0); | 2995 | p->static_prio = NICE_TO_PRIO(0); |
| 2861 | p->normal_prio = p->static_prio; | 2996 | p->rt_priority = 0; |
| 2862 | set_load_weight(p); | 2997 | } else if (PRIO_TO_NICE(p->static_prio) < 0) |
| 2863 | } | 2998 | p->static_prio = NICE_TO_PRIO(0); |
| 2999 | |||
| 3000 | p->prio = p->normal_prio = __normal_prio(p); | ||
| 3001 | set_load_weight(p); | ||
| 2864 | 3002 | ||
| 2865 | /* | 3003 | /* |
| 2866 | * We don't need the reset flag anymore after the fork. It has | 3004 | * We don't need the reset flag anymore after the fork. It has |
| @@ -2869,11 +3007,6 @@ void sched_fork(struct task_struct *p) | |||
| 2869 | p->sched_reset_on_fork = 0; | 3007 | p->sched_reset_on_fork = 0; |
| 2870 | } | 3008 | } |
| 2871 | 3009 | ||
| 2872 | /* | ||
| 2873 | * Make sure we do not leak PI boosting priority to the child. | ||
| 2874 | */ | ||
| 2875 | p->prio = current->normal_prio; | ||
| 2876 | |||
| 2877 | if (!rt_prio(p->prio)) | 3010 | if (!rt_prio(p->prio)) |
| 2878 | p->sched_class = &fair_sched_class; | 3011 | p->sched_class = &fair_sched_class; |
| 2879 | 3012 | ||
| @@ -4116,7 +4249,7 @@ void scheduler_tick(void) | |||
| 4116 | perf_event_task_tick(); | 4249 | perf_event_task_tick(); |
| 4117 | 4250 | ||
| 4118 | #ifdef CONFIG_SMP | 4251 | #ifdef CONFIG_SMP |
| 4119 | rq->idle_at_tick = idle_cpu(cpu); | 4252 | rq->idle_balance = idle_cpu(cpu); |
| 4120 | trigger_load_balance(rq, cpu); | 4253 | trigger_load_balance(rq, cpu); |
| 4121 | #endif | 4254 | #endif |
| 4122 | } | 4255 | } |
| @@ -4240,7 +4373,7 @@ pick_next_task(struct rq *rq) | |||
| 4240 | * Optimization: we know that if all tasks are in | 4373 | * Optimization: we know that if all tasks are in |
| 4241 | * the fair class we can call that function directly: | 4374 | * the fair class we can call that function directly: |
| 4242 | */ | 4375 | */ |
| 4243 | if (likely(rq->nr_running == rq->cfs.nr_running)) { | 4376 | if (likely(rq->nr_running == rq->cfs.h_nr_running)) { |
| 4244 | p = fair_sched_class.pick_next_task(rq); | 4377 | p = fair_sched_class.pick_next_task(rq); |
| 4245 | if (likely(p)) | 4378 | if (likely(p)) |
| 4246 | return p; | 4379 | return p; |
| @@ -5026,7 +5159,20 @@ EXPORT_SYMBOL(task_nice); | |||
| 5026 | */ | 5159 | */ |
| 5027 | int idle_cpu(int cpu) | 5160 | int idle_cpu(int cpu) |
| 5028 | { | 5161 | { |
| 5029 | return cpu_curr(cpu) == cpu_rq(cpu)->idle; | 5162 | struct rq *rq = cpu_rq(cpu); |
| 5163 | |||
| 5164 | if (rq->curr != rq->idle) | ||
| 5165 | return 0; | ||
| 5166 | |||
| 5167 | if (rq->nr_running) | ||
| 5168 | return 0; | ||
| 5169 | |||
| 5170 | #ifdef CONFIG_SMP | ||
| 5171 | if (!llist_empty(&rq->wake_list)) | ||
| 5172 | return 0; | ||
| 5173 | #endif | ||
| 5174 | |||
| 5175 | return 1; | ||
| 5030 | } | 5176 | } |
| 5031 | 5177 | ||
| 5032 | /** | 5178 | /** |
| @@ -5876,7 +6022,7 @@ void show_state_filter(unsigned long state_filter) | |||
| 5876 | printk(KERN_INFO | 6022 | printk(KERN_INFO |
| 5877 | " task PC stack pid father\n"); | 6023 | " task PC stack pid father\n"); |
| 5878 | #endif | 6024 | #endif |
| 5879 | read_lock(&tasklist_lock); | 6025 | rcu_read_lock(); |
| 5880 | do_each_thread(g, p) { | 6026 | do_each_thread(g, p) { |
| 5881 | /* | 6027 | /* |
| 5882 | * reset the NMI-timeout, listing all files on a slow | 6028 | * reset the NMI-timeout, listing all files on a slow |
| @@ -5892,7 +6038,7 @@ void show_state_filter(unsigned long state_filter) | |||
| 5892 | #ifdef CONFIG_SCHED_DEBUG | 6038 | #ifdef CONFIG_SCHED_DEBUG |
| 5893 | sysrq_sched_debug_show(); | 6039 | sysrq_sched_debug_show(); |
| 5894 | #endif | 6040 | #endif |
| 5895 | read_unlock(&tasklist_lock); | 6041 | rcu_read_unlock(); |
| 5896 | /* | 6042 | /* |
| 5897 | * Only show locks if all tasks are dumped: | 6043 | * Only show locks if all tasks are dumped: |
| 5898 | */ | 6044 | */ |
| @@ -6007,10 +6153,9 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | |||
| 6007 | { | 6153 | { |
| 6008 | if (p->sched_class && p->sched_class->set_cpus_allowed) | 6154 | if (p->sched_class && p->sched_class->set_cpus_allowed) |
| 6009 | p->sched_class->set_cpus_allowed(p, new_mask); | 6155 | p->sched_class->set_cpus_allowed(p, new_mask); |
| 6010 | else { | 6156 | |
| 6011 | cpumask_copy(&p->cpus_allowed, new_mask); | 6157 | cpumask_copy(&p->cpus_allowed, new_mask); |
| 6012 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); | 6158 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); |
| 6013 | } | ||
| 6014 | } | 6159 | } |
| 6015 | 6160 | ||
| 6016 | /* | 6161 | /* |
| @@ -6108,7 +6253,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
| 6108 | if (task_cpu(p) != src_cpu) | 6253 | if (task_cpu(p) != src_cpu) |
| 6109 | goto done; | 6254 | goto done; |
| 6110 | /* Affinity changed (again). */ | 6255 | /* Affinity changed (again). */ |
| 6111 | if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) | 6256 | if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) |
| 6112 | goto fail; | 6257 | goto fail; |
| 6113 | 6258 | ||
| 6114 | /* | 6259 | /* |
| @@ -6189,6 +6334,30 @@ static void calc_global_load_remove(struct rq *rq) | |||
| 6189 | rq->calc_load_active = 0; | 6334 | rq->calc_load_active = 0; |
| 6190 | } | 6335 | } |
| 6191 | 6336 | ||
| 6337 | #ifdef CONFIG_CFS_BANDWIDTH | ||
| 6338 | static void unthrottle_offline_cfs_rqs(struct rq *rq) | ||
| 6339 | { | ||
| 6340 | struct cfs_rq *cfs_rq; | ||
| 6341 | |||
| 6342 | for_each_leaf_cfs_rq(rq, cfs_rq) { | ||
| 6343 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
| 6344 | |||
| 6345 | if (!cfs_rq->runtime_enabled) | ||
| 6346 | continue; | ||
| 6347 | |||
| 6348 | /* | ||
| 6349 | * clock_task is not advancing so we just need to make sure | ||
| 6350 | * there's some valid quota amount | ||
| 6351 | */ | ||
| 6352 | cfs_rq->runtime_remaining = cfs_b->quota; | ||
| 6353 | if (cfs_rq_throttled(cfs_rq)) | ||
| 6354 | unthrottle_cfs_rq(cfs_rq); | ||
| 6355 | } | ||
| 6356 | } | ||
| 6357 | #else | ||
| 6358 | static void unthrottle_offline_cfs_rqs(struct rq *rq) {} | ||
| 6359 | #endif | ||
| 6360 | |||
| 6192 | /* | 6361 | /* |
| 6193 | * Migrate all tasks from the rq, sleeping tasks will be migrated by | 6362 | * Migrate all tasks from the rq, sleeping tasks will be migrated by |
| 6194 | * try_to_wake_up()->select_task_rq(). | 6363 | * try_to_wake_up()->select_task_rq(). |
| @@ -6214,6 +6383,9 @@ static void migrate_tasks(unsigned int dead_cpu) | |||
| 6214 | */ | 6383 | */ |
| 6215 | rq->stop = NULL; | 6384 | rq->stop = NULL; |
| 6216 | 6385 | ||
| 6386 | /* Ensure any throttled groups are reachable by pick_next_task */ | ||
| 6387 | unthrottle_offline_cfs_rqs(rq); | ||
| 6388 | |||
| 6217 | for ( ; ; ) { | 6389 | for ( ; ; ) { |
| 6218 | /* | 6390 | /* |
| 6219 | * There's this thread running, bail when that's the only | 6391 | * There's this thread running, bail when that's the only |
| @@ -7957,6 +8129,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
| 7957 | /* allow initial update_cfs_load() to truncate */ | 8129 | /* allow initial update_cfs_load() to truncate */ |
| 7958 | cfs_rq->load_stamp = 1; | 8130 | cfs_rq->load_stamp = 1; |
| 7959 | #endif | 8131 | #endif |
| 8132 | init_cfs_rq_runtime(cfs_rq); | ||
| 7960 | 8133 | ||
| 7961 | tg->cfs_rq[cpu] = cfs_rq; | 8134 | tg->cfs_rq[cpu] = cfs_rq; |
| 7962 | tg->se[cpu] = se; | 8135 | tg->se[cpu] = se; |
| @@ -8096,6 +8269,7 @@ void __init sched_init(void) | |||
| 8096 | * We achieve this by letting root_task_group's tasks sit | 8269 | * We achieve this by letting root_task_group's tasks sit |
| 8097 | * directly in rq->cfs (i.e root_task_group->se[] = NULL). | 8270 | * directly in rq->cfs (i.e root_task_group->se[] = NULL). |
| 8098 | */ | 8271 | */ |
| 8272 | init_cfs_bandwidth(&root_task_group.cfs_bandwidth); | ||
| 8099 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); | 8273 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); |
| 8100 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 8274 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 8101 | 8275 | ||
| @@ -8125,7 +8299,6 @@ void __init sched_init(void) | |||
| 8125 | rq_attach_root(rq, &def_root_domain); | 8299 | rq_attach_root(rq, &def_root_domain); |
| 8126 | #ifdef CONFIG_NO_HZ | 8300 | #ifdef CONFIG_NO_HZ |
| 8127 | rq->nohz_balance_kick = 0; | 8301 | rq->nohz_balance_kick = 0; |
| 8128 | init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); | ||
| 8129 | #endif | 8302 | #endif |
| 8130 | #endif | 8303 | #endif |
| 8131 | init_rq_hrtick(rq); | 8304 | init_rq_hrtick(rq); |
| @@ -8336,6 +8509,8 @@ static void free_fair_sched_group(struct task_group *tg) | |||
| 8336 | { | 8509 | { |
| 8337 | int i; | 8510 | int i; |
| 8338 | 8511 | ||
| 8512 | destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
| 8513 | |||
| 8339 | for_each_possible_cpu(i) { | 8514 | for_each_possible_cpu(i) { |
| 8340 | if (tg->cfs_rq) | 8515 | if (tg->cfs_rq) |
| 8341 | kfree(tg->cfs_rq[i]); | 8516 | kfree(tg->cfs_rq[i]); |
| @@ -8363,6 +8538,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
| 8363 | 8538 | ||
| 8364 | tg->shares = NICE_0_LOAD; | 8539 | tg->shares = NICE_0_LOAD; |
| 8365 | 8540 | ||
| 8541 | init_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
| 8542 | |||
| 8366 | for_each_possible_cpu(i) { | 8543 | for_each_possible_cpu(i) { |
| 8367 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | 8544 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), |
| 8368 | GFP_KERNEL, cpu_to_node(i)); | 8545 | GFP_KERNEL, cpu_to_node(i)); |
| @@ -8638,12 +8815,7 @@ unsigned long sched_group_shares(struct task_group *tg) | |||
| 8638 | } | 8815 | } |
| 8639 | #endif | 8816 | #endif |
| 8640 | 8817 | ||
| 8641 | #ifdef CONFIG_RT_GROUP_SCHED | 8818 | #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) |
| 8642 | /* | ||
| 8643 | * Ensure that the real time constraints are schedulable. | ||
| 8644 | */ | ||
| 8645 | static DEFINE_MUTEX(rt_constraints_mutex); | ||
| 8646 | |||
| 8647 | static unsigned long to_ratio(u64 period, u64 runtime) | 8819 | static unsigned long to_ratio(u64 period, u64 runtime) |
| 8648 | { | 8820 | { |
| 8649 | if (runtime == RUNTIME_INF) | 8821 | if (runtime == RUNTIME_INF) |
| @@ -8651,6 +8823,13 @@ static unsigned long to_ratio(u64 period, u64 runtime) | |||
| 8651 | 8823 | ||
| 8652 | return div64_u64(runtime << 20, period); | 8824 | return div64_u64(runtime << 20, period); |
| 8653 | } | 8825 | } |
| 8826 | #endif | ||
| 8827 | |||
| 8828 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 8829 | /* | ||
| 8830 | * Ensure that the real time constraints are schedulable. | ||
| 8831 | */ | ||
| 8832 | static DEFINE_MUTEX(rt_constraints_mutex); | ||
| 8654 | 8833 | ||
| 8655 | /* Must be called with tasklist_lock held */ | 8834 | /* Must be called with tasklist_lock held */ |
| 8656 | static inline int tg_has_rt_tasks(struct task_group *tg) | 8835 | static inline int tg_has_rt_tasks(struct task_group *tg) |
| @@ -8671,7 +8850,7 @@ struct rt_schedulable_data { | |||
| 8671 | u64 rt_runtime; | 8850 | u64 rt_runtime; |
| 8672 | }; | 8851 | }; |
| 8673 | 8852 | ||
| 8674 | static int tg_schedulable(struct task_group *tg, void *data) | 8853 | static int tg_rt_schedulable(struct task_group *tg, void *data) |
| 8675 | { | 8854 | { |
| 8676 | struct rt_schedulable_data *d = data; | 8855 | struct rt_schedulable_data *d = data; |
| 8677 | struct task_group *child; | 8856 | struct task_group *child; |
| @@ -8729,16 +8908,22 @@ static int tg_schedulable(struct task_group *tg, void *data) | |||
| 8729 | 8908 | ||
| 8730 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | 8909 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) |
| 8731 | { | 8910 | { |
| 8911 | int ret; | ||
| 8912 | |||
| 8732 | struct rt_schedulable_data data = { | 8913 | struct rt_schedulable_data data = { |
| 8733 | .tg = tg, | 8914 | .tg = tg, |
| 8734 | .rt_period = period, | 8915 | .rt_period = period, |
| 8735 | .rt_runtime = runtime, | 8916 | .rt_runtime = runtime, |
| 8736 | }; | 8917 | }; |
| 8737 | 8918 | ||
| 8738 | return walk_tg_tree(tg_schedulable, tg_nop, &data); | 8919 | rcu_read_lock(); |
| 8920 | ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); | ||
| 8921 | rcu_read_unlock(); | ||
| 8922 | |||
| 8923 | return ret; | ||
| 8739 | } | 8924 | } |
| 8740 | 8925 | ||
| 8741 | static int tg_set_bandwidth(struct task_group *tg, | 8926 | static int tg_set_rt_bandwidth(struct task_group *tg, |
| 8742 | u64 rt_period, u64 rt_runtime) | 8927 | u64 rt_period, u64 rt_runtime) |
| 8743 | { | 8928 | { |
| 8744 | int i, err = 0; | 8929 | int i, err = 0; |
| @@ -8777,7 +8962,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | |||
| 8777 | if (rt_runtime_us < 0) | 8962 | if (rt_runtime_us < 0) |
| 8778 | rt_runtime = RUNTIME_INF; | 8963 | rt_runtime = RUNTIME_INF; |
| 8779 | 8964 | ||
| 8780 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | 8965 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); |
| 8781 | } | 8966 | } |
| 8782 | 8967 | ||
| 8783 | long sched_group_rt_runtime(struct task_group *tg) | 8968 | long sched_group_rt_runtime(struct task_group *tg) |
| @@ -8802,7 +8987,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) | |||
| 8802 | if (rt_period == 0) | 8987 | if (rt_period == 0) |
| 8803 | return -EINVAL; | 8988 | return -EINVAL; |
| 8804 | 8989 | ||
| 8805 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | 8990 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); |
| 8806 | } | 8991 | } |
| 8807 | 8992 | ||
| 8808 | long sched_group_rt_period(struct task_group *tg) | 8993 | long sched_group_rt_period(struct task_group *tg) |
| @@ -8992,6 +9177,238 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) | |||
| 8992 | 9177 | ||
| 8993 | return (u64) scale_load_down(tg->shares); | 9178 | return (u64) scale_load_down(tg->shares); |
| 8994 | } | 9179 | } |
| 9180 | |||
| 9181 | #ifdef CONFIG_CFS_BANDWIDTH | ||
| 9182 | static DEFINE_MUTEX(cfs_constraints_mutex); | ||
| 9183 | |||
| 9184 | const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ | ||
| 9185 | const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ | ||
| 9186 | |||
| 9187 | static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); | ||
| 9188 | |||
| 9189 | static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | ||
| 9190 | { | ||
| 9191 | int i, ret = 0, runtime_enabled; | ||
| 9192 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | ||
| 9193 | |||
| 9194 | if (tg == &root_task_group) | ||
| 9195 | return -EINVAL; | ||
| 9196 | |||
| 9197 | /* | ||
| 9198 | * Ensure we have at least some amount of bandwidth every period. This is | ||
| 9199 | * to prevent reaching a state of large arrears when throttled via | ||
| 9200 | * entity_tick() resulting in prolonged exit starvation. | ||
| 9201 | */ | ||
| 9202 | if (quota < min_cfs_quota_period || period < min_cfs_quota_period) | ||
| 9203 | return -EINVAL; | ||
| 9204 | |||
| 9205 | /* | ||
| 9206 | * Likewise, bound things on the other side by preventing insane quota | ||
| 9207 | * periods. This also allows us to normalize in computing quota | ||
| 9208 | * feasibility. | ||
| 9209 | */ | ||
| 9210 | if (period > max_cfs_quota_period) | ||
| 9211 | return -EINVAL; | ||
| 9212 | |||
| 9213 | mutex_lock(&cfs_constraints_mutex); | ||
| 9214 | ret = __cfs_schedulable(tg, period, quota); | ||
| 9215 | if (ret) | ||
| 9216 | goto out_unlock; | ||
| 9217 | |||
| 9218 | runtime_enabled = quota != RUNTIME_INF; | ||
| 9219 | raw_spin_lock_irq(&cfs_b->lock); | ||
| 9220 | cfs_b->period = ns_to_ktime(period); | ||
| 9221 | cfs_b->quota = quota; | ||
| 9222 | |||
| 9223 | __refill_cfs_bandwidth_runtime(cfs_b); | ||
| 9224 | /* restart the period timer (if active) to handle new period expiry */ | ||
| 9225 | if (runtime_enabled && cfs_b->timer_active) { | ||
| 9226 | /* force a reprogram */ | ||
| 9227 | cfs_b->timer_active = 0; | ||
| 9228 | __start_cfs_bandwidth(cfs_b); | ||
| 9229 | } | ||
| 9230 | raw_spin_unlock_irq(&cfs_b->lock); | ||
| 9231 | |||
| 9232 | for_each_possible_cpu(i) { | ||
| 9233 | struct cfs_rq *cfs_rq = tg->cfs_rq[i]; | ||
| 9234 | struct rq *rq = rq_of(cfs_rq); | ||
| 9235 | |||
| 9236 | raw_spin_lock_irq(&rq->lock); | ||
| 9237 | cfs_rq->runtime_enabled = runtime_enabled; | ||
| 9238 | cfs_rq->runtime_remaining = 0; | ||
| 9239 | |||
| 9240 | if (cfs_rq_throttled(cfs_rq)) | ||
| 9241 | unthrottle_cfs_rq(cfs_rq); | ||
| 9242 | raw_spin_unlock_irq(&rq->lock); | ||
| 9243 | } | ||
| 9244 | out_unlock: | ||
| 9245 | mutex_unlock(&cfs_constraints_mutex); | ||
| 9246 | |||
| 9247 | return ret; | ||
| 9248 | } | ||
| 9249 | |||
| 9250 | int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) | ||
| 9251 | { | ||
| 9252 | u64 quota, period; | ||
| 9253 | |||
| 9254 | period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); | ||
| 9255 | if (cfs_quota_us < 0) | ||
| 9256 | quota = RUNTIME_INF; | ||
| 9257 | else | ||
| 9258 | quota = (u64)cfs_quota_us * NSEC_PER_USEC; | ||
| 9259 | |||
| 9260 | return tg_set_cfs_bandwidth(tg, period, quota); | ||
| 9261 | } | ||
| 9262 | |||
| 9263 | long tg_get_cfs_quota(struct task_group *tg) | ||
| 9264 | { | ||
| 9265 | u64 quota_us; | ||
| 9266 | |||
| 9267 | if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) | ||
| 9268 | return -1; | ||
| 9269 | |||
| 9270 | quota_us = tg_cfs_bandwidth(tg)->quota; | ||
| 9271 | do_div(quota_us, NSEC_PER_USEC); | ||
| 9272 | |||
| 9273 | return quota_us; | ||
| 9274 | } | ||
| 9275 | |||
| 9276 | int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) | ||
| 9277 | { | ||
| 9278 | u64 quota, period; | ||
| 9279 | |||
| 9280 | period = (u64)cfs_period_us * NSEC_PER_USEC; | ||
| 9281 | quota = tg_cfs_bandwidth(tg)->quota; | ||
| 9282 | |||
| 9283 | if (period <= 0) | ||
| 9284 | return -EINVAL; | ||
| 9285 | |||
| 9286 | return tg_set_cfs_bandwidth(tg, period, quota); | ||
| 9287 | } | ||
| 9288 | |||
| 9289 | long tg_get_cfs_period(struct task_group *tg) | ||
| 9290 | { | ||
| 9291 | u64 cfs_period_us; | ||
| 9292 | |||
| 9293 | cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); | ||
| 9294 | do_div(cfs_period_us, NSEC_PER_USEC); | ||
| 9295 | |||
| 9296 | return cfs_period_us; | ||
| 9297 | } | ||
| 9298 | |||
| 9299 | static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) | ||
| 9300 | { | ||
| 9301 | return tg_get_cfs_quota(cgroup_tg(cgrp)); | ||
| 9302 | } | ||
| 9303 | |||
| 9304 | static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, | ||
| 9305 | s64 cfs_quota_us) | ||
| 9306 | { | ||
| 9307 | return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); | ||
| 9308 | } | ||
| 9309 | |||
| 9310 | static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) | ||
| 9311 | { | ||
| 9312 | return tg_get_cfs_period(cgroup_tg(cgrp)); | ||
| 9313 | } | ||
| 9314 | |||
| 9315 | static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, | ||
| 9316 | u64 cfs_period_us) | ||
| 9317 | { | ||
| 9318 | return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); | ||
| 9319 | } | ||
| 9320 | |||
| 9321 | struct cfs_schedulable_data { | ||
| 9322 | struct task_group *tg; | ||
| 9323 | u64 period, quota; | ||
| 9324 | }; | ||
| 9325 | |||
| 9326 | /* | ||
| 9327 | * normalize group quota/period to be quota/max_period | ||
| 9328 | * note: units are usecs | ||
| 9329 | */ | ||
| 9330 | static u64 normalize_cfs_quota(struct task_group *tg, | ||
| 9331 | struct cfs_schedulable_data *d) | ||
| 9332 | { | ||
| 9333 | u64 quota, period; | ||
| 9334 | |||
| 9335 | if (tg == d->tg) { | ||
| 9336 | period = d->period; | ||
| 9337 | quota = d->quota; | ||
| 9338 | } else { | ||
| 9339 | period = tg_get_cfs_period(tg); | ||
| 9340 | quota = tg_get_cfs_quota(tg); | ||
| 9341 | } | ||
| 9342 | |||
| 9343 | /* note: these should typically be equivalent */ | ||
| 9344 | if (quota == RUNTIME_INF || quota == -1) | ||
| 9345 | return RUNTIME_INF; | ||
| 9346 | |||
| 9347 | return to_ratio(period, quota); | ||
| 9348 | } | ||
| 9349 | |||
| 9350 | static int tg_cfs_schedulable_down(struct task_group *tg, void *data) | ||
| 9351 | { | ||
| 9352 | struct cfs_schedulable_data *d = data; | ||
| 9353 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | ||
| 9354 | s64 quota = 0, parent_quota = -1; | ||
| 9355 | |||
| 9356 | if (!tg->parent) { | ||
| 9357 | quota = RUNTIME_INF; | ||
| 9358 | } else { | ||
| 9359 | struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); | ||
| 9360 | |||
| 9361 | quota = normalize_cfs_quota(tg, d); | ||
| 9362 | parent_quota = parent_b->hierarchal_quota; | ||
| 9363 | |||
| 9364 | /* | ||
| 9365 | * ensure max(child_quota) <= parent_quota, inherit when no | ||
| 9366 | * limit is set | ||
| 9367 | */ | ||
| 9368 | if (quota == RUNTIME_INF) | ||
| 9369 | quota = parent_quota; | ||
| 9370 | else if (parent_quota != RUNTIME_INF && quota > parent_quota) | ||
| 9371 | return -EINVAL; | ||
| 9372 | } | ||
| 9373 | cfs_b->hierarchal_quota = quota; | ||
| 9374 | |||
| 9375 | return 0; | ||
| 9376 | } | ||
| 9377 | |||
| 9378 | static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) | ||
| 9379 | { | ||
| 9380 | int ret; | ||
| 9381 | struct cfs_schedulable_data data = { | ||
| 9382 | .tg = tg, | ||
| 9383 | .period = period, | ||
| 9384 | .quota = quota, | ||
| 9385 | }; | ||
| 9386 | |||
| 9387 | if (quota != RUNTIME_INF) { | ||
| 9388 | do_div(data.period, NSEC_PER_USEC); | ||
| 9389 | do_div(data.quota, NSEC_PER_USEC); | ||
| 9390 | } | ||
| 9391 | |||
| 9392 | rcu_read_lock(); | ||
| 9393 | ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); | ||
| 9394 | rcu_read_unlock(); | ||
| 9395 | |||
| 9396 | return ret; | ||
| 9397 | } | ||
| 9398 | |||
| 9399 | static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, | ||
| 9400 | struct cgroup_map_cb *cb) | ||
| 9401 | { | ||
| 9402 | struct task_group *tg = cgroup_tg(cgrp); | ||
| 9403 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | ||
| 9404 | |||
| 9405 | cb->fill(cb, "nr_periods", cfs_b->nr_periods); | ||
| 9406 | cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); | ||
| 9407 | cb->fill(cb, "throttled_time", cfs_b->throttled_time); | ||
| 9408 | |||
| 9409 | return 0; | ||
| 9410 | } | ||
| 9411 | #endif /* CONFIG_CFS_BANDWIDTH */ | ||
| 8995 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 9412 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 8996 | 9413 | ||
| 8997 | #ifdef CONFIG_RT_GROUP_SCHED | 9414 | #ifdef CONFIG_RT_GROUP_SCHED |
| @@ -9026,6 +9443,22 @@ static struct cftype cpu_files[] = { | |||
| 9026 | .write_u64 = cpu_shares_write_u64, | 9443 | .write_u64 = cpu_shares_write_u64, |
| 9027 | }, | 9444 | }, |
| 9028 | #endif | 9445 | #endif |
| 9446 | #ifdef CONFIG_CFS_BANDWIDTH | ||
| 9447 | { | ||
| 9448 | .name = "cfs_quota_us", | ||
| 9449 | .read_s64 = cpu_cfs_quota_read_s64, | ||
| 9450 | .write_s64 = cpu_cfs_quota_write_s64, | ||
| 9451 | }, | ||
| 9452 | { | ||
| 9453 | .name = "cfs_period_us", | ||
| 9454 | .read_u64 = cpu_cfs_period_read_u64, | ||
| 9455 | .write_u64 = cpu_cfs_period_write_u64, | ||
| 9456 | }, | ||
| 9457 | { | ||
| 9458 | .name = "stat", | ||
| 9459 | .read_map = cpu_stats_show, | ||
| 9460 | }, | ||
| 9461 | #endif | ||
| 9029 | #ifdef CONFIG_RT_GROUP_SCHED | 9462 | #ifdef CONFIG_RT_GROUP_SCHED |
| 9030 | { | 9463 | { |
| 9031 | .name = "rt_runtime_us", | 9464 | .name = "rt_runtime_us", |
| @@ -9335,4 +9768,3 @@ struct cgroup_subsys cpuacct_subsys = { | |||
| 9335 | .subsys_id = cpuacct_subsys_id, | 9768 | .subsys_id = cpuacct_subsys_id, |
| 9336 | }; | 9769 | }; |
| 9337 | #endif /* CONFIG_CGROUP_CPUACCT */ | 9770 | #endif /* CONFIG_CGROUP_CPUACCT */ |
| 9338 | |||
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 2722dc1b4138..a86cf9d9eb11 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c | |||
| @@ -47,9 +47,6 @@ static int convert_prio(int prio) | |||
| 47 | return cpupri; | 47 | return cpupri; |
| 48 | } | 48 | } |
| 49 | 49 | ||
| 50 | #define for_each_cpupri_active(array, idx) \ | ||
| 51 | for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES) | ||
| 52 | |||
| 53 | /** | 50 | /** |
| 54 | * cpupri_find - find the best (lowest-pri) CPU in the system | 51 | * cpupri_find - find the best (lowest-pri) CPU in the system |
| 55 | * @cp: The cpupri context | 52 | * @cp: The cpupri context |
| @@ -71,11 +68,38 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, | |||
| 71 | int idx = 0; | 68 | int idx = 0; |
| 72 | int task_pri = convert_prio(p->prio); | 69 | int task_pri = convert_prio(p->prio); |
| 73 | 70 | ||
| 74 | for_each_cpupri_active(cp->pri_active, idx) { | 71 | if (task_pri >= MAX_RT_PRIO) |
| 75 | struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; | 72 | return 0; |
| 76 | 73 | ||
| 77 | if (idx >= task_pri) | 74 | for (idx = 0; idx < task_pri; idx++) { |
| 78 | break; | 75 | struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; |
| 76 | int skip = 0; | ||
| 77 | |||
| 78 | if (!atomic_read(&(vec)->count)) | ||
| 79 | skip = 1; | ||
| 80 | /* | ||
| 81 | * When looking at the vector, we need to read the counter, | ||
| 82 | * do a memory barrier, then read the mask. | ||
| 83 | * | ||
| 84 | * Note: This is still all racy, but we can deal with it. | ||
| 85 | * Ideally, we only want to look at masks that are set. | ||
| 86 | * | ||
| 87 | * If a mask is not set, then the only thing wrong is that we | ||
| 88 | * did a little more work than necessary. | ||
| 89 | * | ||
| 90 | * If we read a zero count but the mask is set, because of the | ||
| 91 | * memory barriers, that can only happen when the highest prio | ||
| 92 | * task for a run queue has left the run queue, in which case, | ||
| 93 | * it will be followed by a pull. If the task we are processing | ||
| 94 | * fails to find a proper place to go, that pull request will | ||
| 95 | * pull this task if the run queue is running at a lower | ||
| 96 | * priority. | ||
| 97 | */ | ||
| 98 | smp_rmb(); | ||
| 99 | |||
| 100 | /* Need to do the rmb for every iteration */ | ||
| 101 | if (skip) | ||
| 102 | continue; | ||
| 79 | 103 | ||
| 80 | if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) | 104 | if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) |
| 81 | continue; | 105 | continue; |
| @@ -115,7 +139,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
| 115 | { | 139 | { |
| 116 | int *currpri = &cp->cpu_to_pri[cpu]; | 140 | int *currpri = &cp->cpu_to_pri[cpu]; |
| 117 | int oldpri = *currpri; | 141 | int oldpri = *currpri; |
| 118 | unsigned long flags; | 142 | int do_mb = 0; |
| 119 | 143 | ||
| 120 | newpri = convert_prio(newpri); | 144 | newpri = convert_prio(newpri); |
| 121 | 145 | ||
| @@ -128,32 +152,46 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
| 128 | * If the cpu was currently mapped to a different value, we | 152 | * If the cpu was currently mapped to a different value, we |
| 129 | * need to map it to the new value then remove the old value. | 153 | * need to map it to the new value then remove the old value. |
| 130 | * Note, we must add the new value first, otherwise we risk the | 154 | * Note, we must add the new value first, otherwise we risk the |
| 131 | * cpu being cleared from pri_active, and this cpu could be | 155 | * cpu being missed by the priority loop in cpupri_find. |
| 132 | * missed for a push or pull. | ||
| 133 | */ | 156 | */ |
| 134 | if (likely(newpri != CPUPRI_INVALID)) { | 157 | if (likely(newpri != CPUPRI_INVALID)) { |
| 135 | struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; | 158 | struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; |
| 136 | 159 | ||
| 137 | raw_spin_lock_irqsave(&vec->lock, flags); | ||
| 138 | |||
| 139 | cpumask_set_cpu(cpu, vec->mask); | 160 | cpumask_set_cpu(cpu, vec->mask); |
| 140 | vec->count++; | 161 | /* |
| 141 | if (vec->count == 1) | 162 | * When adding a new vector, we update the mask first, |
| 142 | set_bit(newpri, cp->pri_active); | 163 | * do a write memory barrier, and then update the count, to |
| 143 | 164 | * make sure the vector is visible when count is set. | |
| 144 | raw_spin_unlock_irqrestore(&vec->lock, flags); | 165 | */ |
| 166 | smp_mb__before_atomic_inc(); | ||
| 167 | atomic_inc(&(vec)->count); | ||
| 168 | do_mb = 1; | ||
| 145 | } | 169 | } |
| 146 | if (likely(oldpri != CPUPRI_INVALID)) { | 170 | if (likely(oldpri != CPUPRI_INVALID)) { |
| 147 | struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; | 171 | struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; |
| 148 | 172 | ||
| 149 | raw_spin_lock_irqsave(&vec->lock, flags); | 173 | /* |
| 150 | 174 | * Because the order of modification of the vec->count | |
| 151 | vec->count--; | 175 | * is important, we must make sure that the update |
| 152 | if (!vec->count) | 176 | * of the new prio is seen before we decrement the |
| 153 | clear_bit(oldpri, cp->pri_active); | 177 | * old prio. This makes sure that the loop sees |
| 178 | * one or the other when we raise the priority of | ||
| 179 | * the run queue. We don't care about when we lower the | ||
| 180 | * priority, as that will trigger an rt pull anyway. | ||
| 181 | * | ||
| 182 | * We only need to do a memory barrier if we updated | ||
| 183 | * the new priority vec. | ||
| 184 | */ | ||
| 185 | if (do_mb) | ||
| 186 | smp_mb__after_atomic_inc(); | ||
| 187 | |||
| 188 | /* | ||
| 189 | * When removing from the vector, we decrement the counter first, | ||
| 190 | * do a memory barrier and then clear the mask. | ||
| 191 | */ | ||
| 192 | atomic_dec(&(vec)->count); | ||
| 193 | smp_mb__after_atomic_inc(); | ||
| 154 | cpumask_clear_cpu(cpu, vec->mask); | 194 | cpumask_clear_cpu(cpu, vec->mask); |
| 155 | |||
| 156 | raw_spin_unlock_irqrestore(&vec->lock, flags); | ||
| 157 | } | 195 | } |
| 158 | 196 | ||
| 159 | *currpri = newpri; | 197 | *currpri = newpri; |
| @@ -175,8 +213,7 @@ int cpupri_init(struct cpupri *cp) | |||
| 175 | for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { | 213 | for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { |
| 176 | struct cpupri_vec *vec = &cp->pri_to_cpu[i]; | 214 | struct cpupri_vec *vec = &cp->pri_to_cpu[i]; |
| 177 | 215 | ||
| 178 | raw_spin_lock_init(&vec->lock); | 216 | atomic_set(&vec->count, 0); |
| 179 | vec->count = 0; | ||
| 180 | if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) | 217 | if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) |
| 181 | goto cleanup; | 218 | goto cleanup; |
| 182 | } | 219 | } |
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index 9fc7d386fea4..f6d756173491 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h | |||
| @@ -4,7 +4,6 @@ | |||
| 4 | #include <linux/sched.h> | 4 | #include <linux/sched.h> |
| 5 | 5 | ||
| 6 | #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) | 6 | #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) |
| 7 | #define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES) | ||
| 8 | 7 | ||
| 9 | #define CPUPRI_INVALID -1 | 8 | #define CPUPRI_INVALID -1 |
| 10 | #define CPUPRI_IDLE 0 | 9 | #define CPUPRI_IDLE 0 |
| @@ -12,14 +11,12 @@ | |||
| 12 | /* values 2-101 are RT priorities 0-99 */ | 11 | /* values 2-101 are RT priorities 0-99 */ |
| 13 | 12 | ||
| 14 | struct cpupri_vec { | 13 | struct cpupri_vec { |
| 15 | raw_spinlock_t lock; | 14 | atomic_t count; |
| 16 | int count; | 15 | cpumask_var_t mask; |
| 17 | cpumask_var_t mask; | ||
| 18 | }; | 16 | }; |
| 19 | 17 | ||
| 20 | struct cpupri { | 18 | struct cpupri { |
| 21 | struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; | 19 | struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; |
| 22 | long pri_active[CPUPRI_NR_PRI_WORDS]; | ||
| 23 | int cpu_to_pri[NR_CPUS]; | 20 | int cpu_to_pri[NR_CPUS]; |
| 24 | }; | 21 | }; |
| 25 | 22 | ||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index bc8ee9993814..5c9e67923b7c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
| @@ -89,6 +89,20 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | |||
| 89 | */ | 89 | */ |
| 90 | unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; | 90 | unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; |
| 91 | 91 | ||
| 92 | #ifdef CONFIG_CFS_BANDWIDTH | ||
| 93 | /* | ||
| 94 | * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool | ||
| 95 | * each time a cfs_rq requests quota. | ||
| 96 | * | ||
| 97 | * Note: in the case that the slice exceeds the runtime remaining (either due | ||
| 98 | * to consumption or the quota being specified to be smaller than the slice) | ||
| 99 | * we will always only issue the remaining available time. | ||
| 100 | * | ||
| 101 | * default: 5 msec, units: microseconds | ||
| 102 | */ | ||
| 103 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; | ||
| 104 | #endif | ||
| 105 | |||
| 92 | static const struct sched_class fair_sched_class; | 106 | static const struct sched_class fair_sched_class; |
| 93 | 107 | ||
| 94 | /************************************************************** | 108 | /************************************************************** |
| @@ -292,6 +306,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) | |||
| 292 | 306 | ||
| 293 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 307 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 294 | 308 | ||
| 309 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | ||
| 310 | unsigned long delta_exec); | ||
| 295 | 311 | ||
| 296 | /************************************************************** | 312 | /************************************************************** |
| 297 | * Scheduling class tree data structure manipulation methods: | 313 | * Scheduling class tree data structure manipulation methods: |
| @@ -583,6 +599,8 @@ static void update_curr(struct cfs_rq *cfs_rq) | |||
| 583 | cpuacct_charge(curtask, delta_exec); | 599 | cpuacct_charge(curtask, delta_exec); |
| 584 | account_group_exec_runtime(curtask, delta_exec); | 600 | account_group_exec_runtime(curtask, delta_exec); |
| 585 | } | 601 | } |
| 602 | |||
| 603 | account_cfs_rq_runtime(cfs_rq, delta_exec); | ||
| 586 | } | 604 | } |
| 587 | 605 | ||
| 588 | static inline void | 606 | static inline void |
| @@ -688,6 +706,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 688 | } | 706 | } |
| 689 | 707 | ||
| 690 | #ifdef CONFIG_FAIR_GROUP_SCHED | 708 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 709 | /* we need this in update_cfs_load and load-balance functions below */ | ||
| 710 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); | ||
| 691 | # ifdef CONFIG_SMP | 711 | # ifdef CONFIG_SMP |
| 692 | static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, | 712 | static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, |
| 693 | int global_update) | 713 | int global_update) |
| @@ -710,7 +730,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | |||
| 710 | u64 now, delta; | 730 | u64 now, delta; |
| 711 | unsigned long load = cfs_rq->load.weight; | 731 | unsigned long load = cfs_rq->load.weight; |
| 712 | 732 | ||
| 713 | if (cfs_rq->tg == &root_task_group) | 733 | if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq)) |
| 714 | return; | 734 | return; |
| 715 | 735 | ||
| 716 | now = rq_of(cfs_rq)->clock_task; | 736 | now = rq_of(cfs_rq)->clock_task; |
| @@ -819,7 +839,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) | |||
| 819 | 839 | ||
| 820 | tg = cfs_rq->tg; | 840 | tg = cfs_rq->tg; |
| 821 | se = tg->se[cpu_of(rq_of(cfs_rq))]; | 841 | se = tg->se[cpu_of(rq_of(cfs_rq))]; |
| 822 | if (!se) | 842 | if (!se || throttled_hierarchy(cfs_rq)) |
| 823 | return; | 843 | return; |
| 824 | #ifndef CONFIG_SMP | 844 | #ifndef CONFIG_SMP |
| 825 | if (likely(se->load.weight == tg->shares)) | 845 | if (likely(se->load.weight == tg->shares)) |
| @@ -950,6 +970,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
| 950 | se->vruntime = vruntime; | 970 | se->vruntime = vruntime; |
| 951 | } | 971 | } |
| 952 | 972 | ||
| 973 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq); | ||
| 974 | |||
| 953 | static void | 975 | static void |
| 954 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | 976 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
| 955 | { | 977 | { |
| @@ -979,8 +1001,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
| 979 | __enqueue_entity(cfs_rq, se); | 1001 | __enqueue_entity(cfs_rq, se); |
| 980 | se->on_rq = 1; | 1002 | se->on_rq = 1; |
| 981 | 1003 | ||
| 982 | if (cfs_rq->nr_running == 1) | 1004 | if (cfs_rq->nr_running == 1) { |
| 983 | list_add_leaf_cfs_rq(cfs_rq); | 1005 | list_add_leaf_cfs_rq(cfs_rq); |
| 1006 | check_enqueue_throttle(cfs_rq); | ||
| 1007 | } | ||
| 984 | } | 1008 | } |
| 985 | 1009 | ||
| 986 | static void __clear_buddies_last(struct sched_entity *se) | 1010 | static void __clear_buddies_last(struct sched_entity *se) |
| @@ -1028,6 +1052,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 1028 | __clear_buddies_skip(se); | 1052 | __clear_buddies_skip(se); |
| 1029 | } | 1053 | } |
| 1030 | 1054 | ||
| 1055 | static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); | ||
| 1056 | |||
| 1031 | static void | 1057 | static void |
| 1032 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | 1058 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
| 1033 | { | 1059 | { |
| @@ -1066,6 +1092,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
| 1066 | if (!(flags & DEQUEUE_SLEEP)) | 1092 | if (!(flags & DEQUEUE_SLEEP)) |
| 1067 | se->vruntime -= cfs_rq->min_vruntime; | 1093 | se->vruntime -= cfs_rq->min_vruntime; |
| 1068 | 1094 | ||
| 1095 | /* return excess runtime on last dequeue */ | ||
| 1096 | return_cfs_rq_runtime(cfs_rq); | ||
| 1097 | |||
| 1069 | update_min_vruntime(cfs_rq); | 1098 | update_min_vruntime(cfs_rq); |
| 1070 | update_cfs_shares(cfs_rq); | 1099 | update_cfs_shares(cfs_rq); |
| 1071 | } | 1100 | } |
| @@ -1077,6 +1106,8 @@ static void | |||
| 1077 | check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | 1106 | check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) |
| 1078 | { | 1107 | { |
| 1079 | unsigned long ideal_runtime, delta_exec; | 1108 | unsigned long ideal_runtime, delta_exec; |
| 1109 | struct sched_entity *se; | ||
| 1110 | s64 delta; | ||
| 1080 | 1111 | ||
| 1081 | ideal_runtime = sched_slice(cfs_rq, curr); | 1112 | ideal_runtime = sched_slice(cfs_rq, curr); |
| 1082 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; | 1113 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; |
| @@ -1095,22 +1126,17 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | |||
| 1095 | * narrow margin doesn't have to wait for a full slice. | 1126 | * narrow margin doesn't have to wait for a full slice. |
| 1096 | * This also mitigates buddy induced latencies under load. | 1127 | * This also mitigates buddy induced latencies under load. |
| 1097 | */ | 1128 | */ |
| 1098 | if (!sched_feat(WAKEUP_PREEMPT)) | ||
| 1099 | return; | ||
| 1100 | |||
| 1101 | if (delta_exec < sysctl_sched_min_granularity) | 1129 | if (delta_exec < sysctl_sched_min_granularity) |
| 1102 | return; | 1130 | return; |
| 1103 | 1131 | ||
| 1104 | if (cfs_rq->nr_running > 1) { | 1132 | se = __pick_first_entity(cfs_rq); |
| 1105 | struct sched_entity *se = __pick_first_entity(cfs_rq); | 1133 | delta = curr->vruntime - se->vruntime; |
| 1106 | s64 delta = curr->vruntime - se->vruntime; | ||
| 1107 | 1134 | ||
| 1108 | if (delta < 0) | 1135 | if (delta < 0) |
| 1109 | return; | 1136 | return; |
| 1110 | 1137 | ||
| 1111 | if (delta > ideal_runtime) | 1138 | if (delta > ideal_runtime) |
| 1112 | resched_task(rq_of(cfs_rq)->curr); | 1139 | resched_task(rq_of(cfs_rq)->curr); |
| 1113 | } | ||
| 1114 | } | 1140 | } |
| 1115 | 1141 | ||
| 1116 | static void | 1142 | static void |
| @@ -1185,6 +1211,8 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | |||
| 1185 | return se; | 1211 | return se; |
| 1186 | } | 1212 | } |
| 1187 | 1213 | ||
| 1214 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); | ||
| 1215 | |||
| 1188 | static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | 1216 | static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) |
| 1189 | { | 1217 | { |
| 1190 | /* | 1218 | /* |
| @@ -1194,6 +1222,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
| 1194 | if (prev->on_rq) | 1222 | if (prev->on_rq) |
| 1195 | update_curr(cfs_rq); | 1223 | update_curr(cfs_rq); |
| 1196 | 1224 | ||
| 1225 | /* throttle cfs_rqs exceeding runtime */ | ||
| 1226 | check_cfs_rq_runtime(cfs_rq); | ||
| 1227 | |||
| 1197 | check_spread(cfs_rq, prev); | 1228 | check_spread(cfs_rq, prev); |
| 1198 | if (prev->on_rq) { | 1229 | if (prev->on_rq) { |
| 1199 | update_stats_wait_start(cfs_rq, prev); | 1230 | update_stats_wait_start(cfs_rq, prev); |
| @@ -1233,10 +1264,583 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
| 1233 | return; | 1264 | return; |
| 1234 | #endif | 1265 | #endif |
| 1235 | 1266 | ||
| 1236 | if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) | 1267 | if (cfs_rq->nr_running > 1) |
| 1237 | check_preempt_tick(cfs_rq, curr); | 1268 | check_preempt_tick(cfs_rq, curr); |
| 1238 | } | 1269 | } |
| 1239 | 1270 | ||
| 1271 | |||
| 1272 | /************************************************** | ||
| 1273 | * CFS bandwidth control machinery | ||
| 1274 | */ | ||
| 1275 | |||
| 1276 | #ifdef CONFIG_CFS_BANDWIDTH | ||
| 1277 | /* | ||
| 1278 | * default period for cfs group bandwidth. | ||
| 1279 | * default: 0.1s, units: nanoseconds | ||
| 1280 | */ | ||
| 1281 | static inline u64 default_cfs_period(void) | ||
| 1282 | { | ||
| 1283 | return 100000000ULL; | ||
| 1284 | } | ||
| 1285 | |||
| 1286 | static inline u64 sched_cfs_bandwidth_slice(void) | ||
| 1287 | { | ||
| 1288 | return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC; | ||
| 1289 | } | ||
| 1290 | |||
| 1291 | /* | ||
| 1292 | * Replenish runtime according to assigned quota and update expiration time. | ||
| 1293 | * We use sched_clock_cpu directly instead of rq->clock to avoid adding | ||
| 1294 | * additional synchronization around rq->lock. | ||
| 1295 | * | ||
| 1296 | * requires cfs_b->lock | ||
| 1297 | */ | ||
| 1298 | static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) | ||
| 1299 | { | ||
| 1300 | u64 now; | ||
| 1301 | |||
| 1302 | if (cfs_b->quota == RUNTIME_INF) | ||
| 1303 | return; | ||
| 1304 | |||
| 1305 | now = sched_clock_cpu(smp_processor_id()); | ||
| 1306 | cfs_b->runtime = cfs_b->quota; | ||
| 1307 | cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); | ||
| 1308 | } | ||
| 1309 | |||
| 1310 | /* returns 0 on failure to allocate runtime */ | ||
| 1311 | static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
| 1312 | { | ||
| 1313 | struct task_group *tg = cfs_rq->tg; | ||
| 1314 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | ||
| 1315 | u64 amount = 0, min_amount, expires; | ||
| 1316 | |||
| 1317 | /* note: this is a positive sum as runtime_remaining <= 0 */ | ||
| 1318 | min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; | ||
| 1319 | |||
| 1320 | raw_spin_lock(&cfs_b->lock); | ||
| 1321 | if (cfs_b->quota == RUNTIME_INF) | ||
| 1322 | amount = min_amount; | ||
| 1323 | else { | ||
| 1324 | /* | ||
| 1325 | * If the bandwidth pool has become inactive, then at least one | ||
| 1326 | * period must have elapsed since the last consumption. | ||
| 1327 | * Refresh the global state and ensure bandwidth timer becomes | ||
| 1328 | * active. | ||
| 1329 | */ | ||
| 1330 | if (!cfs_b->timer_active) { | ||
| 1331 | __refill_cfs_bandwidth_runtime(cfs_b); | ||
| 1332 | __start_cfs_bandwidth(cfs_b); | ||
| 1333 | } | ||
| 1334 | |||
| 1335 | if (cfs_b->runtime > 0) { | ||
| 1336 | amount = min(cfs_b->runtime, min_amount); | ||
| 1337 | cfs_b->runtime -= amount; | ||
| 1338 | cfs_b->idle = 0; | ||
| 1339 | } | ||
| 1340 | } | ||
| 1341 | expires = cfs_b->runtime_expires; | ||
| 1342 | raw_spin_unlock(&cfs_b->lock); | ||
| 1343 | |||
| 1344 | cfs_rq->runtime_remaining += amount; | ||
| 1345 | /* | ||
| 1346 | * we may have advanced our local expiration to account for allowed | ||
| 1347 | * spread between our sched_clock and the one on which runtime was | ||
| 1348 | * issued. | ||
| 1349 | */ | ||
| 1350 | if ((s64)(expires - cfs_rq->runtime_expires) > 0) | ||
| 1351 | cfs_rq->runtime_expires = expires; | ||
| 1352 | |||
| 1353 | return cfs_rq->runtime_remaining > 0; | ||
| 1354 | } | ||
| 1355 | |||
| 1356 | /* | ||
| 1357 | * Note: This depends on the synchronization provided by sched_clock and the | ||
| 1358 | * fact that rq->clock snapshots this value. | ||
| 1359 | */ | ||
| 1360 | static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
| 1361 | { | ||
| 1362 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
| 1363 | struct rq *rq = rq_of(cfs_rq); | ||
| 1364 | |||
| 1365 | /* if the deadline is ahead of our clock, nothing to do */ | ||
| 1366 | if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0)) | ||
| 1367 | return; | ||
| 1368 | |||
| 1369 | if (cfs_rq->runtime_remaining < 0) | ||
| 1370 | return; | ||
| 1371 | |||
| 1372 | /* | ||
| 1373 | * If the local deadline has passed we have to consider the | ||
| 1374 | * possibility that our sched_clock is 'fast' and the global deadline | ||
| 1375 | * has not truly expired. | ||
| 1376 | * | ||
| 1377 | * Fortunately we can determine whether this is the case by checking | ||
| 1378 | * whether the global deadline has advanced. | ||
| 1379 | */ | ||
| 1380 | |||
| 1381 | if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) { | ||
| 1382 | /* extend local deadline, drift is bounded above by 2 ticks */ | ||
| 1383 | cfs_rq->runtime_expires += TICK_NSEC; | ||
| 1384 | } else { | ||
| 1385 | /* global deadline is ahead, expiration has passed */ | ||
| 1386 | cfs_rq->runtime_remaining = 0; | ||
| 1387 | } | ||
| 1388 | } | ||
| 1389 | |||
| 1390 | static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | ||
| 1391 | unsigned long delta_exec) | ||
| 1392 | { | ||
| 1393 | /* dock delta_exec before expiring quota (as it could span periods) */ | ||
| 1394 | cfs_rq->runtime_remaining -= delta_exec; | ||
| 1395 | expire_cfs_rq_runtime(cfs_rq); | ||
| 1396 | |||
| 1397 | if (likely(cfs_rq->runtime_remaining > 0)) | ||
| 1398 | return; | ||
| 1399 | |||
| 1400 | /* | ||
| 1401 | * if we're unable to extend our runtime we resched so that the active | ||
| 1402 | * hierarchy can be throttled | ||
| 1403 | */ | ||
| 1404 | if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) | ||
| 1405 | resched_task(rq_of(cfs_rq)->curr); | ||
| 1406 | } | ||
| 1407 | |||
| 1408 | static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | ||
| 1409 | unsigned long delta_exec) | ||
| 1410 | { | ||
| 1411 | if (!cfs_rq->runtime_enabled) | ||
| 1412 | return; | ||
| 1413 | |||
| 1414 | __account_cfs_rq_runtime(cfs_rq, delta_exec); | ||
| 1415 | } | ||
| 1416 | |||
| 1417 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) | ||
| 1418 | { | ||
| 1419 | return cfs_rq->throttled; | ||
| 1420 | } | ||
| 1421 | |||
| 1422 | /* check whether cfs_rq, or any parent, is throttled */ | ||
| 1423 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) | ||
| 1424 | { | ||
| 1425 | return cfs_rq->throttle_count; | ||
| 1426 | } | ||
| 1427 | |||
| 1428 | /* | ||
| 1429 | * Ensure that neither of the group entities corresponding to src_cpu or | ||
| 1430 | * dest_cpu are members of a throttled hierarchy when performing group | ||
| 1431 | * load-balance operations. | ||
| 1432 | */ | ||
| 1433 | static inline int throttled_lb_pair(struct task_group *tg, | ||
| 1434 | int src_cpu, int dest_cpu) | ||
| 1435 | { | ||
| 1436 | struct cfs_rq *src_cfs_rq, *dest_cfs_rq; | ||
| 1437 | |||
| 1438 | src_cfs_rq = tg->cfs_rq[src_cpu]; | ||
| 1439 | dest_cfs_rq = tg->cfs_rq[dest_cpu]; | ||
| 1440 | |||
| 1441 | return throttled_hierarchy(src_cfs_rq) || | ||
| 1442 | throttled_hierarchy(dest_cfs_rq); | ||
| 1443 | } | ||
| 1444 | |||
| 1445 | /* updated child weight may affect parent so we have to do this bottom up */ | ||
| 1446 | static int tg_unthrottle_up(struct task_group *tg, void *data) | ||
| 1447 | { | ||
| 1448 | struct rq *rq = data; | ||
| 1449 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; | ||
| 1450 | |||
| 1451 | cfs_rq->throttle_count--; | ||
| 1452 | #ifdef CONFIG_SMP | ||
| 1453 | if (!cfs_rq->throttle_count) { | ||
| 1454 | u64 delta = rq->clock_task - cfs_rq->load_stamp; | ||
| 1455 | |||
| 1456 | /* leaving throttled state, advance shares averaging windows */ | ||
| 1457 | cfs_rq->load_stamp += delta; | ||
| 1458 | cfs_rq->load_last += delta; | ||
| 1459 | |||
| 1460 | /* update entity weight now that we are on_rq again */ | ||
| 1461 | update_cfs_shares(cfs_rq); | ||
| 1462 | } | ||
| 1463 | #endif | ||
| 1464 | |||
| 1465 | return 0; | ||
| 1466 | } | ||
| 1467 | |||
| 1468 | static int tg_throttle_down(struct task_group *tg, void *data) | ||
| 1469 | { | ||
| 1470 | struct rq *rq = data; | ||
| 1471 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; | ||
| 1472 | |||
| 1473 | /* group is entering throttled state, record last load */ | ||
| 1474 | if (!cfs_rq->throttle_count) | ||
| 1475 | update_cfs_load(cfs_rq, 0); | ||
| 1476 | cfs_rq->throttle_count++; | ||
| 1477 | |||
| 1478 | return 0; | ||
| 1479 | } | ||
| 1480 | |||
| 1481 | static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | ||
| 1482 | { | ||
| 1483 | struct rq *rq = rq_of(cfs_rq); | ||
| 1484 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
| 1485 | struct sched_entity *se; | ||
| 1486 | long task_delta, dequeue = 1; | ||
| 1487 | |||
| 1488 | se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; | ||
| 1489 | |||
| 1490 | /* account load preceding throttle */ | ||
| 1491 | rcu_read_lock(); | ||
| 1492 | walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); | ||
| 1493 | rcu_read_unlock(); | ||
| 1494 | |||
| 1495 | task_delta = cfs_rq->h_nr_running; | ||
| 1496 | for_each_sched_entity(se) { | ||
| 1497 | struct cfs_rq *qcfs_rq = cfs_rq_of(se); | ||
| 1498 | /* throttled entity or throttle-on-deactivate */ | ||
| 1499 | if (!se->on_rq) | ||
| 1500 | break; | ||
| 1501 | |||
| 1502 | if (dequeue) | ||
| 1503 | dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); | ||
| 1504 | qcfs_rq->h_nr_running -= task_delta; | ||
| 1505 | |||
| 1506 | if (qcfs_rq->load.weight) | ||
| 1507 | dequeue = 0; | ||
| 1508 | } | ||
| 1509 | |||
| 1510 | if (!se) | ||
| 1511 | rq->nr_running -= task_delta; | ||
| 1512 | |||
| 1513 | cfs_rq->throttled = 1; | ||
| 1514 | cfs_rq->throttled_timestamp = rq->clock; | ||
| 1515 | raw_spin_lock(&cfs_b->lock); | ||
| 1516 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); | ||
| 1517 | raw_spin_unlock(&cfs_b->lock); | ||
| 1518 | } | ||
| 1519 | |||
| 1520 | static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) | ||
| 1521 | { | ||
| 1522 | struct rq *rq = rq_of(cfs_rq); | ||
| 1523 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
| 1524 | struct sched_entity *se; | ||
| 1525 | int enqueue = 1; | ||
| 1526 | long task_delta; | ||
| 1527 | |||
| 1528 | se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; | ||
| 1529 | |||
| 1530 | cfs_rq->throttled = 0; | ||
| 1531 | raw_spin_lock(&cfs_b->lock); | ||
| 1532 | cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; | ||
| 1533 | list_del_rcu(&cfs_rq->throttled_list); | ||
| 1534 | raw_spin_unlock(&cfs_b->lock); | ||
| 1535 | cfs_rq->throttled_timestamp = 0; | ||
| 1536 | |||
| 1537 | update_rq_clock(rq); | ||
| 1538 | /* update hierarchical throttle state */ | ||
| 1539 | walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); | ||
| 1540 | |||
| 1541 | if (!cfs_rq->load.weight) | ||
| 1542 | return; | ||
| 1543 | |||
| 1544 | task_delta = cfs_rq->h_nr_running; | ||
| 1545 | for_each_sched_entity(se) { | ||
| 1546 | if (se->on_rq) | ||
| 1547 | enqueue = 0; | ||
| 1548 | |||
| 1549 | cfs_rq = cfs_rq_of(se); | ||
| 1550 | if (enqueue) | ||
| 1551 | enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); | ||
| 1552 | cfs_rq->h_nr_running += task_delta; | ||
| 1553 | |||
| 1554 | if (cfs_rq_throttled(cfs_rq)) | ||
| 1555 | break; | ||
| 1556 | } | ||
| 1557 | |||
| 1558 | if (!se) | ||
| 1559 | rq->nr_running += task_delta; | ||
| 1560 | |||
| 1561 | /* determine whether we need to wake up potentially idle cpu */ | ||
| 1562 | if (rq->curr == rq->idle && rq->cfs.nr_running) | ||
| 1563 | resched_task(rq->curr); | ||
| 1564 | } | ||
| 1565 | |||
| 1566 | static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, | ||
| 1567 | u64 remaining, u64 expires) | ||
| 1568 | { | ||
| 1569 | struct cfs_rq *cfs_rq; | ||
| 1570 | u64 runtime = remaining; | ||
| 1571 | |||
| 1572 | rcu_read_lock(); | ||
| 1573 | list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, | ||
| 1574 | throttled_list) { | ||
| 1575 | struct rq *rq = rq_of(cfs_rq); | ||
| 1576 | |||
| 1577 | raw_spin_lock(&rq->lock); | ||
| 1578 | if (!cfs_rq_throttled(cfs_rq)) | ||
| 1579 | goto next; | ||
| 1580 | |||
| 1581 | runtime = -cfs_rq->runtime_remaining + 1; | ||
| 1582 | if (runtime > remaining) | ||
| 1583 | runtime = remaining; | ||
| 1584 | remaining -= runtime; | ||
| 1585 | |||
| 1586 | cfs_rq->runtime_remaining += runtime; | ||
| 1587 | cfs_rq->runtime_expires = expires; | ||
| 1588 | |||
| 1589 | /* we check whether we're throttled above */ | ||
| 1590 | if (cfs_rq->runtime_remaining > 0) | ||
| 1591 | unthrottle_cfs_rq(cfs_rq); | ||
| 1592 | |||
| 1593 | next: | ||
| 1594 | raw_spin_unlock(&rq->lock); | ||
| 1595 | |||
| 1596 | if (!remaining) | ||
| 1597 | break; | ||
| 1598 | } | ||
| 1599 | rcu_read_unlock(); | ||
| 1600 | |||
| 1601 | return remaining; | ||
| 1602 | } | ||
| 1603 | |||
| 1604 | /* | ||
| 1605 | * Responsible for refilling a task_group's bandwidth and unthrottling its | ||
| 1606 | * cfs_rqs as appropriate. If there has been no activity within the last | ||
| 1607 | * period the timer is deactivated until scheduling resumes; cfs_b->idle is | ||
| 1608 | * used to track this state. | ||
| 1609 | */ | ||
| 1610 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) | ||
| 1611 | { | ||
| 1612 | u64 runtime, runtime_expires; | ||
| 1613 | int idle = 1, throttled; | ||
| 1614 | |||
| 1615 | raw_spin_lock(&cfs_b->lock); | ||
| 1616 | /* no need to continue the timer with no bandwidth constraint */ | ||
| 1617 | if (cfs_b->quota == RUNTIME_INF) | ||
| 1618 | goto out_unlock; | ||
| 1619 | |||
| 1620 | throttled = !list_empty(&cfs_b->throttled_cfs_rq); | ||
| 1621 | /* idle depends on !throttled (for the case of a large deficit) */ | ||
| 1622 | idle = cfs_b->idle && !throttled; | ||
| 1623 | cfs_b->nr_periods += overrun; | ||
| 1624 | |||
| 1625 | /* if we're going inactive then everything else can be deferred */ | ||
| 1626 | if (idle) | ||
| 1627 | goto out_unlock; | ||
| 1628 | |||
| 1629 | __refill_cfs_bandwidth_runtime(cfs_b); | ||
| 1630 | |||
| 1631 | if (!throttled) { | ||
| 1632 | /* mark as potentially idle for the upcoming period */ | ||
| 1633 | cfs_b->idle = 1; | ||
| 1634 | goto out_unlock; | ||
| 1635 | } | ||
| 1636 | |||
| 1637 | /* account preceding periods in which throttling occurred */ | ||
| 1638 | cfs_b->nr_throttled += overrun; | ||
| 1639 | |||
| 1640 | /* | ||
| 1641 | * There are throttled entities so we must first use the new bandwidth | ||
| 1642 | * to unthrottle them before making it generally available. This | ||
| 1643 | * ensures that all existing debts will be paid before a new cfs_rq is | ||
| 1644 | * allowed to run. | ||
| 1645 | */ | ||
| 1646 | runtime = cfs_b->runtime; | ||
| 1647 | runtime_expires = cfs_b->runtime_expires; | ||
| 1648 | cfs_b->runtime = 0; | ||
| 1649 | |||
| 1650 | /* | ||
| 1651 | * This check is repeated as we are holding onto the new bandwidth | ||
| 1652 | * while we unthrottle. This can potentially race with an unthrottled | ||
| 1653 | * group trying to acquire new bandwidth from the global pool. | ||
| 1654 | */ | ||
| 1655 | while (throttled && runtime > 0) { | ||
| 1656 | raw_spin_unlock(&cfs_b->lock); | ||
| 1657 | /* we can't nest cfs_b->lock while distributing bandwidth */ | ||
| 1658 | runtime = distribute_cfs_runtime(cfs_b, runtime, | ||
| 1659 | runtime_expires); | ||
| 1660 | raw_spin_lock(&cfs_b->lock); | ||
| 1661 | |||
| 1662 | throttled = !list_empty(&cfs_b->throttled_cfs_rq); | ||
| 1663 | } | ||
| 1664 | |||
| 1665 | /* return (any) remaining runtime */ | ||
| 1666 | cfs_b->runtime = runtime; | ||
| 1667 | /* | ||
| 1668 | * While we are ensured activity in the period following an | ||
| 1669 | * unthrottle, this also covers the case in which the new bandwidth is | ||
| 1670 | * insufficient to cover the existing bandwidth deficit. (Forcing the | ||
| 1671 | * timer to remain active while there are any throttled entities.) | ||
| 1672 | */ | ||
| 1673 | cfs_b->idle = 0; | ||
| 1674 | out_unlock: | ||
| 1675 | if (idle) | ||
| 1676 | cfs_b->timer_active = 0; | ||
| 1677 | raw_spin_unlock(&cfs_b->lock); | ||
| 1678 | |||
| 1679 | return idle; | ||
| 1680 | } | ||
| 1681 | |||
| 1682 | /* a cfs_rq won't donate quota below this amount */ | ||
| 1683 | static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC; | ||
| 1684 | /* minimum remaining period time to redistribute slack quota */ | ||
| 1685 | static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; | ||
| 1686 | /* how long we wait to gather additional slack before distributing */ | ||
| 1687 | static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; | ||
| 1688 | |||
| 1689 | /* are we near the end of the current quota period? */ | ||
| 1690 | static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) | ||
| 1691 | { | ||
| 1692 | struct hrtimer *refresh_timer = &cfs_b->period_timer; | ||
| 1693 | u64 remaining; | ||
| 1694 | |||
| 1695 | /* if the call-back is running a quota refresh is already occurring */ | ||
| 1696 | if (hrtimer_callback_running(refresh_timer)) | ||
| 1697 | return 1; | ||
| 1698 | |||
| 1699 | /* is a quota refresh about to occur? */ | ||
| 1700 | remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer)); | ||
| 1701 | if (remaining < min_expire) | ||
| 1702 | return 1; | ||
| 1703 | |||
| 1704 | return 0; | ||
| 1705 | } | ||
| 1706 | |||
| 1707 | static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b) | ||
| 1708 | { | ||
| 1709 | u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration; | ||
| 1710 | |||
| 1711 | /* if there's a quota refresh soon don't bother with slack */ | ||
| 1712 | if (runtime_refresh_within(cfs_b, min_left)) | ||
| 1713 | return; | ||
| 1714 | |||
| 1715 | start_bandwidth_timer(&cfs_b->slack_timer, | ||
| 1716 | ns_to_ktime(cfs_bandwidth_slack_period)); | ||
| 1717 | } | ||
| 1718 | |||
| 1719 | /* we know any runtime found here is valid as update_curr() precedes return */ | ||
| 1720 | static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
| 1721 | { | ||
| 1722 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
| 1723 | s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime; | ||
| 1724 | |||
| 1725 | if (slack_runtime <= 0) | ||
| 1726 | return; | ||
| 1727 | |||
| 1728 | raw_spin_lock(&cfs_b->lock); | ||
| 1729 | if (cfs_b->quota != RUNTIME_INF && | ||
| 1730 | cfs_rq->runtime_expires == cfs_b->runtime_expires) { | ||
| 1731 | cfs_b->runtime += slack_runtime; | ||
| 1732 | |||
| 1733 | /* we are under rq->lock, defer unthrottling using a timer */ | ||
| 1734 | if (cfs_b->runtime > sched_cfs_bandwidth_slice() && | ||
| 1735 | !list_empty(&cfs_b->throttled_cfs_rq)) | ||
| 1736 | start_cfs_slack_bandwidth(cfs_b); | ||
| 1737 | } | ||
| 1738 | raw_spin_unlock(&cfs_b->lock); | ||
| 1739 | |||
| 1740 | /* even if it's not valid for return we don't want to try again */ | ||
| 1741 | cfs_rq->runtime_remaining -= slack_runtime; | ||
| 1742 | } | ||
| 1743 | |||
| 1744 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
| 1745 | { | ||
| 1746 | if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running) | ||
| 1747 | return; | ||
| 1748 | |||
| 1749 | __return_cfs_rq_runtime(cfs_rq); | ||
| 1750 | } | ||
| 1751 | |||
| 1752 | /* | ||
| 1753 | * This is done with a timer (instead of inline with bandwidth return) since | ||
| 1754 | * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs. | ||
| 1755 | */ | ||
| 1756 | static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) | ||
| 1757 | { | ||
| 1758 | u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); | ||
| 1759 | u64 expires; | ||
| 1760 | |||
| 1761 | /* confirm we're still not at a refresh boundary */ | ||
| 1762 | if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) | ||
| 1763 | return; | ||
| 1764 | |||
| 1765 | raw_spin_lock(&cfs_b->lock); | ||
| 1766 | if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { | ||
| 1767 | runtime = cfs_b->runtime; | ||
| 1768 | cfs_b->runtime = 0; | ||
| 1769 | } | ||
| 1770 | expires = cfs_b->runtime_expires; | ||
| 1771 | raw_spin_unlock(&cfs_b->lock); | ||
| 1772 | |||
| 1773 | if (!runtime) | ||
| 1774 | return; | ||
| 1775 | |||
| 1776 | runtime = distribute_cfs_runtime(cfs_b, runtime, expires); | ||
| 1777 | |||
| 1778 | raw_spin_lock(&cfs_b->lock); | ||
| 1779 | if (expires == cfs_b->runtime_expires) | ||
| 1780 | cfs_b->runtime = runtime; | ||
| 1781 | raw_spin_unlock(&cfs_b->lock); | ||
| 1782 | } | ||
| 1783 | |||
| 1784 | /* | ||
| 1785 | * When a group wakes up we want to make sure that its quota is not already | ||
| 1786 | * expired/exceeded, otherwise it may be allowed to steal additional ticks of | ||
| 1787 | * runtime as update_curr() throttling can not trigger until it's on-rq. | ||
| 1788 | */ | ||
| 1789 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) | ||
| 1790 | { | ||
| 1791 | /* an active group must be handled by the update_curr()->put() path */ | ||
| 1792 | if (!cfs_rq->runtime_enabled || cfs_rq->curr) | ||
| 1793 | return; | ||
| 1794 | |||
| 1795 | /* ensure the group is not already throttled */ | ||
| 1796 | if (cfs_rq_throttled(cfs_rq)) | ||
| 1797 | return; | ||
| 1798 | |||
| 1799 | /* update runtime allocation */ | ||
| 1800 | account_cfs_rq_runtime(cfs_rq, 0); | ||
| 1801 | if (cfs_rq->runtime_remaining <= 0) | ||
| 1802 | throttle_cfs_rq(cfs_rq); | ||
| 1803 | } | ||
| 1804 | |||
| 1805 | /* conditionally throttle active cfs_rq's from put_prev_entity() */ | ||
| 1806 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
| 1807 | { | ||
| 1808 | if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) | ||
| 1809 | return; | ||
| 1810 | |||
| 1811 | /* | ||
| 1812 | * it's possible for a throttled entity to be forced into a running | ||
| 1813 | * state (e.g. set_curr_task), in this case we're finished. | ||
| 1814 | */ | ||
| 1815 | if (cfs_rq_throttled(cfs_rq)) | ||
| 1816 | return; | ||
| 1817 | |||
| 1818 | throttle_cfs_rq(cfs_rq); | ||
| 1819 | } | ||
| 1820 | #else | ||
| 1821 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | ||
| 1822 | unsigned long delta_exec) {} | ||
| 1823 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | ||
| 1824 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} | ||
| 1825 | static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | ||
| 1826 | |||
| 1827 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) | ||
| 1828 | { | ||
| 1829 | return 0; | ||
| 1830 | } | ||
| 1831 | |||
| 1832 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) | ||
| 1833 | { | ||
| 1834 | return 0; | ||
| 1835 | } | ||
| 1836 | |||
| 1837 | static inline int throttled_lb_pair(struct task_group *tg, | ||
| 1838 | int src_cpu, int dest_cpu) | ||
| 1839 | { | ||
| 1840 | return 0; | ||
| 1841 | } | ||
| 1842 | #endif | ||
| 1843 | |||
| 1240 | /************************************************** | 1844 | /************************************************** |
| 1241 | * CFS operations on tasks: | 1845 | * CFS operations on tasks: |
| 1242 | */ | 1846 | */ |
| @@ -1313,16 +1917,33 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
| 1313 | break; | 1917 | break; |
| 1314 | cfs_rq = cfs_rq_of(se); | 1918 | cfs_rq = cfs_rq_of(se); |
| 1315 | enqueue_entity(cfs_rq, se, flags); | 1919 | enqueue_entity(cfs_rq, se, flags); |
| 1920 | |||
| 1921 | /* | ||
| 1922 | * end evaluation on encountering a throttled cfs_rq | ||
| 1923 | * | ||
| 1924 | * note: in the case of encountering a throttled cfs_rq we will | ||
| 1925 | * post the final h_nr_running increment below. | ||
| 1926 | */ | ||
| 1927 | if (cfs_rq_throttled(cfs_rq)) | ||
| 1928 | break; | ||
| 1929 | cfs_rq->h_nr_running++; | ||
| 1930 | |||
| 1316 | flags = ENQUEUE_WAKEUP; | 1931 | flags = ENQUEUE_WAKEUP; |
| 1317 | } | 1932 | } |
| 1318 | 1933 | ||
| 1319 | for_each_sched_entity(se) { | 1934 | for_each_sched_entity(se) { |
| 1320 | cfs_rq = cfs_rq_of(se); | 1935 | cfs_rq = cfs_rq_of(se); |
| 1936 | cfs_rq->h_nr_running++; | ||
| 1937 | |||
| 1938 | if (cfs_rq_throttled(cfs_rq)) | ||
| 1939 | break; | ||
| 1321 | 1940 | ||
| 1322 | update_cfs_load(cfs_rq, 0); | 1941 | update_cfs_load(cfs_rq, 0); |
| 1323 | update_cfs_shares(cfs_rq); | 1942 | update_cfs_shares(cfs_rq); |
| 1324 | } | 1943 | } |
| 1325 | 1944 | ||
| 1945 | if (!se) | ||
| 1946 | inc_nr_running(rq); | ||
| 1326 | hrtick_update(rq); | 1947 | hrtick_update(rq); |
| 1327 | } | 1948 | } |
| 1328 | 1949 | ||
| @@ -1343,6 +1964,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
| 1343 | cfs_rq = cfs_rq_of(se); | 1964 | cfs_rq = cfs_rq_of(se); |
| 1344 | dequeue_entity(cfs_rq, se, flags); | 1965 | dequeue_entity(cfs_rq, se, flags); |
| 1345 | 1966 | ||
| 1967 | /* | ||
| 1968 | * end evaluation on encountering a throttled cfs_rq | ||
| 1969 | * | ||
| 1970 | * note: in the case of encountering a throttled cfs_rq we will | ||
| 1971 | * post the final h_nr_running decrement below. | ||
| 1972 | */ | ||
| 1973 | if (cfs_rq_throttled(cfs_rq)) | ||
| 1974 | break; | ||
| 1975 | cfs_rq->h_nr_running--; | ||
| 1976 | |||
| 1346 | /* Don't dequeue parent if it has other entities besides us */ | 1977 | /* Don't dequeue parent if it has other entities besides us */ |
| 1347 | if (cfs_rq->load.weight) { | 1978 | if (cfs_rq->load.weight) { |
| 1348 | /* | 1979 | /* |
| @@ -1361,11 +1992,17 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
| 1361 | 1992 | ||
| 1362 | for_each_sched_entity(se) { | 1993 | for_each_sched_entity(se) { |
| 1363 | cfs_rq = cfs_rq_of(se); | 1994 | cfs_rq = cfs_rq_of(se); |
| 1995 | cfs_rq->h_nr_running--; | ||
| 1996 | |||
| 1997 | if (cfs_rq_throttled(cfs_rq)) | ||
| 1998 | break; | ||
| 1364 | 1999 | ||
| 1365 | update_cfs_load(cfs_rq, 0); | 2000 | update_cfs_load(cfs_rq, 0); |
| 1366 | update_cfs_shares(cfs_rq); | 2001 | update_cfs_shares(cfs_rq); |
| 1367 | } | 2002 | } |
| 1368 | 2003 | ||
| 2004 | if (!se) | ||
| 2005 | dec_nr_running(rq); | ||
| 1369 | hrtick_update(rq); | 2006 | hrtick_update(rq); |
| 1370 | } | 2007 | } |
| 1371 | 2008 | ||
| @@ -1434,7 +2071,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
| 1434 | 2071 | ||
| 1435 | return wl; | 2072 | return wl; |
| 1436 | } | 2073 | } |
| 1437 | |||
| 1438 | #else | 2074 | #else |
| 1439 | 2075 | ||
| 1440 | static inline unsigned long effective_load(struct task_group *tg, int cpu, | 2076 | static inline unsigned long effective_load(struct task_group *tg, int cpu, |
| @@ -1547,7 +2183,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
| 1547 | 2183 | ||
| 1548 | /* Skip over this group if it has no CPUs allowed */ | 2184 | /* Skip over this group if it has no CPUs allowed */ |
| 1549 | if (!cpumask_intersects(sched_group_cpus(group), | 2185 | if (!cpumask_intersects(sched_group_cpus(group), |
| 1550 | &p->cpus_allowed)) | 2186 | tsk_cpus_allowed(p))) |
| 1551 | continue; | 2187 | continue; |
| 1552 | 2188 | ||
| 1553 | local_group = cpumask_test_cpu(this_cpu, | 2189 | local_group = cpumask_test_cpu(this_cpu, |
| @@ -1593,7 +2229,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
| 1593 | int i; | 2229 | int i; |
| 1594 | 2230 | ||
| 1595 | /* Traverse only the allowed CPUs */ | 2231 | /* Traverse only the allowed CPUs */ |
| 1596 | for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { | 2232 | for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { |
| 1597 | load = weighted_cpuload(i); | 2233 | load = weighted_cpuload(i); |
| 1598 | 2234 | ||
| 1599 | if (load < min_load || (load == min_load && i == this_cpu)) { | 2235 | if (load < min_load || (load == min_load && i == this_cpu)) { |
| @@ -1637,7 +2273,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
| 1637 | if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) | 2273 | if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) |
| 1638 | break; | 2274 | break; |
| 1639 | 2275 | ||
| 1640 | for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { | 2276 | for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) { |
| 1641 | if (idle_cpu(i)) { | 2277 | if (idle_cpu(i)) { |
| 1642 | target = i; | 2278 | target = i; |
| 1643 | break; | 2279 | break; |
| @@ -1680,7 +2316,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
| 1680 | int sync = wake_flags & WF_SYNC; | 2316 | int sync = wake_flags & WF_SYNC; |
| 1681 | 2317 | ||
| 1682 | if (sd_flag & SD_BALANCE_WAKE) { | 2318 | if (sd_flag & SD_BALANCE_WAKE) { |
| 1683 | if (cpumask_test_cpu(cpu, &p->cpus_allowed)) | 2319 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) |
| 1684 | want_affine = 1; | 2320 | want_affine = 1; |
| 1685 | new_cpu = prev_cpu; | 2321 | new_cpu = prev_cpu; |
| 1686 | } | 2322 | } |
| @@ -1875,6 +2511,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
| 1875 | if (unlikely(se == pse)) | 2511 | if (unlikely(se == pse)) |
| 1876 | return; | 2512 | return; |
| 1877 | 2513 | ||
| 2514 | /* | ||
| 2515 | * This is possible from callers such as pull_task(), in which we | ||
| 2516 | * unconditionally check_preempt_curr() after an enqueue (which may have | ||
| 2517 | * led to a throttle). This both saves work and prevents false | ||
| 2518 | * next-buddy nomination below. | ||
| 2519 | */ | ||
| 2520 | if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) | ||
| 2521 | return; | ||
| 2522 | |||
| 1878 | if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { | 2523 | if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { |
| 1879 | set_next_buddy(pse); | 2524 | set_next_buddy(pse); |
| 1880 | next_buddy_marked = 1; | 2525 | next_buddy_marked = 1; |
| @@ -1883,6 +2528,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
| 1883 | /* | 2528 | /* |
| 1884 | * We can come here with TIF_NEED_RESCHED already set from new task | 2529 | * We can come here with TIF_NEED_RESCHED already set from new task |
| 1885 | * wake up path. | 2530 | * wake up path. |
| 2531 | * | ||
| 2532 | * Note: this also catches the edge-case of curr being in a throttled | ||
| 2533 | * group (e.g. via set_curr_task), since update_curr() (in the | ||
| 2534 | * enqueue of curr) will have resulted in resched being set. This | ||
| 2535 | * prevents us from potentially nominating it as a false LAST_BUDDY | ||
| 2536 | * below. | ||
| 1886 | */ | 2537 | */ |
| 1887 | if (test_tsk_need_resched(curr)) | 2538 | if (test_tsk_need_resched(curr)) |
| 1888 | return; | 2539 | return; |
| @@ -1899,10 +2550,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
| 1899 | if (unlikely(p->policy != SCHED_NORMAL)) | 2550 | if (unlikely(p->policy != SCHED_NORMAL)) |
| 1900 | return; | 2551 | return; |
| 1901 | 2552 | ||
| 1902 | |||
| 1903 | if (!sched_feat(WAKEUP_PREEMPT)) | ||
| 1904 | return; | ||
| 1905 | |||
| 1906 | find_matching_se(&se, &pse); | 2553 | find_matching_se(&se, &pse); |
| 1907 | update_curr(cfs_rq_of(se)); | 2554 | update_curr(cfs_rq_of(se)); |
| 1908 | BUG_ON(!pse); | 2555 | BUG_ON(!pse); |
| @@ -2005,7 +2652,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
| 2005 | { | 2652 | { |
| 2006 | struct sched_entity *se = &p->se; | 2653 | struct sched_entity *se = &p->se; |
| 2007 | 2654 | ||
| 2008 | if (!se->on_rq) | 2655 | /* throttled hierarchies are not runnable */ |
| 2656 | if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) | ||
| 2009 | return false; | 2657 | return false; |
| 2010 | 2658 | ||
| 2011 | /* Tell the scheduler that we'd really like pse to run next. */ | 2659 | /* Tell the scheduler that we'd really like pse to run next. */ |
| @@ -2049,7 +2697,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
| 2049 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | 2697 | * 2) cannot be migrated to this CPU due to cpus_allowed, or |
| 2050 | * 3) are cache-hot on their current CPU. | 2698 | * 3) are cache-hot on their current CPU. |
| 2051 | */ | 2699 | */ |
| 2052 | if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { | 2700 | if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) { |
| 2053 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); | 2701 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); |
| 2054 | return 0; | 2702 | return 0; |
| 2055 | } | 2703 | } |
| @@ -2102,6 +2750,9 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 2102 | 2750 | ||
| 2103 | for_each_leaf_cfs_rq(busiest, cfs_rq) { | 2751 | for_each_leaf_cfs_rq(busiest, cfs_rq) { |
| 2104 | list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { | 2752 | list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { |
| 2753 | if (throttled_lb_pair(task_group(p), | ||
| 2754 | busiest->cpu, this_cpu)) | ||
| 2755 | break; | ||
| 2105 | 2756 | ||
| 2106 | if (!can_migrate_task(p, busiest, this_cpu, | 2757 | if (!can_migrate_task(p, busiest, this_cpu, |
| 2107 | sd, idle, &pinned)) | 2758 | sd, idle, &pinned)) |
| @@ -2217,8 +2868,13 @@ static void update_shares(int cpu) | |||
| 2217 | * Iterates the task_group tree in a bottom up fashion, see | 2868 | * Iterates the task_group tree in a bottom up fashion, see |
| 2218 | * list_add_leaf_cfs_rq() for details. | 2869 | * list_add_leaf_cfs_rq() for details. |
| 2219 | */ | 2870 | */ |
| 2220 | for_each_leaf_cfs_rq(rq, cfs_rq) | 2871 | for_each_leaf_cfs_rq(rq, cfs_rq) { |
| 2872 | /* throttled entities do not contribute to load */ | ||
| 2873 | if (throttled_hierarchy(cfs_rq)) | ||
| 2874 | continue; | ||
| 2875 | |||
| 2221 | update_shares_cpu(cfs_rq->tg, cpu); | 2876 | update_shares_cpu(cfs_rq->tg, cpu); |
| 2877 | } | ||
| 2222 | rcu_read_unlock(); | 2878 | rcu_read_unlock(); |
| 2223 | } | 2879 | } |
| 2224 | 2880 | ||
| @@ -2268,9 +2924,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 2268 | u64 rem_load, moved_load; | 2924 | u64 rem_load, moved_load; |
| 2269 | 2925 | ||
| 2270 | /* | 2926 | /* |
| 2271 | * empty group | 2927 | * empty group or part of a throttled hierarchy |
| 2272 | */ | 2928 | */ |
| 2273 | if (!busiest_cfs_rq->task_weight) | 2929 | if (!busiest_cfs_rq->task_weight || |
| 2930 | throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu)) | ||
| 2274 | continue; | 2931 | continue; |
| 2275 | 2932 | ||
| 2276 | rem_load = (u64)rem_load_move * busiest_weight; | 2933 | rem_load = (u64)rem_load_move * busiest_weight; |
| @@ -3430,7 +4087,7 @@ redo: | |||
| 3430 | * moved to this_cpu | 4087 | * moved to this_cpu |
| 3431 | */ | 4088 | */ |
| 3432 | if (!cpumask_test_cpu(this_cpu, | 4089 | if (!cpumask_test_cpu(this_cpu, |
| 3433 | &busiest->curr->cpus_allowed)) { | 4090 | tsk_cpus_allowed(busiest->curr))) { |
| 3434 | raw_spin_unlock_irqrestore(&busiest->lock, | 4091 | raw_spin_unlock_irqrestore(&busiest->lock, |
| 3435 | flags); | 4092 | flags); |
| 3436 | all_pinned = 1; | 4093 | all_pinned = 1; |
| @@ -3612,22 +4269,6 @@ out_unlock: | |||
| 3612 | } | 4269 | } |
| 3613 | 4270 | ||
| 3614 | #ifdef CONFIG_NO_HZ | 4271 | #ifdef CONFIG_NO_HZ |
| 3615 | |||
| 3616 | static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb); | ||
| 3617 | |||
| 3618 | static void trigger_sched_softirq(void *data) | ||
| 3619 | { | ||
| 3620 | raise_softirq_irqoff(SCHED_SOFTIRQ); | ||
| 3621 | } | ||
| 3622 | |||
| 3623 | static inline void init_sched_softirq_csd(struct call_single_data *csd) | ||
| 3624 | { | ||
| 3625 | csd->func = trigger_sched_softirq; | ||
| 3626 | csd->info = NULL; | ||
| 3627 | csd->flags = 0; | ||
| 3628 | csd->priv = 0; | ||
| 3629 | } | ||
| 3630 | |||
| 3631 | /* | 4272 | /* |
| 3632 | * idle load balancing details | 4273 | * idle load balancing details |
| 3633 | * - One of the idle CPUs nominates itself as idle load_balancer, while | 4274 | * - One of the idle CPUs nominates itself as idle load_balancer, while |
| @@ -3667,7 +4308,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | |||
| 3667 | struct sched_domain *sd; | 4308 | struct sched_domain *sd; |
| 3668 | 4309 | ||
| 3669 | for_each_domain(cpu, sd) | 4310 | for_each_domain(cpu, sd) |
| 3670 | if (sd && (sd->flags & flag)) | 4311 | if (sd->flags & flag) |
| 3671 | break; | 4312 | break; |
| 3672 | 4313 | ||
| 3673 | return sd; | 4314 | return sd; |
| @@ -3793,11 +4434,16 @@ static void nohz_balancer_kick(int cpu) | |||
| 3793 | } | 4434 | } |
| 3794 | 4435 | ||
| 3795 | if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { | 4436 | if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { |
| 3796 | struct call_single_data *cp; | ||
| 3797 | |||
| 3798 | cpu_rq(ilb_cpu)->nohz_balance_kick = 1; | 4437 | cpu_rq(ilb_cpu)->nohz_balance_kick = 1; |
| 3799 | cp = &per_cpu(remote_sched_softirq_cb, cpu); | 4438 | |
| 3800 | __smp_call_function_single(ilb_cpu, cp, 0); | 4439 | smp_mb(); |
| 4440 | /* | ||
| 4441 | * Use smp_send_reschedule() instead of resched_cpu(). | ||
| 4442 | * This way we generate a sched IPI on the target cpu which | ||
| 4443 | * is idle. And the softirq performing nohz idle load balance | ||
| 4444 | * will be run before returning from the IPI. | ||
| 4445 | */ | ||
| 4446 | smp_send_reschedule(ilb_cpu); | ||
| 3801 | } | 4447 | } |
| 3802 | return; | 4448 | return; |
| 3803 | } | 4449 | } |
| @@ -4030,7 +4676,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) | |||
| 4030 | if (time_before(now, nohz.next_balance)) | 4676 | if (time_before(now, nohz.next_balance)) |
| 4031 | return 0; | 4677 | return 0; |
| 4032 | 4678 | ||
| 4033 | if (rq->idle_at_tick) | 4679 | if (idle_cpu(cpu)) |
| 4034 | return 0; | 4680 | return 0; |
| 4035 | 4681 | ||
| 4036 | first_pick_cpu = atomic_read(&nohz.first_pick_cpu); | 4682 | first_pick_cpu = atomic_read(&nohz.first_pick_cpu); |
| @@ -4066,7 +4712,7 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
| 4066 | { | 4712 | { |
| 4067 | int this_cpu = smp_processor_id(); | 4713 | int this_cpu = smp_processor_id(); |
| 4068 | struct rq *this_rq = cpu_rq(this_cpu); | 4714 | struct rq *this_rq = cpu_rq(this_cpu); |
| 4069 | enum cpu_idle_type idle = this_rq->idle_at_tick ? | 4715 | enum cpu_idle_type idle = this_rq->idle_balance ? |
| 4070 | CPU_IDLE : CPU_NOT_IDLE; | 4716 | CPU_IDLE : CPU_NOT_IDLE; |
| 4071 | 4717 | ||
| 4072 | rebalance_domains(this_cpu, idle); | 4718 | rebalance_domains(this_cpu, idle); |
| @@ -4251,8 +4897,13 @@ static void set_curr_task_fair(struct rq *rq) | |||
| 4251 | { | 4897 | { |
| 4252 | struct sched_entity *se = &rq->curr->se; | 4898 | struct sched_entity *se = &rq->curr->se; |
| 4253 | 4899 | ||
| 4254 | for_each_sched_entity(se) | 4900 | for_each_sched_entity(se) { |
| 4255 | set_next_entity(cfs_rq_of(se), se); | 4901 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
| 4902 | |||
| 4903 | set_next_entity(cfs_rq, se); | ||
| 4904 | /* ensure bandwidth has been allocated on our new cfs_rq */ | ||
| 4905 | account_cfs_rq_runtime(cfs_rq, 0); | ||
| 4906 | } | ||
| 4256 | } | 4907 | } |
| 4257 | 4908 | ||
| 4258 | #ifdef CONFIG_FAIR_GROUP_SCHED | 4909 | #ifdef CONFIG_FAIR_GROUP_SCHED |
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 2e74677cb040..efa0a7b75dde 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h | |||
| @@ -12,11 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) | |||
| 12 | SCHED_FEAT(START_DEBIT, 1) | 12 | SCHED_FEAT(START_DEBIT, 1) |
| 13 | 13 | ||
| 14 | /* | 14 | /* |
| 15 | * Should wakeups try to preempt running tasks. | ||
| 16 | */ | ||
| 17 | SCHED_FEAT(WAKEUP_PREEMPT, 1) | ||
| 18 | |||
| 19 | /* | ||
| 20 | * Based on load and program behaviour, see if it makes sense to place | 15 | * Based on load and program behaviour, see if it makes sense to place |
| 21 | * a newly woken task on the same cpu as the task that woke it -- | 16 | * a newly woken task on the same cpu as the task that woke it -- |
| 22 | * improve cache locality. Typically used with SYNC wakeups as | 17 | * improve cache locality. Typically used with SYNC wakeups as |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index af1177858be3..056cbd2e2a27 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
| @@ -124,21 +124,33 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
| 124 | update_rt_migration(rt_rq); | 124 | update_rt_migration(rt_rq); |
| 125 | } | 125 | } |
| 126 | 126 | ||
| 127 | static inline int has_pushable_tasks(struct rq *rq) | ||
| 128 | { | ||
| 129 | return !plist_head_empty(&rq->rt.pushable_tasks); | ||
| 130 | } | ||
| 131 | |||
| 127 | static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) | 132 | static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) |
| 128 | { | 133 | { |
| 129 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); | 134 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); |
| 130 | plist_node_init(&p->pushable_tasks, p->prio); | 135 | plist_node_init(&p->pushable_tasks, p->prio); |
| 131 | plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); | 136 | plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); |
| 137 | |||
| 138 | /* Update the highest prio pushable task */ | ||
| 139 | if (p->prio < rq->rt.highest_prio.next) | ||
| 140 | rq->rt.highest_prio.next = p->prio; | ||
| 132 | } | 141 | } |
| 133 | 142 | ||
| 134 | static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) | 143 | static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) |
| 135 | { | 144 | { |
| 136 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); | 145 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); |
| 137 | } | ||
| 138 | 146 | ||
| 139 | static inline int has_pushable_tasks(struct rq *rq) | 147 | /* Update the new highest prio pushable task */ |
| 140 | { | 148 | if (has_pushable_tasks(rq)) { |
| 141 | return !plist_head_empty(&rq->rt.pushable_tasks); | 149 | p = plist_first_entry(&rq->rt.pushable_tasks, |
| 150 | struct task_struct, pushable_tasks); | ||
| 151 | rq->rt.highest_prio.next = p->prio; | ||
| 152 | } else | ||
| 153 | rq->rt.highest_prio.next = MAX_RT_PRIO; | ||
| 142 | } | 154 | } |
| 143 | 155 | ||
| 144 | #else | 156 | #else |
| @@ -643,6 +655,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) | |||
| 643 | 655 | ||
| 644 | if (rt_rq->rt_time > runtime) { | 656 | if (rt_rq->rt_time > runtime) { |
| 645 | rt_rq->rt_throttled = 1; | 657 | rt_rq->rt_throttled = 1; |
| 658 | printk_once(KERN_WARNING "sched: RT throttling activated\n"); | ||
| 646 | if (rt_rq_throttled(rt_rq)) { | 659 | if (rt_rq_throttled(rt_rq)) { |
| 647 | sched_rt_rq_dequeue(rt_rq); | 660 | sched_rt_rq_dequeue(rt_rq); |
| 648 | return 1; | 661 | return 1; |
| @@ -698,47 +711,13 @@ static void update_curr_rt(struct rq *rq) | |||
| 698 | 711 | ||
| 699 | #if defined CONFIG_SMP | 712 | #if defined CONFIG_SMP |
| 700 | 713 | ||
| 701 | static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu); | ||
| 702 | |||
| 703 | static inline int next_prio(struct rq *rq) | ||
| 704 | { | ||
| 705 | struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu); | ||
| 706 | |||
| 707 | if (next && rt_prio(next->prio)) | ||
| 708 | return next->prio; | ||
| 709 | else | ||
| 710 | return MAX_RT_PRIO; | ||
| 711 | } | ||
| 712 | |||
| 713 | static void | 714 | static void |
| 714 | inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) | 715 | inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) |
| 715 | { | 716 | { |
| 716 | struct rq *rq = rq_of_rt_rq(rt_rq); | 717 | struct rq *rq = rq_of_rt_rq(rt_rq); |
| 717 | 718 | ||
| 718 | if (prio < prev_prio) { | 719 | if (rq->online && prio < prev_prio) |
| 719 | 720 | cpupri_set(&rq->rd->cpupri, rq->cpu, prio); | |
| 720 | /* | ||
| 721 | * If the new task is higher in priority than anything on the | ||
| 722 | * run-queue, we know that the previous high becomes our | ||
| 723 | * next-highest. | ||
| 724 | */ | ||
| 725 | rt_rq->highest_prio.next = prev_prio; | ||
| 726 | |||
| 727 | if (rq->online) | ||
| 728 | cpupri_set(&rq->rd->cpupri, rq->cpu, prio); | ||
| 729 | |||
| 730 | } else if (prio == rt_rq->highest_prio.curr) | ||
| 731 | /* | ||
| 732 | * If the next task is equal in priority to the highest on | ||
| 733 | * the run-queue, then we implicitly know that the next highest | ||
| 734 | * task cannot be any lower than current | ||
| 735 | */ | ||
| 736 | rt_rq->highest_prio.next = prio; | ||
| 737 | else if (prio < rt_rq->highest_prio.next) | ||
| 738 | /* | ||
| 739 | * Otherwise, we need to recompute next-highest | ||
| 740 | */ | ||
| 741 | rt_rq->highest_prio.next = next_prio(rq); | ||
| 742 | } | 721 | } |
| 743 | 722 | ||
| 744 | static void | 723 | static void |
| @@ -746,9 +725,6 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) | |||
| 746 | { | 725 | { |
| 747 | struct rq *rq = rq_of_rt_rq(rt_rq); | 726 | struct rq *rq = rq_of_rt_rq(rt_rq); |
| 748 | 727 | ||
| 749 | if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next)) | ||
| 750 | rt_rq->highest_prio.next = next_prio(rq); | ||
| 751 | |||
| 752 | if (rq->online && rt_rq->highest_prio.curr != prev_prio) | 728 | if (rq->online && rt_rq->highest_prio.curr != prev_prio) |
| 753 | cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); | 729 | cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); |
| 754 | } | 730 | } |
| @@ -961,6 +937,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
| 961 | 937 | ||
| 962 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) | 938 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) |
| 963 | enqueue_pushable_task(rq, p); | 939 | enqueue_pushable_task(rq, p); |
| 940 | |||
| 941 | inc_nr_running(rq); | ||
| 964 | } | 942 | } |
| 965 | 943 | ||
| 966 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) | 944 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) |
| @@ -971,6 +949,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
| 971 | dequeue_rt_entity(rt_se); | 949 | dequeue_rt_entity(rt_se); |
| 972 | 950 | ||
| 973 | dequeue_pushable_task(rq, p); | 951 | dequeue_pushable_task(rq, p); |
| 952 | |||
| 953 | dec_nr_running(rq); | ||
| 974 | } | 954 | } |
| 975 | 955 | ||
| 976 | /* | 956 | /* |
| @@ -1017,10 +997,12 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
| 1017 | struct rq *rq; | 997 | struct rq *rq; |
| 1018 | int cpu; | 998 | int cpu; |
| 1019 | 999 | ||
| 1020 | if (sd_flag != SD_BALANCE_WAKE) | ||
| 1021 | return smp_processor_id(); | ||
| 1022 | |||
| 1023 | cpu = task_cpu(p); | 1000 | cpu = task_cpu(p); |
| 1001 | |||
| 1002 | /* For anything but wake ups, just return the task_cpu */ | ||
| 1003 | if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) | ||
| 1004 | goto out; | ||
| 1005 | |||
| 1024 | rq = cpu_rq(cpu); | 1006 | rq = cpu_rq(cpu); |
| 1025 | 1007 | ||
| 1026 | rcu_read_lock(); | 1008 | rcu_read_lock(); |
| @@ -1059,6 +1041,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
| 1059 | } | 1041 | } |
| 1060 | rcu_read_unlock(); | 1042 | rcu_read_unlock(); |
| 1061 | 1043 | ||
| 1044 | out: | ||
| 1062 | return cpu; | 1045 | return cpu; |
| 1063 | } | 1046 | } |
| 1064 | 1047 | ||
| @@ -1178,7 +1161,6 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) | |||
| 1178 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | 1161 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p) |
| 1179 | { | 1162 | { |
| 1180 | update_curr_rt(rq); | 1163 | update_curr_rt(rq); |
| 1181 | p->se.exec_start = 0; | ||
| 1182 | 1164 | ||
| 1183 | /* | 1165 | /* |
| 1184 | * The previous task needs to be made eligible for pushing | 1166 | * The previous task needs to be made eligible for pushing |
| @@ -1198,7 +1180,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); | |||
| 1198 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | 1180 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) |
| 1199 | { | 1181 | { |
| 1200 | if (!task_running(rq, p) && | 1182 | if (!task_running(rq, p) && |
| 1201 | (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && | 1183 | (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && |
| 1202 | (p->rt.nr_cpus_allowed > 1)) | 1184 | (p->rt.nr_cpus_allowed > 1)) |
| 1203 | return 1; | 1185 | return 1; |
| 1204 | return 0; | 1186 | return 0; |
| @@ -1343,7 +1325,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
| 1343 | */ | 1325 | */ |
| 1344 | if (unlikely(task_rq(task) != rq || | 1326 | if (unlikely(task_rq(task) != rq || |
| 1345 | !cpumask_test_cpu(lowest_rq->cpu, | 1327 | !cpumask_test_cpu(lowest_rq->cpu, |
| 1346 | &task->cpus_allowed) || | 1328 | tsk_cpus_allowed(task)) || |
| 1347 | task_running(rq, task) || | 1329 | task_running(rq, task) || |
| 1348 | !task->on_rq)) { | 1330 | !task->on_rq)) { |
| 1349 | 1331 | ||
| @@ -1394,6 +1376,7 @@ static int push_rt_task(struct rq *rq) | |||
| 1394 | { | 1376 | { |
| 1395 | struct task_struct *next_task; | 1377 | struct task_struct *next_task; |
| 1396 | struct rq *lowest_rq; | 1378 | struct rq *lowest_rq; |
| 1379 | int ret = 0; | ||
| 1397 | 1380 | ||
| 1398 | if (!rq->rt.overloaded) | 1381 | if (!rq->rt.overloaded) |
| 1399 | return 0; | 1382 | return 0; |
| @@ -1426,7 +1409,7 @@ retry: | |||
| 1426 | if (!lowest_rq) { | 1409 | if (!lowest_rq) { |
| 1427 | struct task_struct *task; | 1410 | struct task_struct *task; |
| 1428 | /* | 1411 | /* |
| 1429 | * find lock_lowest_rq releases rq->lock | 1412 | * find_lock_lowest_rq releases rq->lock |
| 1430 | * so it is possible that next_task has migrated. | 1413 | * so it is possible that next_task has migrated. |
| 1431 | * | 1414 | * |
| 1432 | * We need to make sure that the task is still on the same | 1415 | * We need to make sure that the task is still on the same |
| @@ -1436,12 +1419,11 @@ retry: | |||
| 1436 | task = pick_next_pushable_task(rq); | 1419 | task = pick_next_pushable_task(rq); |
| 1437 | if (task_cpu(next_task) == rq->cpu && task == next_task) { | 1420 | if (task_cpu(next_task) == rq->cpu && task == next_task) { |
| 1438 | /* | 1421 | /* |
| 1439 | * If we get here, the task hasn't moved at all, but | 1422 | * The task hasn't migrated, and is still the next |
| 1440 | * it has failed to push. We will not try again, | 1423 | * eligible task, but we failed to find a run-queue |
| 1441 | * since the other cpus will pull from us when they | 1424 | * to push it to. Do not retry in this case, since |
| 1442 | * are ready. | 1425 | * other cpus will pull from us when ready. |
| 1443 | */ | 1426 | */ |
| 1444 | dequeue_pushable_task(rq, next_task); | ||
| 1445 | goto out; | 1427 | goto out; |
| 1446 | } | 1428 | } |
| 1447 | 1429 | ||
| @@ -1460,6 +1442,7 @@ retry: | |||
| 1460 | deactivate_task(rq, next_task, 0); | 1442 | deactivate_task(rq, next_task, 0); |
| 1461 | set_task_cpu(next_task, lowest_rq->cpu); | 1443 | set_task_cpu(next_task, lowest_rq->cpu); |
| 1462 | activate_task(lowest_rq, next_task, 0); | 1444 | activate_task(lowest_rq, next_task, 0); |
| 1445 | ret = 1; | ||
| 1463 | 1446 | ||
| 1464 | resched_task(lowest_rq->curr); | 1447 | resched_task(lowest_rq->curr); |
| 1465 | 1448 | ||
| @@ -1468,7 +1451,7 @@ retry: | |||
| 1468 | out: | 1451 | out: |
| 1469 | put_task_struct(next_task); | 1452 | put_task_struct(next_task); |
| 1470 | 1453 | ||
| 1471 | return 1; | 1454 | return ret; |
| 1472 | } | 1455 | } |
| 1473 | 1456 | ||
| 1474 | static void push_rt_tasks(struct rq *rq) | 1457 | static void push_rt_tasks(struct rq *rq) |
| @@ -1626,9 +1609,6 @@ static void set_cpus_allowed_rt(struct task_struct *p, | |||
| 1626 | 1609 | ||
| 1627 | update_rt_migration(&rq->rt); | 1610 | update_rt_migration(&rq->rt); |
| 1628 | } | 1611 | } |
| 1629 | |||
| 1630 | cpumask_copy(&p->cpus_allowed, new_mask); | ||
| 1631 | p->rt.nr_cpus_allowed = weight; | ||
| 1632 | } | 1612 | } |
| 1633 | 1613 | ||
| 1634 | /* Assumes rq->lock is held */ | 1614 | /* Assumes rq->lock is held */ |
| @@ -1863,4 +1843,3 @@ static void print_rt_stats(struct seq_file *m, int cpu) | |||
| 1863 | rcu_read_unlock(); | 1843 | rcu_read_unlock(); |
| 1864 | } | 1844 | } |
| 1865 | #endif /* CONFIG_SCHED_DEBUG */ | 1845 | #endif /* CONFIG_SCHED_DEBUG */ |
| 1866 | |||
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 6f437632afab..8b44e7fa7fb3 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c | |||
| @@ -34,11 +34,13 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) | |||
| 34 | static void | 34 | static void |
| 35 | enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) | 35 | enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) |
| 36 | { | 36 | { |
| 37 | inc_nr_running(rq); | ||
| 37 | } | 38 | } |
| 38 | 39 | ||
| 39 | static void | 40 | static void |
| 40 | dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) | 41 | dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) |
| 41 | { | 42 | { |
| 43 | dec_nr_running(rq); | ||
| 42 | } | 44 | } |
| 43 | 45 | ||
| 44 | static void yield_task_stop(struct rq *rq) | 46 | static void yield_task_stop(struct rq *rq) |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 11d65b531e50..2d2ecdcc8cdb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -379,6 +379,16 @@ static struct ctl_table kern_table[] = { | |||
| 379 | .extra2 = &one, | 379 | .extra2 = &one, |
| 380 | }, | 380 | }, |
| 381 | #endif | 381 | #endif |
| 382 | #ifdef CONFIG_CFS_BANDWIDTH | ||
| 383 | { | ||
| 384 | .procname = "sched_cfs_bandwidth_slice_us", | ||
| 385 | .data = &sysctl_sched_cfs_bandwidth_slice, | ||
| 386 | .maxlen = sizeof(unsigned int), | ||
| 387 | .mode = 0644, | ||
| 388 | .proc_handler = proc_dointvec_minmax, | ||
| 389 | .extra1 = &one, | ||
| 390 | }, | ||
| 391 | #endif | ||
| 382 | #ifdef CONFIG_PROVE_LOCKING | 392 | #ifdef CONFIG_PROVE_LOCKING |
| 383 | { | 393 | { |
| 384 | .procname = "prove_locking", | 394 | .procname = "prove_locking", |
diff --git a/lib/Kconfig b/lib/Kconfig index 6c695ff9caba..32f3e5ae2be5 100644 --- a/lib/Kconfig +++ b/lib/Kconfig | |||
| @@ -276,7 +276,4 @@ config CORDIC | |||
| 276 | so its calculations are in fixed point. Modules can select this | 276 | so its calculations are in fixed point. Modules can select this |
| 277 | when they require this function. Module will be called cordic. | 277 | when they require this function. Module will be called cordic. |
| 278 | 278 | ||
| 279 | config LLIST | ||
| 280 | bool | ||
| 281 | |||
| 282 | endmenu | 279 | endmenu |
diff --git a/lib/Makefile b/lib/Makefile index 3f5bc6d903e0..a4da283f5dc0 100644 --- a/lib/Makefile +++ b/lib/Makefile | |||
| @@ -22,7 +22,7 @@ lib-y += kobject.o kref.o klist.o | |||
| 22 | obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ | 22 | obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ |
| 23 | bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ | 23 | bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ |
| 24 | string_helpers.o gcd.o lcm.o list_sort.o uuid.o flex_array.o \ | 24 | string_helpers.o gcd.o lcm.o list_sort.o uuid.o flex_array.o \ |
| 25 | bsearch.o find_last_bit.o find_next_bit.o | 25 | bsearch.o find_last_bit.o find_next_bit.o llist.o |
| 26 | obj-y += kstrtox.o | 26 | obj-y += kstrtox.o |
| 27 | obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o | 27 | obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o |
| 28 | 28 | ||
| @@ -115,8 +115,6 @@ obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o | |||
| 115 | 115 | ||
| 116 | obj-$(CONFIG_CORDIC) += cordic.o | 116 | obj-$(CONFIG_CORDIC) += cordic.o |
| 117 | 117 | ||
| 118 | obj-$(CONFIG_LLIST) += llist.o | ||
| 119 | |||
| 120 | hostprogs-y := gen_crc32table | 118 | hostprogs-y := gen_crc32table |
| 121 | clean-files := crc32table.h | 119 | clean-files := crc32table.h |
| 122 | 120 | ||
diff --git a/lib/llist.c b/lib/llist.c index da445724fa1f..700cff77a387 100644 --- a/lib/llist.c +++ b/lib/llist.c | |||
| @@ -3,8 +3,8 @@ | |||
| 3 | * | 3 | * |
| 4 | * The basic atomic operation of this list is cmpxchg on long. On | 4 | * The basic atomic operation of this list is cmpxchg on long. On |
| 5 | * architectures that don't have NMI-safe cmpxchg implementation, the | 5 | * architectures that don't have NMI-safe cmpxchg implementation, the |
| 6 | * list can NOT be used in NMI handler. So code uses the list in NMI | 6 | * list can NOT be used in NMI handlers. So code that uses the list in |
| 7 | * handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. | 7 | * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. |
| 8 | * | 8 | * |
| 9 | * Copyright 2010,2011 Intel Corp. | 9 | * Copyright 2010,2011 Intel Corp. |
| 10 | * Author: Huang Ying <ying.huang@intel.com> | 10 | * Author: Huang Ying <ying.huang@intel.com> |
| @@ -30,48 +30,28 @@ | |||
| 30 | #include <asm/system.h> | 30 | #include <asm/system.h> |
| 31 | 31 | ||
| 32 | /** | 32 | /** |
| 33 | * llist_add - add a new entry | ||
| 34 | * @new: new entry to be added | ||
| 35 | * @head: the head for your lock-less list | ||
| 36 | */ | ||
| 37 | void llist_add(struct llist_node *new, struct llist_head *head) | ||
| 38 | { | ||
| 39 | struct llist_node *entry, *old_entry; | ||
| 40 | |||
| 41 | #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG | ||
| 42 | BUG_ON(in_nmi()); | ||
| 43 | #endif | ||
| 44 | |||
| 45 | entry = head->first; | ||
| 46 | do { | ||
| 47 | old_entry = entry; | ||
| 48 | new->next = entry; | ||
| 49 | cpu_relax(); | ||
| 50 | } while ((entry = cmpxchg(&head->first, old_entry, new)) != old_entry); | ||
| 51 | } | ||
| 52 | EXPORT_SYMBOL_GPL(llist_add); | ||
| 53 | |||
| 54 | /** | ||
| 55 | * llist_add_batch - add several linked entries in batch | 33 | * llist_add_batch - add several linked entries in batch |
| 56 | * @new_first: first entry in batch to be added | 34 | * @new_first: first entry in batch to be added |
| 57 | * @new_last: last entry in batch to be added | 35 | * @new_last: last entry in batch to be added |
| 58 | * @head: the head for your lock-less list | 36 | * @head: the head for your lock-less list |
| 37 | * | ||
| 38 | * Return whether list is empty before adding. | ||
| 59 | */ | 39 | */ |
| 60 | void llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, | 40 | bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, |
| 61 | struct llist_head *head) | 41 | struct llist_head *head) |
| 62 | { | 42 | { |
| 63 | struct llist_node *entry, *old_entry; | 43 | struct llist_node *entry, *old_entry; |
| 64 | 44 | ||
| 65 | #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG | ||
| 66 | BUG_ON(in_nmi()); | ||
| 67 | #endif | ||
| 68 | |||
| 69 | entry = head->first; | 45 | entry = head->first; |
| 70 | do { | 46 | for (;;) { |
| 71 | old_entry = entry; | 47 | old_entry = entry; |
| 72 | new_last->next = entry; | 48 | new_last->next = entry; |
| 73 | cpu_relax(); | 49 | entry = cmpxchg(&head->first, old_entry, new_first); |
| 74 | } while ((entry = cmpxchg(&head->first, old_entry, new_first)) != old_entry); | 50 | if (entry == old_entry) |
| 51 | break; | ||
| 52 | } | ||
| 53 | |||
| 54 | return old_entry == NULL; | ||
| 75 | } | 55 | } |
| 76 | EXPORT_SYMBOL_GPL(llist_add_batch); | 56 | EXPORT_SYMBOL_GPL(llist_add_batch); |
| 77 | 57 | ||
| @@ -93,37 +73,17 @@ struct llist_node *llist_del_first(struct llist_head *head) | |||
| 93 | { | 73 | { |
| 94 | struct llist_node *entry, *old_entry, *next; | 74 | struct llist_node *entry, *old_entry, *next; |
| 95 | 75 | ||
| 96 | #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG | ||
| 97 | BUG_ON(in_nmi()); | ||
| 98 | #endif | ||
| 99 | |||
| 100 | entry = head->first; | 76 | entry = head->first; |
| 101 | do { | 77 | for (;;) { |
| 102 | if (entry == NULL) | 78 | if (entry == NULL) |
| 103 | return NULL; | 79 | return NULL; |
| 104 | old_entry = entry; | 80 | old_entry = entry; |
| 105 | next = entry->next; | 81 | next = entry->next; |
| 106 | cpu_relax(); | 82 | entry = cmpxchg(&head->first, old_entry, next); |
| 107 | } while ((entry = cmpxchg(&head->first, old_entry, next)) != old_entry); | 83 | if (entry == old_entry) |
| 84 | break; | ||
| 85 | } | ||
| 108 | 86 | ||
| 109 | return entry; | 87 | return entry; |
| 110 | } | 88 | } |
| 111 | EXPORT_SYMBOL_GPL(llist_del_first); | 89 | EXPORT_SYMBOL_GPL(llist_del_first); |
| 112 | |||
| 113 | /** | ||
| 114 | * llist_del_all - delete all entries from lock-less list | ||
| 115 | * @head: the head of lock-less list to delete all entries | ||
| 116 | * | ||
| 117 | * If list is empty, return NULL, otherwise, delete all entries and | ||
| 118 | * return the pointer to the first entry. The order of entries | ||
| 119 | * deleted is from the newest to the oldest added one. | ||
| 120 | */ | ||
| 121 | struct llist_node *llist_del_all(struct llist_head *head) | ||
| 122 | { | ||
| 123 | #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG | ||
| 124 | BUG_ON(in_nmi()); | ||
| 125 | #endif | ||
| 126 | |||
| 127 | return xchg(&head->first, NULL); | ||
| 128 | } | ||
| 129 | EXPORT_SYMBOL_GPL(llist_del_all); | ||
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 4689cb073da4..503f087382a4 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c | |||
| @@ -22,7 +22,7 @@ notrace unsigned int debug_smp_processor_id(void) | |||
| 22 | * Kernel threads bound to a single CPU can safely use | 22 | * Kernel threads bound to a single CPU can safely use |
| 23 | * smp_processor_id(): | 23 | * smp_processor_id(): |
| 24 | */ | 24 | */ |
| 25 | if (cpumask_equal(&current->cpus_allowed, cpumask_of(this_cpu))) | 25 | if (cpumask_equal(tsk_cpus_allowed(current), cpumask_of(this_cpu))) |
| 26 | goto out; | 26 | goto out; |
| 27 | 27 | ||
| 28 | /* | 28 | /* |
